## ISB Assignment
### - By Pranshu Bansal

In [1]:
# Importing Libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import os

In [2]:
def _cell_text(row, css_class):
    """Return the stripped text of the ``<td>`` in *row* matched by *css_class*."""
    return row.find('td', class_=css_class).text.strip()


def scrap_data(table, category, day_path, date):
    """Scrape one rankings table and save it as ``<day_path>/<category>.csv``.

    The first-ranked player is looked up in a separate "banner" row
    (``rankings-block__banner``); every other player is read from the
    ``table-body`` rows. Positions are assigned sequentially starting at 1 in
    the order the rows are found.

    Args:
        table: Parsed rankings ``<table>`` element. It must offer the
            BeautifulSoup-style ``find`` / ``find_all`` methods used below.
        category: Base name of the CSV file (without the ``.csv`` suffix).
        day_path: Existing directory in which the CSV file is written.
        date: Value stored in the ``DATE`` column of every row.

    Raises:
        ValueError: If *table* is ``None``, i.e. the caller could not locate
            a rankings table on the page.
    """
    # Fail with a clear message instead of an AttributeError on None.
    if table is None:
        raise ValueError(
            'No rankings table found for category {!r} on {}'.format(category, date)
        )

    # Each entry is (name, team, rating, best_rating) for one player.
    players = []

    # The top player is rendered as a banner with its own CSS classes, so it
    # is read separately. A table without a banner simply contributes no
    # banner entry rather than crashing.
    banner = table.find('tr', class_="rankings-block__banner")
    if banner is not None:
        players.append((
            _cell_text(banner, "rankings-block__top-player-container"),
            # The team cell is addressed by position: third <td> of the banner.
            banner.find_all('td')[2].text.strip(),
            _cell_text(banner, "u-text-left"),
            _cell_text(banner, "u-text-right u-hide-phablet u-overflow-hidden"),
        ))

    # All remaining players use the regular table-body row layout.
    for row in table.find_all('tr', class_="table-body"):
        players.append((
            _cell_text(row, "table-body__cell rankings-table__name name"),
            _cell_text(row, "table-body__cell nationality-logo rankings-table__team"),
            _cell_text(row, "table-body__cell rating"),
            _cell_text(row, "table-body__cell u-text-right u-hide-phablet"),
        ))

    # Attach the running position and the date to every player record.
    records = [
        (pos, name, team, rating, best_rating, date)
        for pos, (name, team, rating, best_rating) in enumerate(players, start=1)
    ]

    frame = pd.DataFrame(
        records,
        columns=['POS', 'NAME', 'TEAM', 'RATING', 'CAREER BEST RATING', 'DATE'],
    )

    # The CSV is named after the category and written without the index column.
    frame.to_csv(os.path.join(day_path, category + '.csv'), index=False)

In [3]:
def build_data(start_date=None, end_date=None, root_dir='ODI Data', timeout=30):
    """Download the daily ODI player rankings and store them as CSV files.

    For every day from *start_date* to *end_date* (both inclusive) the
    batting, bowling and all-rounder ranking pages are requested, the
    rankings table is located in each page and handed to ``scrap_data``.
    Files end up in ``<root_dir>/<YYYY>/<M>. <MonthName>/<DDMMYYYY>/``.

    Args:
        start_date: First ``datetime.date`` to fetch. Defaults to
            1971-02-01 (per the original author's note, the date filter on
            the website starts there).
        end_date: Last ``datetime.date`` to fetch. Defaults to 2022-01-04.
        root_dir: Top-level output folder; created if it does not exist.
        timeout: Seconds to wait for each HTTP request before giving up.

    Raises:
        requests.HTTPError: If a page request returns an error status.
        RuntimeError: If a page contains no rankings table.
    """
    if start_date is None:
        start_date = datetime.date(1971, 2, 1)
    if end_date is None:
        end_date = datetime.date(2022, 1, 4)

    # Month names are spelled out explicitly so folder names do not depend
    # on the process locale.
    month_names = (
        'January', 'February', 'March', 'April', 'May', 'June', 'July',
        'August', 'September', 'October', 'November', 'December',
    )

    base_url = 'https://www.icc-cricket.com/rankings/mens/player-rankings/odi/'

    # (CSV base name, URL path segment). 'balling' is kept as the CSV name
    # for the bowling rankings so existing output paths do not change.
    categories = (
        ('batting', 'batting'),
        ('balling', 'bowling'),
        ('all_rounder', 'all-rounder'),
    )

    one_day = datetime.timedelta(days=1)

    # A single session reuses the connection across the many requests.
    with requests.Session() as session:
        current = start_date
        while current <= end_date:
            iso_date = current.isoformat()

            # Year / "<M>. <MonthName>" / "DDMMYYYY" folder for this day.
            day_path = os.path.join(
                root_dir,
                str(current.year),
                '{}. {}'.format(current.month, month_names[current.month - 1]),
                current.strftime('%d%m%Y'),
            )
            # Creates every missing level (including root_dir) and tolerates
            # folders left over from an earlier, interrupted run.
            os.makedirs(day_path, exist_ok=True)

            for csv_name, url_segment in categories:
                url = base_url + url_segment + '?at=' + iso_date

                response = session.get(url, timeout=timeout)
                # Surface HTTP errors here instead of parsing an error page.
                response.raise_for_status()

                # Name the parser explicitly so the result does not depend on
                # which optional parser libraries happen to be installed.
                soup = BeautifulSoup(response.text, 'html.parser')
                table = soup.find('table', class_='table rankings-table')
                if table is None:
                    raise RuntimeError(
                        'No rankings table found at {}'.format(url)
                    )

                scrap_data(table, csv_name, day_path, iso_date)

            current += one_day
        
        

In [4]:
# Start the full scrape only when this code is executed directly (as a
# script or in a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    build_data()