# Scraping CricMetric for the Player Role and Country Information

This notebook scrapes CricMetric to gather country and role information for a list of cricket players. It uses the `requests` library to retrieve each player's stats page, the `BeautifulSoup` library to parse the HTML, and the `pandas` library to store and manipulate the extracted data. The list of player names is loaded from a pickle file, and the `country_and_role` function is applied to each player to obtain their country and role. The per-player results are concatenated into a single dataframe, sorted by country and role, and saved in CSV format to `./ipl-player-salary/player_country_role.txt`. That file is then used for further analysis and data wrangling in the notebook `1-DataWrangling.ipynb`.

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import pickle

In [3]:
# Teams treated as international sides. A player whose scraped team is not
# listed here is labelled 'Uncapped' by country_and_role.
international_list = [
    'Afghanistan',
    'Australia',
    'Bangladesh',
    'England',
    'India',
    'Ireland',
    'New Zealand',
    'Pakistan',
    'South Africa',
    'Sri Lanka',
    'West Indies',
    'Zimbabwe',
    'Nepal',
    'Netherlands',
    'Scotland',
]

In [None]:
def country_and_role(name):
    """Scrape a player's role and country from their CricMetric stats page.

    Args:
        name: Player name, sent as the ``player`` query parameter.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns ``Player``, ``Role``
        and ``Country``. ``Country`` is the text of the first link following
        the "Teams played for" label when that team is in
        ``international_list``; otherwise it is ``'Uncapped'``. ``Role`` is
        the text following the "Role" label, or ``''`` when no such text is
        found.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    # Let requests build the query string so that every character in the
    # name is URL-encoded (spaces still become '+', as before).
    response = requests.get(
        'http://www.cricmetric.com/playerstats.py',
        params={
            'player': name,
            'role': 'all',
            'format': 'all',
            'groupby': 'year',
        },
        # Without a timeout a single stalled connection blocks forever.
        timeout=30,
    )

    # The HTTP status is not checked: a page without the expected labels
    # simply produces the default values below.
    soup = BeautifulSoup(response.text, 'html.parser')

    # Start from the fallback so that every path leaves the country defined,
    # including a "Teams played for" label with no team link after it.
    player_country = 'Uncapped'
    country_label = soup.find('b', string='Teams played for')
    if country_label is not None:
        team_link = country_label.find_next_sibling('a')
        if team_link is not None:
            team = team_link.text.strip()
            if team in international_list:
                player_country = team

    role_text = ''
    role_label = soup.find('b', string='Role')
    if role_label is not None:
        following = role_label.next_sibling
        # The node after the label may be missing or may be a tag; only a
        # text node (a str subclass) carries the role.
        if isinstance(following, str):
            role_text = following.strip(': ')

    return pd.DataFrame(
        {'Player': [name], 'Role': [role_text], 'Country': [player_country]}
    )

In [None]:
# Deserialize the player-name list from its pickle file.
# NOTE: unpickling can execute arbitrary code, so only load a file that
# comes from a trusted source.
pkl_path = './ipl-player-salary/new_player_list.pkl'
with open(pkl_path, mode='rb') as f:
    new_player_list = pickle.load(f)

In [None]:
# Scrape every player in turn; each call returns a one-row dataframe.
player_country_role_df_list = [
    country_and_role(player) for player in new_player_list
]

In [None]:
# Stack the per-player frames, renumber the rows from zero, then order the
# rows by country and, within each country, by role.
player_country_role_df = (
    pd.concat(player_country_role_df_list)
    .reset_index(drop=True)
    .sort_values(['Country', 'Role'])
)

# Write the table as comma-separated text, without the index column, for use
# in the 1-DataWrangling notebook.
filepath = './ipl-player-salary/player_country_role.txt'
player_country_role_df.to_csv(filepath, index=False)