In [1]:
import mysql.connector
import pandas as pd
import matplotlib as plt
import json
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Set pandas display options to show more rows and columns
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)

In [2]:
# Connect to the MySQL database
connection = mysql.connector.connect(
    host="localhost",
    user="root",
    password="123456",
    database="cricket_data"
)

# Define the query to select all columns from the matches table
query = """SELECT matches.match_id, matches.gender, matches.start_date, matches.teams_type,
matches.match_type, matches.team_involved_one, matches.team_involved_two, info_section.outcome,
info_section.toss, info_section.city, info_section.venue
FROM matches JOIN
info_section ON
matches.match_id = info_section.match_id;"""

# Execute the query and fetch the results
cursor = connection.cursor()
cursor.execute(query)
matches_data = cursor.fetchall()

# Close the cursor and connection
cursor.close()
connection.close()

# Convert the fetched data to a pandas DataFrame
matches_df = pd.DataFrame(matches_data, columns=[desc[0] for desc in cursor.description])

TypeError: 'NoneType' object is not iterable

In [3]:
matches_df.head()

NameError: name 'matches_df' is not defined

In [4]:
def transform_dataframe(df):
    # 1. Change the start_date to datetime
    df['start_date'] = pd.to_datetime(df['start_date'])

    # 2. Split the toss field into toss_winner and toss_decision
    def extract_toss_details(toss):
        toss_data = json.loads(toss)  # Parse the JSON string to a dictionary
        return toss_data.get('winner'), toss_data.get('decision')
    
    df[['toss_winner', 'toss_decision']] = df['toss'].apply(lambda x: pd.Series(extract_toss_details(x)))

    # Drop the original toss column
    df.drop(columns=['toss'], inplace=True)

    # 3. Remove double quotes from the city and venue field
    df['city'] = df['city'].str.strip('"')
    df['venue'] = df['venue'].str.strip('"')

    return df



In [5]:
matches_df = transform_dataframe(matches_df)

In [6]:
matches_df.head()

Unnamed: 0,match_id,gender,start_date,teams_type,match_type,team_involved_one,team_involved_two,outcome,city,venue,toss_winner,toss_decision
0,1000851,male,2016-11-03,international,Test,Australia,South Africa,"{""by"": {""runs"": 177}, ""winner"": ""South Africa""}",Perth,Western Australia Cricket Association Ground,South Africa,bat
1,1000853,male,2016-11-12,international,Test,Australia,South Africa,"{""by"": {""innings"": 1, ""runs"": 80}, ""winner"": ""...",Hobart,Bellerive Oval,South Africa,field
2,1000855,male,2016-11-24,international,Test,Australia,South Africa,"{""by"": {""wickets"": 7}, ""winner"": ""Australia""}",,Adelaide Oval,South Africa,bat
3,1000881,male,2016-12-15,international,Test,Australia,Pakistan,"{""by"": {""runs"": 39}, ""winner"": ""Australia""}",Brisbane,"Brisbane Cricket Ground, Woolloongabba",Australia,bat
4,1000883,male,2016-12-26,international,Test,Australia,Pakistan,"{""by"": {""innings"": 1, ""runs"": 18}, ""winner"": ""...",,Melbourne Cricket Ground,Pakistan,bat


In [7]:
def process_outcome(outcome):
    # Initialize default values
    result = winner = runs = innings = wickets = method = None

    # Parse the JSON string to a dictionary
    outcome_data = json.loads(outcome)

    # Extract the common fields
    if 'result' in outcome_data:
        result = outcome_data['result']
    if 'winner' in outcome_data:
        winner = outcome_data['winner']
    if 'method' in outcome_data:
        method = outcome_data['method']
    if 'by' in outcome_data:
        by = outcome_data['by']
        if 'runs' in by:
            runs = by['runs']
        if 'innings' in by:
            innings = by['innings']
        if 'wickets' in by:
            wickets = by['wickets']

    return result, winner, runs, innings, wickets, method

# Apply the function to the outcome column and create new columns
matches_df[['result', 'winner', 'runs', 'innings', 'wickets', 'method']] = matches_df['outcome'].apply(
    lambda x: pd.Series(process_outcome(x))
)

# Drop the original outcome column
matches_df.drop(columns=['outcome'], inplace=True)

In [8]:
matches_df.head()

Unnamed: 0,match_id,gender,start_date,teams_type,match_type,team_involved_one,team_involved_two,city,venue,toss_winner,toss_decision,result,winner,runs,innings,wickets,method
0,1000851,male,2016-11-03,international,Test,Australia,South Africa,Perth,Western Australia Cricket Association Ground,South Africa,bat,,South Africa,177.0,,,
1,1000853,male,2016-11-12,international,Test,Australia,South Africa,Hobart,Bellerive Oval,South Africa,field,,South Africa,80.0,1.0,,
2,1000855,male,2016-11-24,international,Test,Australia,South Africa,,Adelaide Oval,South Africa,bat,,Australia,,,7.0,
3,1000881,male,2016-12-15,international,Test,Australia,Pakistan,Brisbane,"Brisbane Cricket Ground, Woolloongabba",Australia,bat,,Australia,39.0,,,
4,1000883,male,2016-12-26,international,Test,Australia,Pakistan,,Melbourne Cricket Ground,Pakistan,bat,,Australia,18.0,1.0,,


In [9]:
def matches_per_team(teams_type):
    # Filter matches based on the provided team type
    filtered_matches = matches_df[matches_df['teams_type'] == teams_type]
    
    # Create a list of all teams involved in the filtered matches
    all_teams = pd.concat([filtered_matches['team_involved_one'], filtered_matches['team_involved_two']]).unique()
    
    # Initialize a dictionary to store the count of matches per team
    matches_per_team = {team: 0 for team in all_teams}
    
    # Update the matches count for each team
    for team in all_teams:
        matches_per_team[team] = ((filtered_matches['team_involved_one'] == team) | (filtered_matches['team_involved_two'] == team)).sum()
    
    # Sort the dictionary items by value in descending order
    matches_per_team_sorted = dict(sorted(matches_per_team.items(), key=lambda item: item[1], reverse=True))
    
    return matches_per_team_sorted

In [10]:
club_match_per_team = matches_per_team("club")
print(club_match_per_team)

{'Hampshire': 362, 'Nottinghamshire': 360, 'Somerset': 356, 'Essex': 355, 'Surrey': 354, 'Kent': 353, 'Worcestershire': 352, 'Lancashire': 351, 'Durham': 349, 'Gloucestershire': 348, 'Yorkshire': 347, 'Middlesex': 346, 'Northamptonshire': 345, 'Sussex': 345, 'Leicestershire': 342, 'Glamorgan': 339, 'Derbyshire': 337, 'Mumbai Indians': 280, 'Warwickshire': 274, 'Perth Scorchers': 265, 'Sydney Sixers': 261, 'Adelaide Strikers': 258, 'Brisbane Heat': 258, 'Royal Challengers Bangalore': 258, 'Melbourne Stars': 254, 'Kolkata Knight Riders': 251, 'Melbourne Renegades': 243, 'Hobart Hurricanes': 242, 'Sydney Thunder': 241, 'Chennai Super Kings': 238, 'Rajasthan Royals': 221, 'Canterbury': 203, 'Northern Districts': 199, 'Kings XI Punjab': 190, 'Sunrisers Hyderabad': 182, 'Delhi Daredevils': 161, 'Guyana Amazon Warriors': 127, 'Jamaica Tallawahs': 117, 'Otago': 117, 'Wellington': 115, 'Central Districts': 114, 'Auckland': 113, 'Southern Vipers': 111, 'Delhi Capitals': 109, 'Peshawar Zalmi': 10

In [11]:
def matches_per_team(teams_type, matches_df):
    # Filter matches based on the provided team type
    filtered_matches = matches_df[matches_df['teams_type'] == teams_type]
    
    # Create a list of all teams involved in the filtered matches
    all_teams = pd.concat([filtered_matches['team_involved_one'], filtered_matches['team_involved_two']]).unique()
    
    # Initialize a dictionary to store the count of matches per team
    matches_per_team = {team: 0 for team in all_teams}
    
    # Update the matches count for each team
    for team in all_teams:
        matches_per_team[team] = ((filtered_matches['team_involved_one'] == team) | (filtered_matches['team_involved_two'] == team)).sum()
    
    # Sort the dictionary items by value in descending order
    matches_per_team_sorted = dict(sorted(matches_per_team.items(), key=lambda item: item[1], reverse=True))
    
    return matches_per_team_sorted

# Generate the list of club names dynamically
clubs = list(matches_per_team("club", matches_df).keys())
print(clubs)

['Hampshire', 'Nottinghamshire', 'Somerset', 'Essex', 'Surrey', 'Kent', 'Worcestershire', 'Lancashire', 'Durham', 'Gloucestershire', 'Yorkshire', 'Middlesex', 'Northamptonshire', 'Sussex', 'Leicestershire', 'Glamorgan', 'Derbyshire', 'Mumbai Indians', 'Warwickshire', 'Perth Scorchers', 'Sydney Sixers', 'Adelaide Strikers', 'Brisbane Heat', 'Royal Challengers Bangalore', 'Melbourne Stars', 'Kolkata Knight Riders', 'Melbourne Renegades', 'Hobart Hurricanes', 'Sydney Thunder', 'Chennai Super Kings', 'Rajasthan Royals', 'Canterbury', 'Northern Districts', 'Kings XI Punjab', 'Sunrisers Hyderabad', 'Delhi Daredevils', 'Guyana Amazon Warriors', 'Jamaica Tallawahs', 'Otago', 'Wellington', 'Central Districts', 'Auckland', 'Southern Vipers', 'Delhi Capitals', 'Peshawar Zalmi', 'Western Storm', 'Trinbago Knight Riders', 'Islamabad United', 'Titans', 'Warriors', 'Lahore Qalandars', 'Karachi Kings', 'Quetta Gladiators', 'St Kitts and Nevis Patriots', 'Comilla Victorians', 'Dolphins', 'Lions', 'Rang

In [12]:
from fuzzywuzzy import process

def find_potential_matches(clubs, threshold=85):
    potential_matches = {}
    for club in clubs:
        matches = process.extract(club, clubs, limit=None)
        potential_matches[club] = [match[0] for match in matches if match[1] >= threshold and match[0] != club]
    return potential_matches

potential_matches = find_potential_matches(clubs)
for club, matches in potential_matches.items():
    if matches:
        print(f"{club}: {matches}")

Hampshire: ['Nottinghamshire']
Nottinghamshire: ['Hampshire']
Surrey: ['Surrey Stars']
Lancashire: ['Lancashire Thunder']
Yorkshire: ['Yorkshire Diamonds']
Mumbai Indians: ['Mumbai']
Royal Challengers Bangalore: ['Royal Challengers Bengaluru']
Melbourne Stars: ['Jozi Stars']
Kolkata Knight Riders: ['Rangpur Riders']
Sydney Thunder: ['Thunder']
Chennai Super Kings: ['Punjab Kings', 'Jaffna Kings']
Rajasthan Royals: ['Rajasthan']
Kings XI Punjab: ['Punjab Kings', 'Punjab']
Sunrisers Hyderabad: ['Sunrisers']
Delhi Daredevils: ['Delhi']
Guyana Amazon Warriors: ['Warriors', 'Guyana', 'Pune Warriors', 'Kandy Warriors']
Jamaica Tallawahs: ['Jamaica']
Otago: ['Otago Volts']
Wellington: ['Wellington Firebirds']
Auckland: ['Auckland Aces']
Delhi Capitals: ['Delhi']
Trinbago Knight Riders: ['Rangpur Riders']
Titans: ['Gujarat Titans', 'Khulna Titans', 'Galle Titans']
Warriors: ['Guyana Amazon Warriors', 'North-West Warriors', 'Pune Warriors', 'Sharjah Warriors', 'Cumilla Warriors', 'North West Wa

In [13]:
# Connect to the MySQL database
connection = mysql.connector.connect(
    host="localhost",
    user="root",
    password="123456",
    database="cricket_data"
)

# Define the query to select all columns from the matches table
query = """SELECT matches.match_id, matches.gender, matches.start_date, matches.teams_type,
matches.match_type, matches.team_involved_one, matches.team_involved_two, info_section.outcome,
info_section.toss, info_section.city, info_section.venue
FROM matches JOIN
info_section ON
matches.match_id = info_section.match_id;"""

# Execute the query and fetch the results
cursor = connection.cursor()
cursor.execute(query)
matches_data = cursor.fetchall()

# Close the cursor and connection
cursor.close()
connection.close()

# Convert the fetched data to a pandas DataFrame
matches_df = pd.DataFrame(matches_data, columns=[desc[0] for desc in cursor.description])

In [14]:
matches_df.head()

Unnamed: 0,match_id,gender,start_date,teams_type,match_type,team_involved_one,team_involved_two,outcome,toss,city,venue
0,1000851,male,2016-11-03,international,Test,Australia,South Africa,"{""by"": {""runs"": 177}, ""winner"": ""South Africa""}","{""decision"": ""bat"", ""winner"": ""South Africa""}","""Perth""","""Western Australia Cricket Association Ground"""
1,1000853,male,2016-11-12,international,Test,Australia,South Africa,"{""by"": {""innings"": 1, ""runs"": 80}, ""winner"": ""...","{""decision"": ""field"", ""winner"": ""South Africa""}","""Hobart""","""Bellerive Oval"""
2,1000855,male,2016-11-24,international,Test,Australia,South Africa,"{""by"": {""wickets"": 7}, ""winner"": ""Australia""}","{""decision"": ""bat"", ""winner"": ""South Africa""}",,"""Adelaide Oval"""
3,1000881,male,2016-12-15,international,Test,Australia,Pakistan,"{""by"": {""runs"": 39}, ""winner"": ""Australia""}","{""decision"": ""bat"", ""winner"": ""Australia""}","""Brisbane""","""Brisbane Cricket Ground, Woolloongabba"""
4,1000883,male,2016-12-26,international,Test,Australia,Pakistan,"{""by"": {""innings"": 1, ""runs"": 18}, ""winner"": ""...","{""decision"": ""bat"", ""winner"": ""Pakistan""}",,"""Melbourne Cricket Ground"""


In [15]:
# Step 2: Define the Function
def get_unique_teams_by_match_type(matches_df):
    unique_teams_by_match_type = {}
    
    for match_type in matches_df['match_type'].unique():
        # Filter DataFrame for the current match type
        filtered_df = matches_df[matches_df['match_type'] == match_type]
        
        # Extract unique team names from both columns
        unique_teams = set(filtered_df['team_involved_one']).union(set(filtered_df['team_involved_two']))
        
        # Assign the unique team names to the dictionary
        unique_teams_by_match_type[match_type] = unique_teams
        
    return unique_teams_by_match_type

# Step 3: Get the Unique Teams by Match Type
unique_teams_by_match_type = get_unique_teams_by_match_type(matches_df)

# Step 4: Print the Results
for match_type, teams in unique_teams_by_match_type.items():
    print(f"{match_type}: {teams}")

Test: {'Sri Lanka', 'India', 'Afghanistan', 'ICC World XI', 'West Indies', 'England', 'New Zealand', 'Bangladesh', 'Australia', 'South Africa', 'Zimbabwe', 'Pakistan', 'Ireland'}
ODI: {'Hong Kong', 'Sri Lanka', 'India', 'Netherlands', 'Oman', 'Namibia', 'West Indies', 'England', 'New Zealand', 'Scotland', 'United Arab Emirates', 'Africa XI', 'Canada', 'Jersey', 'Papua New Guinea', 'United States of America', 'Afghanistan', 'Nepal', 'ICC World XI', 'South Africa', 'Thailand', 'Asia XI', 'Australia', 'Kenya', 'Bangladesh', 'Zimbabwe', 'Pakistan', 'Ireland', 'Bermuda'}
T20: {'Hong Kong', 'Kuwait', 'Sri Lanka', 'Oman', 'Bahamas', 'West Indies', 'England', 'Uganda', 'Bhutan', 'Romania', 'Cayman Islands', 'Austria', 'Brazil', 'Hungary', 'Eswatini', 'Czech Republic', 'Panama', 'Bulgaria', 'Jersey', 'Fiji', 'ICC World XI', 'Cambodia', 'Thailand', 'Cyprus', 'Saudi Arabia', 'Finland', 'Seychelles', 'Bahrain', 'Turkey', 'Australia', 'Kenya', 'Luxembourg', 'Mexico', 'Bermuda', 'India', 'Chile', 'E

In [16]:
from fuzzywuzzy import process

def find_similar_names_per_match_type(unique_teams_by_match_type, threshold=60):
    similar_names_by_match_type = {}

    for match_type, teams in unique_teams_by_match_type.items():
        teams_list = list(teams)
        potential_matches = {}
        
        for team in teams_list:
            matches = process.extract(team, teams_list, limit=None)
            similar_teams = [match[0] for match in matches if match[1] >= threshold and match[0] != team]
            if similar_teams:
                potential_matches[team] = similar_teams
        
        if potential_matches:
            similar_names_by_match_type[match_type] = potential_matches

    return similar_names_by_match_type

# Execute the function
similar_names_per_match_type = find_similar_names_per_match_type(unique_teams_by_match_type)

# Print the results
for match_type, similar_names in similar_names_per_match_type.items():
    print(f"{match_type}:")
    for team, matches in similar_names.items():
        print(f"  {team}: {matches}")


Test:
  India: ['West Indies']
  Afghanistan: ['Pakistan']
  West Indies: ['India']
  England: ['Ireland', 'New Zealand']
  New Zealand: ['England', 'Ireland']
  Pakistan: ['Afghanistan']
  Ireland: ['England', 'New Zealand']
ODI:
  India: ['West Indies']
  Netherlands: ['England', 'New Zealand', 'Ireland', 'Thailand']
  West Indies: ['India']
  England: ['Ireland', 'Netherlands', 'New Zealand']
  New Zealand: ['Netherlands', 'England', 'Ireland']
  Scotland: ['Thailand']
  United Arab Emirates: ['United States of America']
  Africa XI: ['South Africa', 'Asia XI']
  Papua New Guinea: ['Nepal']
  United States of America: ['United Arab Emirates', 'South Africa']
  Afghanistan: ['Pakistan']
  Nepal: ['Papua New Guinea']
  ICC World XI: ['Asia XI']
  South Africa: ['Africa XI', 'United States of America']
  Thailand: ['Ireland', 'Netherlands', 'Scotland']
  Asia XI: ['ICC World XI', 'Africa XI']
  Pakistan: ['Afghanistan']
  Ireland: ['England', 'Thailand', 'Netherlands', 'New Zealand']
T

In [17]:
# Define the mapping of old team names to current team names
team_name_mapping = {
    ('North-West Warriors',): 'North West Warriors',
    ('St Lucia Stars', 'St Lucia Zouks'): 'St Lucia Kings',
    ('Barbados Tridents',): 'Barbados Royals',
    ('St Lucia Zouks', 'St Lucia Kings'): 'St Lucia Kings',
	('Comilla Victorians',): 'Cumilla Warriors',
	('Chittagong Kings', 'Chittagong Vikings'): 'Chattogram Challengers',
	('Khulna Royal Bengals', 'Khulna Titans'): 'Khulna Tigers',
	('Barishal Burners', 'Barishal Bulls'): 'Fortune Barishal',
	('Dhaka Gladiators', 'Dhaka Dynamites', 'Dhaka Platoon', 'Beximco Dhaka', 'Minister Dhaka', 'Minister Group Dhaka', 'Dhaka Dominators'): 'Durdanto Dhaka',
    ('Sylhet Royals', 'Sylhet Super Stars', 'Sylhet Sixers', 'Sylhet Thunder', 'Sylhet Sunrisers'): 'Sylhet Strikers',
	('Rangpur Riders',): 'Rangpur Rangers',
	('Duronto Rajshahi', 'Rajshahi Kings'): 'Rajshahi Royals',
	('Royal Challengers Bangalore', ): 'Royal Challengers Bengaluru',
	('Delhi Daredevils',): 'Delhi Capitals',
	('Kings XI Punjab',): 'Punjab Kings',
	('Colombo Kings', 'Colombo Stars',): 'Colombo Strikers',
	('Dambulla Viiking', 'Dambulla Giants', 'Dambulla Aura',): 'Dambulla Sixers',
	('Galle Gladiators', 'Galle Titans'): 'Galle Marvels',
	('Kandy Tuskers', 'Kandy Warriors', 'Kandy Falcons'): 'B-Love Kandy',
	('Jaffna Stallions',): 'Jaffna Kings',
	# Add other mappings as needed
}

In [18]:
# Define the function to replace team names
def replace_team_names(matches_df, team_name_mapping):
    for old_names, new_name in team_name_mapping.items():
        for old_name in old_names:
            matches_df['team_involved_one'] = matches_df['team_involved_one'].replace(old_name, new_name)
            matches_df['team_involved_two'] = matches_df['team_involved_two'].replace(old_name, new_name)
    return matches_df

In [19]:
# Replace team names in the original DataFrame
matches_df = replace_team_names(matches_df, team_name_mapping)

# Display the updated DataFrame
print(matches_df.head())

  match_id gender  start_date     teams_type match_type team_involved_one  \
0  1000851   male  2016-11-03  international       Test         Australia   
1  1000853   male  2016-11-12  international       Test         Australia   
2  1000855   male  2016-11-24  international       Test         Australia   
3  1000881   male  2016-12-15  international       Test         Australia   
4  1000883   male  2016-12-26  international       Test         Australia   

  team_involved_two                                            outcome  \
0      South Africa    {"by": {"runs": 177}, "winner": "South Africa"}   
1      South Africa  {"by": {"innings": 1, "runs": 80}, "winner": "...   
2      South Africa      {"by": {"wickets": 7}, "winner": "Australia"}   
3          Pakistan        {"by": {"runs": 39}, "winner": "Australia"}   
4          Pakistan  {"by": {"innings": 1, "runs": 18}, "winner": "...   

                                              toss        city  \
0    {"decision": "bat", "

In [20]:
# Step 2: Define the Function
def get_unique_teams_by_match_type(matches_df):
    unique_teams_by_match_type = {}
    
    for match_type in matches_df['match_type'].unique():
        # Filter DataFrame for the current match type
        filtered_df = matches_df[matches_df['match_type'] == match_type]
        
        # Extract unique team names from both columns
        unique_teams = set(filtered_df['team_involved_one']).union(set(filtered_df['team_involved_two']))
        
        # Assign the unique team names to the dictionary
        unique_teams_by_match_type[match_type] = unique_teams
        
    return unique_teams_by_match_type

# Step 3: Get the Unique Teams by Match Type
unique_teams_by_match_type = get_unique_teams_by_match_type(matches_df)

# Step 4: Print the Results
for match_type, teams in unique_teams_by_match_type.items():
    print(f"{match_type}: {teams}")

Test: {'Sri Lanka', 'India', 'Afghanistan', 'ICC World XI', 'West Indies', 'England', 'New Zealand', 'Bangladesh', 'Australia', 'South Africa', 'Zimbabwe', 'Pakistan', 'Ireland'}
ODI: {'Hong Kong', 'Sri Lanka', 'India', 'Netherlands', 'Oman', 'Namibia', 'West Indies', 'England', 'New Zealand', 'Scotland', 'United Arab Emirates', 'Africa XI', 'Canada', 'Jersey', 'Papua New Guinea', 'United States of America', 'Afghanistan', 'Nepal', 'ICC World XI', 'South Africa', 'Thailand', 'Asia XI', 'Australia', 'Kenya', 'Bangladesh', 'Zimbabwe', 'Pakistan', 'Ireland', 'Bermuda'}
T20: {'Hong Kong', 'Kuwait', 'Sri Lanka', 'Oman', 'Bahamas', 'West Indies', 'England', 'Uganda', 'Bhutan', 'Romania', 'Cayman Islands', 'Austria', 'Brazil', 'Hungary', 'Eswatini', 'Czech Republic', 'Panama', 'Bulgaria', 'Jersey', 'Fiji', 'ICC World XI', 'Cambodia', 'Thailand', 'Cyprus', 'Saudi Arabia', 'Finland', 'Seychelles', 'Bahrain', 'Turkey', 'Australia', 'Kenya', 'Luxembourg', 'Mexico', 'Bermuda', 'India', 'Chile', 'E

In [21]:
from fuzzywuzzy import process

def find_similar_names_per_match_type(unique_teams_by_match_type, threshold=60):
    similar_names_by_match_type = {}

    for match_type, teams in unique_teams_by_match_type.items():
        teams_list = list(teams)
        potential_matches = {}
        
        for team in teams_list:
            matches = process.extract(team, teams_list, limit=None)
            similar_teams = [match[0] for match in matches if match[1] >= threshold and match[0] != team]
            if similar_teams:
                potential_matches[team] = similar_teams
        
        if potential_matches:
            similar_names_by_match_type[match_type] = potential_matches

    return similar_names_by_match_type

# Execute the function
similar_names_per_match_type = find_similar_names_per_match_type(unique_teams_by_match_type)

# Print the results
for match_type, similar_names in similar_names_per_match_type.items():
    print(f"{match_type}:")
    for team, matches in similar_names.items():
        print(f"  {team}: {matches}")

Test:
  India: ['West Indies']
  Afghanistan: ['Pakistan']
  West Indies: ['India']
  England: ['Ireland', 'New Zealand']
  New Zealand: ['England', 'Ireland']
  Pakistan: ['Afghanistan']
  Ireland: ['England', 'New Zealand']
ODI:
  India: ['West Indies']
  Netherlands: ['England', 'New Zealand', 'Ireland', 'Thailand']
  West Indies: ['India']
  England: ['Ireland', 'Netherlands', 'New Zealand']
  New Zealand: ['Netherlands', 'England', 'Ireland']
  Scotland: ['Thailand']
  United Arab Emirates: ['United States of America']
  Africa XI: ['South Africa', 'Asia XI']
  Papua New Guinea: ['Nepal']
  United States of America: ['United Arab Emirates', 'South Africa']
  Afghanistan: ['Pakistan']
  Nepal: ['Papua New Guinea']
  ICC World XI: ['Asia XI']
  South Africa: ['Africa XI', 'United States of America']
  Thailand: ['Ireland', 'Netherlands', 'Scotland']
  Asia XI: ['ICC World XI', 'Africa XI']
  Pakistan: ['Afghanistan']
  Ireland: ['England', 'Thailand', 'Netherlands', 'New Zealand']
T

In [22]:
def matches_per_team(teams_type):
    # Filter matches based on the provided team type
    filtered_matches = matches_df[matches_df['teams_type'] == teams_type]
    
    # Create a list of all teams involved in the filtered matches
    all_teams = pd.concat([filtered_matches['team_involved_one'], filtered_matches['team_involved_two']]).unique()
    
    # Initialize a dictionary to store the count of matches per team
    matches_per_team = {team: 0 for team in all_teams}
    
    # Update the matches count for each team
    for team in all_teams:
        matches_per_team[team] = ((filtered_matches['team_involved_one'] == team) | (filtered_matches['team_involved_two'] == team)).sum()
    
    # Sort the dictionary items by value in descending order
    matches_per_team_sorted = dict(sorted(matches_per_team.items(), key=lambda item: item[1], reverse=True))
    
    return matches_per_team_sorted

In [23]:
club_match_per_team = matches_per_team("club")
print(club_match_per_team)

{'Hampshire': 362, 'Nottinghamshire': 360, 'Somerset': 356, 'Essex': 355, 'Surrey': 354, 'Kent': 353, 'Worcestershire': 352, 'Lancashire': 351, 'Durham': 349, 'Gloucestershire': 348, 'Yorkshire': 347, 'Middlesex': 346, 'Northamptonshire': 345, 'Sussex': 345, 'Leicestershire': 342, 'Glamorgan': 339, 'Derbyshire': 337, 'Mumbai Indians': 280, 'Warwickshire': 274, 'Royal Challengers Bengaluru': 273, 'Delhi Capitals': 270, 'Perth Scorchers': 265, 'Sydney Sixers': 261, 'Adelaide Strikers': 258, 'Brisbane Heat': 258, 'Melbourne Stars': 254, 'Kolkata Knight Riders': 251, 'Punjab Kings': 246, 'Melbourne Renegades': 243, 'Hobart Hurricanes': 242, 'Sydney Thunder': 241, 'Chennai Super Kings': 238, 'Rajasthan Royals': 221, 'Canterbury': 203, 'Northern Districts': 199, 'Sunrisers Hyderabad': 182, 'Guyana Amazon Warriors': 127, 'Jamaica Tallawahs': 117, 'Barbados Royals': 117, 'Otago': 117, 'Durdanto Dhaka': 116, 'Wellington': 115, 'Central Districts': 114, 'Auckland': 113, 'Southern Vipers': 111, '

In [24]:
matches_df.head()

Unnamed: 0,match_id,gender,start_date,teams_type,match_type,team_involved_one,team_involved_two,outcome,toss,city,venue
0,1000851,male,2016-11-03,international,Test,Australia,South Africa,"{""by"": {""runs"": 177}, ""winner"": ""South Africa""}","{""decision"": ""bat"", ""winner"": ""South Africa""}","""Perth""","""Western Australia Cricket Association Ground"""
1,1000853,male,2016-11-12,international,Test,Australia,South Africa,"{""by"": {""innings"": 1, ""runs"": 80}, ""winner"": ""...","{""decision"": ""field"", ""winner"": ""South Africa""}","""Hobart""","""Bellerive Oval"""
2,1000855,male,2016-11-24,international,Test,Australia,South Africa,"{""by"": {""wickets"": 7}, ""winner"": ""Australia""}","{""decision"": ""bat"", ""winner"": ""South Africa""}",,"""Adelaide Oval"""
3,1000881,male,2016-12-15,international,Test,Australia,Pakistan,"{""by"": {""runs"": 39}, ""winner"": ""Australia""}","{""decision"": ""bat"", ""winner"": ""Australia""}","""Brisbane""","""Brisbane Cricket Ground, Woolloongabba"""
4,1000883,male,2016-12-26,international,Test,Australia,Pakistan,"{""by"": {""innings"": 1, ""runs"": 18}, ""winner"": ""...","{""decision"": ""bat"", ""winner"": ""Pakistan""}",,"""Melbourne Cricket Ground"""


In [25]:
def transform_dataframe(df):
    # 1. Change the start_date to datetime
    df['start_date'] = pd.to_datetime(df['start_date'])

    # 2. Split the toss field into toss_winner and toss_decision
    def extract_toss_details(toss):
        toss_data = json.loads(toss)  # Parse the JSON string to a dictionary
        return toss_data.get('winner'), toss_data.get('decision')
    
    df[['toss_winner', 'toss_decision']] = df['toss'].apply(lambda x: pd.Series(extract_toss_details(x)))

    # Drop the original toss column
    df.drop(columns=['toss'], inplace=True)

    # 3. Remove double quotes from the city and venue field
    df['city'] = df['city'].str.strip('"')
    df['venue'] = df['venue'].str.strip('"')

    return df

In [26]:
matches_df = transform_dataframe(matches_df)

In [27]:
matches_df.head()

Unnamed: 0,match_id,gender,start_date,teams_type,match_type,team_involved_one,team_involved_two,outcome,city,venue,toss_winner,toss_decision
0,1000851,male,2016-11-03,international,Test,Australia,South Africa,"{""by"": {""runs"": 177}, ""winner"": ""South Africa""}",Perth,Western Australia Cricket Association Ground,South Africa,bat
1,1000853,male,2016-11-12,international,Test,Australia,South Africa,"{""by"": {""innings"": 1, ""runs"": 80}, ""winner"": ""...",Hobart,Bellerive Oval,South Africa,field
2,1000855,male,2016-11-24,international,Test,Australia,South Africa,"{""by"": {""wickets"": 7}, ""winner"": ""Australia""}",,Adelaide Oval,South Africa,bat
3,1000881,male,2016-12-15,international,Test,Australia,Pakistan,"{""by"": {""runs"": 39}, ""winner"": ""Australia""}",Brisbane,"Brisbane Cricket Ground, Woolloongabba",Australia,bat
4,1000883,male,2016-12-26,international,Test,Australia,Pakistan,"{""by"": {""innings"": 1, ""runs"": 18}, ""winner"": ""...",,Melbourne Cricket Ground,Pakistan,bat


In [28]:
def process_outcome(outcome):
    # Initialize default values
    result = winner = runs = innings = wickets = method = None

    # Parse the JSON string to a dictionary
    outcome_data = json.loads(outcome)

    # Extract the common fields
    if 'result' in outcome_data:
        result = outcome_data['result']
    if 'winner' in outcome_data:
        winner = outcome_data['winner']
    if 'method' in outcome_data:
        method = outcome_data['method']
    if 'by' in outcome_data:
        by = outcome_data['by']
        if 'runs' in by:
            runs = by['runs']
        if 'innings' in by:
            innings = by['innings']
        if 'wickets' in by:
            wickets = by['wickets']

    return result, winner, runs, innings, wickets, method

# Apply the function to the outcome column and create new columns
matches_df[['result', 'winner', 'runs', 'innings', 'wickets', 'method']] = matches_df['outcome'].apply(
    lambda x: pd.Series(process_outcome(x))
)

# Drop the original outcome column
matches_df.drop(columns=['outcome'], inplace=True)

In [29]:
matches_df.head()

Unnamed: 0,match_id,gender,start_date,teams_type,match_type,team_involved_one,team_involved_two,city,venue,toss_winner,toss_decision,result,winner,runs,innings,wickets,method
0,1000851,male,2016-11-03,international,Test,Australia,South Africa,Perth,Western Australia Cricket Association Ground,South Africa,bat,,South Africa,177.0,,,
1,1000853,male,2016-11-12,international,Test,Australia,South Africa,Hobart,Bellerive Oval,South Africa,field,,South Africa,80.0,1.0,,
2,1000855,male,2016-11-24,international,Test,Australia,South Africa,,Adelaide Oval,South Africa,bat,,Australia,,,7.0,
3,1000881,male,2016-12-15,international,Test,Australia,Pakistan,Brisbane,"Brisbane Cricket Ground, Woolloongabba",Australia,bat,,Australia,39.0,,,
4,1000883,male,2016-12-26,international,Test,Australia,Pakistan,,Melbourne Cricket Ground,Pakistan,bat,,Australia,18.0,1.0,,


In [30]:
import pandas as pd

# Create a mapping of venues to cities
def create_venue_city_mapping(df):
    venue_city_mapping = df.dropna(subset=['venue', 'city']).drop_duplicates(subset=['venue', 'city'])
    venue_city_dict = dict(zip(venue_city_mapping['venue'], venue_city_mapping['city']))
    return venue_city_dict

# Fill missing city values based on the venue
def fill_missing_cities(df, venue_city_dict):
    df['city'] = df.apply(
        lambda row: venue_city_dict[row['venue']] if pd.isnull(row['city']) and row['venue'] in venue_city_dict else row['city'],
        axis=1
    )
    return df

# Create the venue to city mapping
venue_city_dict = create_venue_city_mapping(matches_df)

# Fill the missing city values
matches_df = fill_missing_cities(matches_df, venue_city_dict)

# Check if the missing city values have been filled
print(matches_df[['venue', 'city']].head(20))  # Adjust the number of rows to print as needed


                                            venue      city
0    Western Australia Cricket Association Ground     Perth
1                                  Bellerive Oval    Hobart
2                                   Adelaide Oval      null
3          Brisbane Cricket Ground, Woolloongabba  Brisbane
4                        Melbourne Cricket Ground      null
5                           Sydney Cricket Ground      null
6          Brisbane Cricket Ground, Woolloongabba  Brisbane
7                        Melbourne Cricket Ground      null
8    Western Australia Cricket Association Ground     Perth
9                           Sydney Cricket Ground      null
10                                  Adelaide Oval      null
11                       Melbourne Cricket Ground      null
12                 Simonds Stadium, South Geelong  Victoria
13                                  Adelaide Oval      null
14                          Sydney Cricket Ground      null
15                                    Ma

In [31]:
import requests
from bs4 import BeautifulSoup

# Function to fetch city for a given venue
def fetch_city_for_venue(venue):
    try:
        search_url = f"https://en.wikipedia.org/wiki/{venue.replace(' ', '_')}"
        print(search_url)
        response = requests.get(search_url)
        if response.status_code != 200:
            print(f"Error fetching page for {venue}: Status code {response.status_code}")
            return None
        
        soup = BeautifulSoup(response.content, 'html.parser')
        infobox = soup.find('table', {'class': 'infobox'})

        if infobox:
            row = infobox.find('th', text='Location')
            if row:
                city = row.find_next_sibling('td').text.strip()
                return city
        
        print(f"Error fetching city for {venue}: 'Location' row not found")
        return None

    except Exception as e:
        print(f"Error fetching city for {venue}: {e}")
        return None

# List of venues
venues = [
    "Adelaide Oval",
    "Melbourne Cricket Ground",
    "Sydney Cricket Ground",
    "Harare Sports Club",
    "Sydney Showground Stadium",
    "Warner Park, Basseterre",
    "Rangiri Dambulla International Stadium",
    "Johor Cricket Academy Oval",
    "Dubai International Cricket Stadium",
    "Sharjah Cricket Stadium",
    "Saxton Oval",
    "Hagley Oval",
    "Bay Oval",
    "Cello Basin Reserve",
    "Kennards Hire Community Oval",
    "Seddon Park",
    "University of Otago Oval",
    "Nelson Park",
    "Mainpower Oval",
    "Colin Maiden Park",
    "Cobham Oval",
    "Eden Park",
    "Westpac Stadium",
    "Sano International Cricket Ground",
    "Arundel Castle Cricket Club Ground",
    "Uxbridge Cricket Club Ground",
    "Perth Stadium",
    "Colombo Cricket Club Ground",
    "Galle International Stadium",
    "Royal Chiangmai Golf Club",
    "Entebbe Cricket Oval",
    "Pallekele International Cricket Stadium",
    "Molyneux Park",
    "McLean Park",
    "Saurashtra Cricket Association Stadium",
    "Shaheed Veer Narayan Singh International Stadium",
    "Arun Jaitley Stadium",
    "Dr. Y.S. Rajasekhara Reddy ACA VDCA Cricket Stadium",
    "JSCA International Stadium Complex",
    "Dr P.V.G. Raju ACA Sports Complex",
    "Sylhet International Cricket Stadium",
    "Bulawayo Athletic Club",
    "JU Second Campus, Salt Lake",
    "Eden Gardens",
    "Carrara Oval",
    "Gahanga International Cricket Stadium, Rwanda",
    "Dr. Gokaraju Laila Ganga Raju ACA Cricket Complex - DVR Ground, Mulapadu",
    "Dr. Gokaraju Laila Ganga Raju ACA Cricket Complex - CP Ground, Mulapadu",
    "Lalbhai Contractor Stadium",
    "C B Patel Ground",
    "Holkar Stadium",
    "Emerald Heights International School Ground",
    "Barabati Stadium",
    "DRIEMS Ground",
    "Airforce Complex ground, Palam",
    "Airforce Complex ground, Palam II",
    "Hong Kong Cricket Club",
    "Al Amerat Cricket Ground Oman Cricket (Ministry Turf 1)",
    "Al Amerat Cricket Ground Oman Cricket (Ministry Turf 2)",
    "Queen's Park",
    "Sir Vivian Richards Stadium, North Sound",
    "St'Xavier's KCA Cricket Ground",
    "Greenfield Stadium",
    "Cricket Stadium, Sector-16",
    "GSSS, Sector 26",
    "BKC Ground",
    "Wankhede Stadium",
    "Rawalpindi Cricket Stadium",
    "Moara Vlasiei Cricket Ground",
    "Alur Cricket Stadium",
    "Alur Cricket Stadium II",
    "Alur Cricket Stadium III",
    "Jadavpur University Campus",
    "Motibaug Cricket Ground",
    "F B Colony Ground",
    "Reliance Cricket Stadium",
    "Sharad Pawar Cricket Academy BKC",
    "SSN College Ground",
    "T I Murugappa Ground",
    "Sri Ramachandra Medical College",
    "IC-Gurunanak College Ground",
    "Narendra Modi Stadium Ground 'A', Motera",
    "Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium",
    "Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium B",
    "ACA Stadium, Barsapara",
    "Nehru Stadium",
    "Gurugram Cricket Ground (SRNCC)",
    "Chaudhry Bansi Lal Cricket Stadium",
    "Gokaraju Liala Gangaaraju ACA Cricket Ground",
    "ACA Stadium, Mangalagiri",
    "Alembic 2 Cricket Ground",
    "County Ground, Chelmsford",
    "Fitzherbert Park",
    "Tafawa Balewa Square (TBS) Cricket Oval",
    "San Albano",
    "St Georges Quilmes",
    "Multan Cricket Stadium",
    "Chittagong Divisional Stadium",
    "Queenstown Events Centre",
    "Stellenbosch University 1",
    "Stellenbosch University 2",
    "Sheikhupura Stadium",
    "Dubai Sports City Cricket Stadium",
    "Mombasa Sports Club Ground",
    "Sharjah Cricket Association Stadium",
    "Guanggong International Cricket Stadium",
    "Louth Cricket Club",
    "Sylhet Stadium",
    "Al Dhaid Cricket Village",
    "Adelaide Oval No. 2",
    "Vidarbha Cricket Association Stadium, Jamtha",
    "VCA Ground",
    "Jawaharlal Nehru Stadium",
    "St Paul's College Ground, Kalamassery",
    "Alembic 1 Cricket Ground",
    "West Mersea Cricket Club",
    "Guyana National Stadium, Providence"
]

# Dictionary to store the mapping of venue to city
venue_city_mapping = {}

# Fetch city for each venue
for venue in venues:
    city = fetch_city_for_venue(venue)
    if city:
        venue_city_mapping[venue] = city

# Print the venue to city mapping
for venue, city in venue_city_mapping.items():
    print(f"{venue}: {city}")

# Save the mapping to a file or use it as needed
import json

with open("venue_city_mapping.json", "w") as file:
    json.dump(venue_city_mapping, file)


https://en.wikipedia.org/wiki/Adelaide_Oval


  row = infobox.find('th', text='Location')


https://en.wikipedia.org/wiki/Melbourne_Cricket_Ground
https://en.wikipedia.org/wiki/Sydney_Cricket_Ground
https://en.wikipedia.org/wiki/Harare_Sports_Club
https://en.wikipedia.org/wiki/Sydney_Showground_Stadium
https://en.wikipedia.org/wiki/Warner_Park,_Basseterre
https://en.wikipedia.org/wiki/Rangiri_Dambulla_International_Stadium
https://en.wikipedia.org/wiki/Johor_Cricket_Academy_Oval
Error fetching page for Johor Cricket Academy Oval: Status code 404
https://en.wikipedia.org/wiki/Dubai_International_Cricket_Stadium
Error fetching city for Dubai International Cricket Stadium: 'Location' row not found
https://en.wikipedia.org/wiki/Sharjah_Cricket_Stadium
https://en.wikipedia.org/wiki/Saxton_Oval
https://en.wikipedia.org/wiki/Hagley_Oval
https://en.wikipedia.org/wiki/Bay_Oval
https://en.wikipedia.org/wiki/Cello_Basin_Reserve
Error fetching page for Cello Basin Reserve: Status code 404
https://en.wikipedia.org/wiki/Kennards_Hire_Community_Oval
Error fetching page for Kennards Hire Com

Error fetching page for ACA Stadium, Mangalagiri: Status code 404
https://en.wikipedia.org/wiki/Alembic_2_Cricket_Ground
Error fetching page for Alembic 2 Cricket Ground: Status code 404
https://en.wikipedia.org/wiki/County_Ground,_Chelmsford
https://en.wikipedia.org/wiki/Fitzherbert_Park
https://en.wikipedia.org/wiki/Tafawa_Balewa_Square_(TBS)_Cricket_Oval
Error fetching page for Tafawa Balewa Square (TBS) Cricket Oval: Status code 404
https://en.wikipedia.org/wiki/San_Albano
Error fetching page for San Albano: Status code 404
https://en.wikipedia.org/wiki/St_Georges_Quilmes
Error fetching page for St Georges Quilmes: Status code 404
https://en.wikipedia.org/wiki/Multan_Cricket_Stadium
https://en.wikipedia.org/wiki/Chittagong_Divisional_Stadium
https://en.wikipedia.org/wiki/Queenstown_Events_Centre
https://en.wikipedia.org/wiki/Stellenbosch_University_1
Error fetching page for Stellenbosch University 1: Status code 404
https://en.wikipedia.org/wiki/Stellenbosch_University_2
Error fetc

In [32]:
import pandas as pd
import json

# Assuming matches_df is already created and has columns 'venue' and 'city'

# Load the venue-city mapping from the JSON file
with open("venue_city_mapping.json", "r") as file:
    venue_city_mapping = json.load(file)

# Function to fill missing city values based on the venue
def fill_missing_city(row):
    if row['city'] == "null" and row['venue'] in venue_city_mapping:
        return venue_city_mapping[row['venue']]
    return row['city']

# Apply the function to fill missing city values
matches_df['city'] = matches_df.apply(fill_missing_city, axis=1)

# Check if any city values are still "null"
missing_cities = matches_df[matches_df['city'] == "null"]

# Verify the DataFrame
print(matches_df.head())

# Print the rows with "null" city values
if not missing_cities.empty:
    print("Rows with 'null' city values:")
    print(missing_cities)
else:
    print("No 'null' city values.")


  match_id gender start_date     teams_type match_type team_involved_one  \
0  1000851   male 2016-11-03  international       Test         Australia   
1  1000853   male 2016-11-12  international       Test         Australia   
2  1000855   male 2016-11-24  international       Test         Australia   
3  1000881   male 2016-12-15  international       Test         Australia   
4  1000883   male 2016-12-26  international       Test         Australia   

  team_involved_two                                               city  \
0      South Africa                                              Perth   
1      South Africa                                             Hobart   
2      South Africa  War Memorial DriveNorth Adelaide, South Austra...   
3          Pakistan                                           Brisbane   
4          Pakistan                                         Yarra Park   

                                          venue   toss_winner toss_decision  \
0  Western Australi

In [33]:
import pandas as pd
import json

# Assuming matches_df is already created and has columns 'venue' and 'city'

# Load the venue-city mapping from the JSON file
with open("venue_city_mapping.json", "r") as file:
    venue_city_mapping = json.load(file)

# Function to fill missing city values based on the venue
def fill_missing_city(row):
    if row['city'] == "null" and row['venue'] in venue_city_mapping:
        return venue_city_mapping[row['venue']]
    return row['city']

# Apply the function to fill missing city values
matches_df['city'] = matches_df.apply(fill_missing_city, axis=1)

# Check if any city values are still "null"
missing_cities = matches_df[matches_df['city'] == "null"]

# Verify the DataFrame
print(matches_df.head())

# Print the rows with "null" city values along with their venues
if not missing_cities.empty:
    print("Rows with 'null' city values and their corresponding venues:")
    print(missing_cities[['venue', 'city']])
else:
    print("No 'null' city values.")


  match_id gender start_date     teams_type match_type team_involved_one  \
0  1000851   male 2016-11-03  international       Test         Australia   
1  1000853   male 2016-11-12  international       Test         Australia   
2  1000855   male 2016-11-24  international       Test         Australia   
3  1000881   male 2016-12-15  international       Test         Australia   
4  1000883   male 2016-12-26  international       Test         Australia   

  team_involved_two                                               city  \
0      South Africa                                              Perth   
1      South Africa                                             Hobart   
2      South Africa  War Memorial DriveNorth Adelaide, South Austra...   
3          Pakistan                                           Brisbane   
4          Pakistan                                         Yarra Park   

                                          venue   toss_winner toss_decision  \
0  Western Australi

In [34]:
import pandas as pd
import json

# Assuming matches_df is already created and has columns 'venue' and 'city'

# Load the venue-city mapping from the JSON file
with open("venue_city_mapping.json", "r") as file:
    venue_city_mapping = json.load(file)

# Function to fill missing city values based on the venue
def fill_missing_city(row):
    if row['city'] == "null" and row['venue'] in venue_city_mapping:
        return venue_city_mapping[row['venue']]
    return row['city']

# Apply the function to fill missing city values
matches_df['city'] = matches_df.apply(fill_missing_city, axis=1)

# Check if any city values are still "null"
missing_cities = matches_df[matches_df['city'] == "null"]

# Verify the DataFrame
print(matches_df.head())

# Print the rows with "null" city values along with their venues grouped by venue
if not missing_cities.empty:
    print("Rows with 'null' city values grouped by venue:")
    grouped_missing_cities = missing_cities.groupby('venue')['city'].count()
    print(grouped_missing_cities)
else:
    print("No 'null' city values.")


  match_id gender start_date     teams_type match_type team_involved_one  \
0  1000851   male 2016-11-03  international       Test         Australia   
1  1000853   male 2016-11-12  international       Test         Australia   
2  1000855   male 2016-11-24  international       Test         Australia   
3  1000881   male 2016-12-15  international       Test         Australia   
4  1000883   male 2016-12-26  international       Test         Australia   

  team_involved_two                                               city  \
0      South Africa                                              Perth   
1      South Africa                                             Hobart   
2      South Africa  War Memorial DriveNorth Adelaide, South Austra...   
3          Pakistan                                           Brisbane   
4          Pakistan                                         Yarra Park   

                                          venue   toss_winner toss_decision  \
0  Western Australi

In [35]:
import pandas as pd
import json

# Assuming matches_df is already created and has columns 'venue' and 'city'

# Load the venue-city mapping from the JSON file
with open("venue_city_mapping.json", "r") as file:
    venue_city_mapping = json.load(file)

# Function to fill missing city values based on the venue
def fill_missing_city(row):
    if row['city'] == "null" and row['venue'] in venue_city_mapping:
        return venue_city_mapping[row['venue']]
    return row['city']

# Apply the function to fill missing city values
matches_df['city'] = matches_df.apply(fill_missing_city, axis=1)

# Check if any city values are still "null"
missing_cities = matches_df[matches_df['city'] == "null"]

# Verify the DataFrame
print(matches_df.head())

# Print the rows with "null" city values along with their venues grouped by venue
if not missing_cities.empty:
    print("Rows with 'null' city values grouped by venue:")
    grouped_missing_cities = missing_cities.groupby('venue')['city'].count()
    print(grouped_missing_cities)
else:
    print("No 'null' city values.")


  match_id gender start_date     teams_type match_type team_involved_one  \
0  1000851   male 2016-11-03  international       Test         Australia   
1  1000853   male 2016-11-12  international       Test         Australia   
2  1000855   male 2016-11-24  international       Test         Australia   
3  1000881   male 2016-12-15  international       Test         Australia   
4  1000883   male 2016-12-26  international       Test         Australia   

  team_involved_two                                               city  \
0      South Africa                                              Perth   
1      South Africa                                             Hobart   
2      South Africa  War Memorial DriveNorth Adelaide, South Austra...   
3          Pakistan                                           Brisbane   
4          Pakistan                                         Yarra Park   

                                          venue   toss_winner toss_decision  \
0  Western Australi

In [36]:
# Drop the 'venue' column from the DataFrame
matches_df = matches_df.drop(columns=['venue'])

# Verify the DataFrame
print(matches_df.head())

  match_id gender start_date     teams_type match_type team_involved_one  \
0  1000851   male 2016-11-03  international       Test         Australia   
1  1000853   male 2016-11-12  international       Test         Australia   
2  1000855   male 2016-11-24  international       Test         Australia   
3  1000881   male 2016-12-15  international       Test         Australia   
4  1000883   male 2016-12-26  international       Test         Australia   

  team_involved_two                                               city  \
0      South Africa                                              Perth   
1      South Africa                                             Hobart   
2      South Africa  War Memorial DriveNorth Adelaide, South Austra...   
3          Pakistan                                           Brisbane   
4          Pakistan                                         Yarra Park   

    toss_winner toss_decision result        winner   runs  innings  wickets  \
0  South Africa    

In [37]:
matches_df.head()

Unnamed: 0,match_id,gender,start_date,teams_type,match_type,team_involved_one,team_involved_two,city,toss_winner,toss_decision,result,winner,runs,innings,wickets,method
0,1000851,male,2016-11-03,international,Test,Australia,South Africa,Perth,South Africa,bat,,South Africa,177.0,,,
1,1000853,male,2016-11-12,international,Test,Australia,South Africa,Hobart,South Africa,field,,South Africa,80.0,1.0,,
2,1000855,male,2016-11-24,international,Test,Australia,South Africa,"War Memorial DriveNorth Adelaide, South Austra...",South Africa,bat,,Australia,,,7.0,
3,1000881,male,2016-12-15,international,Test,Australia,Pakistan,Brisbane,Australia,bat,,Australia,39.0,,,
4,1000883,male,2016-12-26,international,Test,Australia,Pakistan,Yarra Park,Pakistan,bat,,Australia,18.0,1.0,,


In [38]:
# Group by 'city' and count the number of matches
city_counts = matches_df.groupby('city').size().reset_index(name='match_count')

# Print the grouped data
city_counts


Unnamed: 0,city,match_count
0,"300-1 Tochimoto-cho \nSano, Tochigi-ken 327-03...",11
1,Aberdeen,13
2,Abu Dhabi,298
3,Accra,25
4,Adelaide,152
...,...,...
374,Worcester,191
375,Wormsley,6
376,Yarra Park,90
377,York,12


In [39]:
import pandas as pd
from fuzzywuzzy import fuzz

# Assuming matches_df already exists and contains a column 'city'

# Step 1: Extract unique city names from the dataframe
unique_cities = matches_df['city'].dropna().unique()

# Step 2: Initialize an empty list to store similar city pairs
similar_city_pairs = []

# Step 3: Perform fuzzy matching
for i in range(len(unique_cities)):
    city1 = unique_cities[i]
    for j in range(i + 1, len(unique_cities)):
        city2 = unique_cities[j]
        similarity_ratio = fuzz.ratio(city1.lower(), city2.lower())
        if similarity_ratio > 60:  # Adjust the threshold as needed
            similar_city_pairs.append((city1, city2))

# Step 4: Print the list of similar city pairs
print("Similar city pairs found based on fuzzy matching:")
for pair in similar_city_pairs:
    print(pair)

Similar city pairs found based on fuzzy matching:
('Brisbane', 'Strabane')
('Victoria', 'Pretoria')
('Canberra', 'Canterbury')
('Canberra', 'Carrara')
('Canberra', 'Albergaria')
('Dublin', 'Durban')
('Dublin', 'Dunedin')
('Dublin', 'Dubai')
('Waringstown', 'Wellington')
('Waringstown', 'Brighton')
('Waringstown', 'Bridgetown')
('Waringstown', 'Kingston')
('Waringstown', 'Kingstown')
('Comber', 'Colombo')
('Edinburgh', 'Sedbergh')
('Durban', 'Dubai')
('Paarl', 'Pearland')
('Paarl', 'Peshawar')
('Benoni', 'Bendigo')
('Benoni', 'Chennai')
('Potchefstroom', 'Colchester')
('Port Elizabeth', 'Port Vila')
('East London', 'London')
('Amstelveen', 'Castel')
('Nairobi', 'Napier')
('Londonderry', 'London')
('Londonderry', 'Derry')
('Guyana', 'Guacima')
('Jamaica', 'Mackay')
('Christchurch', 'Christchurch, Canterbury')
('Nelson', 'Launceston')
('Nelson', 'Geelong')
('Nelson', 'Incheon')
('Napier', 'Nagpur')
('Napier', 'Navile')
('Wellington', 'Wellington, New Zealand')
('Wellington', 'Eglinton')
(

In [40]:
city_mapping = {
    "East London": "London",
    "Navi Mumbai": "Mumbai",
    "Kigali City": "Kigali",
    "Wong Nai Chung Gap, Hong Kong": "Wong Nai Chung Gap",
    "Dehra Dun": "Dehradun",
    "Dharamsala": "Dharmasala",
    "Delhi": "New Delhi",
    "Bangalore": "Bengaluru"
}

# Step 6: Use the mapping to replace city names in the dataframe
matches_df['city'] = matches_df['city'].replace(city_mapping)

# Verify the updated dataframe
print("\nUpdated dataframe with replaced city names:")
matches_df.head()


Updated dataframe with replaced city names:


Unnamed: 0,match_id,gender,start_date,teams_type,match_type,team_involved_one,team_involved_two,city,toss_winner,toss_decision,result,winner,runs,innings,wickets,method
0,1000851,male,2016-11-03,international,Test,Australia,South Africa,Perth,South Africa,bat,,South Africa,177.0,,,
1,1000853,male,2016-11-12,international,Test,Australia,South Africa,Hobart,South Africa,field,,South Africa,80.0,1.0,,
2,1000855,male,2016-11-24,international,Test,Australia,South Africa,"War Memorial DriveNorth Adelaide, South Austra...",South Africa,bat,,Australia,,,7.0,
3,1000881,male,2016-12-15,international,Test,Australia,Pakistan,Brisbane,Australia,bat,,Australia,39.0,,,
4,1000883,male,2016-12-26,international,Test,Australia,Pakistan,Yarra Park,Pakistan,bat,,Australia,18.0,1.0,,


In [41]:
# Group by 'city' and count the number of matches
city_counts = matches_df.groupby('city').size().reset_index(name='match_count')

# Print the grouped data
city_counts

Unnamed: 0,city,match_count
0,"300-1 Tochimoto-cho \nSano, Tochigi-ken 327-03...",11
1,Aberdeen,13
2,Abu Dhabi,298
3,Accra,25
4,Adelaide,152
...,...,...
368,Worcester,191
369,Wormsley,6
370,Yarra Park,90
371,York,12


In [42]:
matches_df.head()

Unnamed: 0,match_id,gender,start_date,teams_type,match_type,team_involved_one,team_involved_two,city,toss_winner,toss_decision,result,winner,runs,innings,wickets,method
0,1000851,male,2016-11-03,international,Test,Australia,South Africa,Perth,South Africa,bat,,South Africa,177.0,,,
1,1000853,male,2016-11-12,international,Test,Australia,South Africa,Hobart,South Africa,field,,South Africa,80.0,1.0,,
2,1000855,male,2016-11-24,international,Test,Australia,South Africa,"War Memorial DriveNorth Adelaide, South Austra...",South Africa,bat,,Australia,,,7.0,
3,1000881,male,2016-12-15,international,Test,Australia,Pakistan,Brisbane,Australia,bat,,Australia,39.0,,,
4,1000883,male,2016-12-26,international,Test,Australia,Pakistan,Yarra Park,Pakistan,bat,,Australia,18.0,1.0,,
