<h1>NBA correlation problem</h1>

In [3]:
def nba_correlation(nba_stats=None, cities=None):
    """Correlate metro-area population with the 2018 NBA win/loss ratio.

    Each 2018 team is assigned to a metropolitan area, the teams' ``W/L%``
    values are averaged per area, and the Pearson correlation between the
    area population and that average is returned.

    Args:
        nba_stats: Optional DataFrame with ``team``, ``W/L%`` and ``year``
            columns. When omitted, ``assets/nba.csv`` is read.
        cities: Optional DataFrame with ``Metropolitan area``,
            ``Population (2016 est.)[8]`` and ``NBA`` columns. When omitted,
            the second table of ``assets/wikipedia_data.html`` is read (minus
            its last row). A frame passed in explicitly is used as given.

    Returns:
        The Pearson correlation coefficient (first element of
        ``scipy.stats.pearsonr``'s result).

    Raises:
        KeyError: If a required column is missing from either frame.
        ValueError: Propagated from ``scipy.stats.pearsonr`` when fewer than
            two metropolitan areas remain after merging.
    """
    # Imports stay function-local, as in the original notebook cell.
    import pandas as pd
    import scipy.stats as stats

    if nba_stats is None:
        nba_stats = pd.read_csv("assets/nba.csv")
    if cities is None:
        cities = pd.read_html("assets/wikipedia_data.html")[1]
        # Drop the last row and keep only the name/population/league columns.
        cities = cities.iloc[:-1, [0, 3, 5, 6, 7, 8]]

    def clean_up_cities_rules(cities, sport):
        """Return City/Population/<sport> rows for areas that have a team.

        Works on a copy, so the caller's frame is never modified.
        """
        renamed = cities.rename(
            columns={
                "Population (2016 est.)[8]": "Population",
                "Metropolitan area": "City",
            }
        )
        cleaned = renamed[["City", "Population", sport]].copy()

        teams = cleaned[sport]
        # An em dash marks "no team in this league".
        teams = teams.where(teams != "—")
        # Strip bracketed footnote markers such as "[note 3]".
        teams = teams.str.replace(r"\[.*\]", "", regex=True)
        # A cell that held only a footnote is empty now, i.e. also "no team".
        teams = teams.where(teams != "")

        return cleaned.assign(**{sport: teams}).dropna()

    metro_rules_NBA = clean_up_cities_rules(cities, "NBA")

    # Keep the 2018 season only, on an explicit copy so the column
    # assignments below never write into the caller's frame.
    season = nba_stats.loc[nba_stats["year"] == 2018, ["team", "W/L%"]].copy()
    # Drop everything from the first "*" or "(" onwards (playoff marker and
    # seed), e.g. "Toronto Raptors* (1)" -> "Toronto Raptors".
    season["team"] = (
        season["team"].str.replace(r"[*(].*", "", regex=True).str.strip()
    )
    season["W/L%"] = season["W/L%"].astype(float)

    # Teams whose name does not start with their metropolitan area's name.
    problematic_mapping = {
        "Brooklyn Nets": "New York City",
        "Golden State Warriors": "San Francisco Bay Area",
        "Utah Jazz": "Salt Lake City",
        "Minnesota Timberwolves": "Minneapolis–Saint Paul",
    }

    # unique() keeps table order, so ties resolve to the first listed area.
    city_names = list(metro_rules_NBA["City"].unique())

    def find_city_by_team(tname):
        """Return the area whose name starts with the longest word prefix of *tname*.

        Trying the longest prefix first lets "New Orleans Pelicans" resolve to
        "New Orleans" rather than to whichever "New ..." area is listed first,
        so no row-position patches are needed. Returns None when nothing
        matches or *tname* is not a non-empty string.
        """
        if not isinstance(tname, str):
            return None
        words = tname.split()
        for n_words in range(len(words), 0, -1):
            prefix = " ".join(words[:n_words])
            for city in city_names:
                if city.startswith(prefix):
                    return city
        return None

    season["City"] = [
        problematic_mapping[team]
        if team in problematic_mapping
        else find_city_by_team(team)
        for team in season["team"]
    ]

    # Average per area (some areas host two teams); groupby drops teams
    # that could not be assigned to any area.
    mean_wl = season.groupby("City", as_index=False)["W/L%"].mean()

    final = pd.merge(metro_rules_NBA, mean_wl, how="inner", on="City")

    correlation = stats.pearsonr(
        final["Population"].astype(float), final["W/L%"].astype(float)
    )
    return correlation[0]


  cities.loc[:, sport] = cities.loc[:, sport].replace("\[.*\]", "", regex=True)


In [4]:
# Run the analysis; the notebook echoes the returned correlation below.
nba_correlation()

np.float64(-0.17636350642182935)