<h1>NBA correlation problem</h1>

In [None]:
def nba_correlation(nba_stats=None, cities=None):
    """Correlate metro-area population with 2018 NBA win/loss ratio.

    Every 2018 NBA team is assigned to a metropolitan area, the W/L% of
    teams sharing an area is averaged, and the Pearson correlation between
    area population and that average is returned.

    Parameters
    ----------
    nba_stats : pandas.DataFrame, optional
        Team statistics with at least the columns ``team``, ``W/L%`` and
        ``year``.  When omitted, ``assets/nba.csv`` is read.
    cities : pandas.DataFrame, optional
        Raw metropolitan-area table.  Its last row is discarded and the
        columns at positions 0, 3, 5, 6, 7 and 8 are kept; among those,
        ``Metropolitan area``, ``Population (2016 est.)[8]`` and ``NBA``
        must be present.  When omitted, the second table found in
        ``assets/wikipedia_data.html`` is used.

    Returns
    -------
    float
        Pearson correlation coefficient between population and mean W/L%.

    Notes
    -----
    Neither input frame is modified.  A team that cannot be matched to a
    metropolitan area gets a missing city and is therefore left out of the
    per-city averages.
    """
    import pandas as pd
    import numpy as np
    import scipy.stats as stats
    import re

    # Load the data only when the caller did not supply it.
    if nba_stats is None:
        nba_stats = pd.read_csv("assets/nba.csv")
    if cities is None:
        cities = pd.read_html("assets/wikipedia_data.html")[1]
    # Drop the trailing row and keep only the name, population and
    # per-league team columns.
    cities = cities.iloc[:-1, [0, 3, 5, 6, 7, 8]]

    def clean_up_cities_rules(cities, sport):
        """Return City / Population / <sport> rows for areas with a team."""
        renamed = cities.rename(
            columns={
                "Population (2016 est.)[8]": "Population",
                "Metropolitan area": "City",
            }
        )
        # Work on an explicit copy so the caller's frame is never touched
        # and pandas has no chained-assignment ambiguity to warn about.
        subset = renamed[["City", "Population", sport]].copy()
        subset[sport] = (
            subset[sport]
            .replace("—", np.nan)  # em dash marks "no team in this league"
            .replace(r"\[.*\]", "", regex=True)  # strip footnote markers
            .replace("", np.nan)  # cells that held nothing but a footnote
        )
        return subset.dropna()

    def clean_nba_name(name):
        """Strip everything from the first '*' or '(' onwards."""
        return re.sub(r"[*(].*", "", name).strip()

    metro_rules_NBA = clean_up_cities_rules(cities, "NBA")

    # Keep only the 2018 season, and from it only the team and its W/L%.
    season = nba_stats.loc[nba_stats["year"] == 2018, ["team", "W/L%"]].copy()
    season["team"] = season["team"].apply(clean_nba_name)
    season["W/L%"] = season["W/L%"].astype(float)

    # Teams whose name does not begin with the name of their metro area.
    problematic_mapping = {
        "Brooklyn Nets": "New York City",
        "Golden State Warriors": "San Francisco Bay Area",
        "Utah Jazz": "Salt Lake City",
        "Minnesota Timberwolves": "Minneapolis–Saint Paul",
    }

    nba_cities = metro_rules_NBA["City"].tolist()

    def find_city_by_team(tname):
        """Return the metro area for a team name, or NaN when none fits."""
        if tname in problematic_mapping:
            return problematic_mapping[tname]
        words = tname.split()
        # Try the longest leading run of words first, so "New Orleans
        # Pelicans" is tested against "New Orleans" before the ambiguous
        # "New", and "San Antonio Spurs" against "San Antonio" before "San".
        # This removes the need to patch rows by their index afterwards.
        for n_words in range(len(words), 0, -1):
            prefix = " ".join(words[:n_words])
            for city in nba_cities:
                if city.startswith(prefix):
                    return city
        return np.nan

    season["City"] = season["team"].map(find_city_by_team)

    # Average the teams that share a metro area (e.g. Knicks and Nets).
    mean_wl = season.groupby("City", as_index=False)["W/L%"].mean()
    final = pd.merge(metro_rules_NBA, mean_wl, how="inner", on="City")

    correlation = stats.pearsonr(
        final["Population"].astype(float), final["W/L%"].astype(float)
    )

    return correlation[0]
    

In [24]:
nba_correlation()

-0.17636350642182938