In [7]:
%%capture
# RUN FIRST: installs the pinned lxml release into the interpreter that is
# running this kernel (sys.executable). pd.read_html, used by
# nhl_correlation below, fails with "ImportError: lxml not found" without it.
# The %%capture cell magic on the first line hides pip's console output.
import sys
!{sys.executable} -m pip install lxml==4.4.1

In [8]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import re

def nhl_correlation(nhl_df=None, cities=None):
    """Correlate metropolitan-area population with 2018 NHL win/loss ratio.

    Team rows for the 2018 season are mapped to their metropolitan area,
    wins and losses are summed per area, and the pooled ratio
    ``W / (W + L)`` is paired with the area's population.

    Parameters
    ----------
    nhl_df : pandas.DataFrame, optional
        Raw NHL standings with at least the columns ``team``, ``W``, ``L``
        and ``year``. When omitted, the assignment's ``nhl.csv`` is read.
    cities : pandas.DataFrame, optional
        Metropolitan-area table with at least the columns
        ``Metropolitan area`` and ``Population (2016 est.)[8]``. When
        omitted, it is read from the assignment's ``wikipedia_data.html``
        (second table, last row dropped).

    Returns
    -------
    The result of ``scipy.stats.pearsonr(population, win_loss_ratio)``.

    Raises
    ------
    AssertionError
        If the joined data does not contain exactly 28 metropolitan areas.
    """
    # Direct team -> metropolitan area lookup.
    team_to_metro = {
        'New York Islanders': 'New York City',
        'New York Rangers': 'New York City',
        'New Jersey Devils': 'New York City',
        'Los Angeles Kings': 'Los Angeles',
        'Anaheim Ducks': 'Los Angeles',
        'San Jose Sharks': 'San Francisco Bay Area',
        'Chicago Blackhawks': 'Chicago',
        'Dallas Stars': 'Dallas–Fort Worth',
        'Washington Capitals': 'Washington, D.C.',
        'Philadelphia Flyers': 'Philadelphia',
        'Boston Bruins': 'Boston',
        'Minnesota Wild': 'Minneapolis–Saint Paul',
        'Colorado Avalanche': 'Denver',
        'Florida Panthers': 'Miami–Fort Lauderdale',
        'Arizona Coyotes': 'Phoenix',
        'Detroit Red Wings': 'Detroit',
        'Toronto Maple Leafs': 'Toronto',
        'Tampa Bay Lightning': 'Tampa Bay Area',
        'Pittsburgh Penguins': 'Pittsburgh',
        'St. Louis Blues': 'St. Louis',
        'Nashville Predators': 'Nashville',
        'Buffalo Sabres': 'Buffalo',
        'Montreal Canadiens': 'Montreal',
        'Vancouver Canucks': 'Vancouver',
        'Columbus Blue Jackets': 'Columbus',
        'Calgary Flames': 'Calgary',
        'Ottawa Senators': 'Ottawa',
        'Edmonton Oilers': 'Edmonton',
        'Winnipeg Jets': 'Winnipeg',
        'Vegas Golden Knights': 'Las Vegas',
        'Carolina Hurricanes': 'Raleigh',
    }

    def clean_nhl_df(raw):
        """Return 2018 team rows with integer W/L and a metro-area column."""
        # The frame is passed in explicitly: assigning to an enclosing
        # function's variable inside a nested function makes it local and
        # raises UnboundLocalError on the first read.
        season_rows = raw[raw['year'] == 2018]

        # Rows whose W/L are not numeric (e.g. division header rows that
        # repeat the division name in every column) are dropped by content
        # rather than by hard-coded row labels.
        wins = pd.to_numeric(season_rows['W'], errors='coerce')
        losses = pd.to_numeric(season_rows['L'], errors='coerce')
        is_team_row = wins.notna() & losses.notna()

        teams = season_rows.loc[is_team_row].copy()
        # Remove the '*' marker attached to some team names.
        teams['team'] = teams['team'].str.replace('*', '', regex=False).str.strip()
        teams['W'] = wins[is_team_row].astype('int64')
        teams['L'] = losses[is_team_row].astype('int64')
        # Teams missing from the lookup get NaN and fall out of the groupby.
        teams['Metropolitan area'] = teams['team'].map(team_to_metro)
        return teams

    def clean_cities_df(raw):
        """Return integer population indexed by metropolitan area."""
        renamed = raw.rename(columns={'Population (2016 est.)[8]': 'Population'})
        typed = renamed.astype({'Population': 'int64'})
        return typed.set_index('Metropolitan area')['Population']

    if nhl_df is None:
        nhl_df = pd.read_csv('../assignments/assignment4/assets/nhl.csv')
    if cities is None:
        cities = pd.read_html("../assignments/assignment4/assets/wikipedia_data.html")[1]
        cities = cities.iloc[:-1, [0, 3, 5, 6, 7, 8]]

    nhl_clean = clean_nhl_df(nhl_df)

    # Total wins and losses per metropolitan area, then the pooled ratio.
    # The ratio must be computed from the grouped totals: the per-team frame
    # has a different index and would align to all-NaN.
    nhl_groups = nhl_clean.groupby('Metropolitan area')[['W', 'L']].sum()
    nhl_groups['Win/Loss ratio'] = nhl_groups['W'] / (nhl_groups['W'] + nhl_groups['L'])

    population = clean_cities_df(cities)

    # Keep only metropolitan areas present in both tables.
    resulting_df = pd.merge(left=population.to_frame(), right=nhl_groups,
                            how='inner', left_index=True, right_index=True)

    population_by_region = resulting_df['Population'].tolist()
    win_loss_by_region = resulting_df['Win/Loss ratio'].tolist()

    assert len(population_by_region) == len(win_loss_by_region), "Q1: Your lists must be the same length"
    assert len(population_by_region) == 28, "Q1: There should be 28 teams being analysed for NHL"

    return stats.pearsonr(population_by_region, win_loss_by_region)

In [9]:
# Run the analysis and print what nhl_correlation returns, i.e. the result of
# stats.pearsonr on metropolitan-area population vs. win/loss ratio.
print(nhl_correlation())

ImportError: lxml not found, please install it

In [None]:
!pip install lxml