### Import Dependencies

In [135]:
import ast
import os
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import pandas as pd

### Read CSV file.

In [136]:
# Load a single season's raw team stats to inspect the layout of the scraped data.
sample_stats_csv = "../scraped_csv/team_stats/20162017_team_stats.csv"
df = pd.read_csv(sample_stats_csv)
df

Unnamed: 0,stat.gamesPlayed,stat.wins,stat.losses,stat.ot,stat.pts,stat.ptPctg,stat.goalsPerGame,stat.goalsAgainstPerGame,stat.evGGARatio,stat.powerPlayPercentage,...,stat.faceOffsLost,stat.faceOffWinPercentage,stat.shootingPctg,stat.savePctg,team.id,team.name,team.link,stat.penaltyKillOpportunities,stat.savePctRank,stat.shootingPctRank
0,82.0,40,27,15,95,57.9,3.049,2.854,1.0497,23.8,...,2537.0,49.9,9.6,0.912,10,Toronto Maple Leafs,/api/v1/teams/10,,,
1,,18th,9th,1st,13th,13th,5th,22nd,13th,2nd,...,23rd,15th,,,10,Toronto Maple Leafs,/api/v1/teams/10,19th,11th,9th


### Let's transpose the dataframe to see all the columns and values more easily.

In [137]:
# Show a transposed view without overwriting `df`, so that re-running this
# cell always displays the same thing instead of flipping the frame back.
df_transposed = df.transpose()
df_transposed

Unnamed: 0,0,1
stat.gamesPlayed,82.0,
stat.wins,40,18th
stat.losses,27,9th
stat.ot,15,1st
stat.pts,95,13th
stat.ptPctg,57.9,13th
stat.goalsPerGame,3.049,5th
stat.goalsAgainstPerGame,2.854,22nd
stat.evGGARatio,1.0497,13th
stat.powerPlayPercentage,23.8,2nd


### Looking at dataset, there are a couple of things we need to do:
- Remove the .0 at the end of the `stat.gamesPlayed` values (convert them from float to int)
- Delete the columns not fitting in the database schema
- Rename column names to align with database schema
- Copy the `stat.shootingPctRank` and `stat.savePctRank` values into the rankings row of the `stat.shootingPctg` and `stat.savePctg` columns, respectively

So let's go ahead and do these tasks.

#### However, also note that we need to do this for every single team_stats dataset and then combine all those cleaned datasets into one dataset to fit the database schema. Thus, we will make a single function to accomplish all these tasks.

In [138]:
# making function

def cleaning_dataset(dataframe):
    """Clean one season's scraped team-stats CSV and reshape it for the database.

    The CSV is read as two rows: row 0 holds the stat values and row 1 holds
    the rankings for those stats.

    Parameters
    ----------
    dataframe : str, path-like or file-like
        Location of the CSV file. Despite the name, this is not a DataFrame;
        it is handed straight to ``pd.read_csv``.

    Returns
    -------
    dict
        ``{'stats': {...}, 'ranks': {...}}`` where each inner dict maps a
        renamed column to its value in row 0 (stats) or row 1 (ranks).

    Raises
    ------
    KeyError
        If an expected column (``stat.gamesPlayed``, the rank columns or any
        column listed for dropping) or the ranks row is missing.
    """
    df = pd.read_csv(dataframe)

    # Missing cells become 0 so that gamesPlayed (empty in the ranks row) can
    # be cast from float to int, which removes the trailing ".0".
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    df = df.replace(np.nan, 0)
    df = df.astype({'stat.gamesPlayed': np.int64})

    # The rankings for `stat.shootingPctg` and `stat.savePctg` are stored in
    # the separate `stat.shootingPctRank` / `stat.savePctRank` columns; copy
    # them into the ranks row (index 1) of the matching stat column.
    rank_source_by_stat = {
        'stat.shootingPctg': 'stat.shootingPctRank',
        'stat.savePctg': 'stat.savePctRank',
    }
    # Cast to object first: writing a rank string such as '9th' into a float
    # column is an incompatible-dtype assignment that pandas has deprecated.
    df = df.astype({stat_col: object for stat_col in rank_source_by_stat})
    for stat_col, rank_col in rank_source_by_stat.items():
        df.loc[1, stat_col] = df.loc[1, rank_col]

    # dropping columns that are not part of the database schema
    dropped_columns = [
        'stat.evGGARatio',
        'team.id',
        'team.name',
        'team.link',
        'stat.penaltyKillOpportunities',
        'stat.savePctRank',
        'stat.shootingPctRank'
    ]
    df = df.drop(columns=dropped_columns)

    # renaming columns to the names used by the database schema
    # (`stat.penaltyKillOpportunities` is not listed: it was dropped above)
    renamed_columns = {
        'stat.gamesPlayed': 'gamesPlayed',
        'stat.wins': 'gamesWon',
        'stat.losses': 'gamesLost',
        'stat.ot': 'gamesOT',
        'stat.pts': 'points',
        'stat.ptPctg': 'pointsPct',
        'stat.goalsPerGame': 'goalsPerGame',
        'stat.goalsAgainstPerGame': 'goalsAgainstPerGame',
        'stat.powerPlayPercentage': 'powerPlayPct',
        'stat.powerPlayGoals': 'powerPlayGoals',
        'stat.powerPlayGoalsAgainst': 'powerPlayGoalsAgainst',
        'stat.powerPlayOpportunities': 'powerPlayOpportunities',
        'stat.penaltyKillPercentage': 'penaltyKillPercentage',
        'stat.shotsPerGame': 'shotsPerGame',
        'stat.shotsAllowed': 'shotsAllowed',
        'stat.winScoreFirst': 'winWhenScoreFirst',
        'stat.winOppScoreFirst': 'winWhenOppScoreFirst',
        'stat.winLeadFirstPer': 'winWhenLeadingFirstPer',
        'stat.winLeadSecondPer': 'winWhenLeadingSecondPer',
        'stat.winOutshootOpp': 'winWhenOutshootingOpp',
        'stat.winOutshotByOpp': 'winWhenOutshotByOpp',
        'stat.faceOffsTaken': 'faceOffsTaken',
        'stat.faceOffsWon': 'faceOffsWon',
        'stat.faceOffsLost': 'faceOffsLost',
        'stat.faceOffWinPercentage': 'faceOffWinPercentage',
        'stat.shootingPctg': 'shootingPctg',
        'stat.savePctg': 'savePctg',
    }
    df = df.rename(columns=renamed_columns)

    # making final result fit database schema: one dict per row, built once
    records = df.to_dict(orient='records')
    stats = {
        'stats': records[0],
        'ranks': records[1]
    }

    return stats


### Now, we can make a loop to go through all of the stats datasets, apply the function to all of them, and then concatenate all the results into one csv file.

#### This will be done by looping through all the datasets such that the function:
- Applies modifications to each dataset
- Merges all the datasets together with their year number to fit database schema.

In [139]:
# Get the season prefix (e.g. "20162017") of every team stats dataset.
# os.listdir returns entries in arbitrary order and also lists unrelated
# entries in the folder, so keep only the "<season>_team_stats.csv" files and
# sort them so the seasons always come out in chronological order.
stats_datasets_seasons = sorted(
    file_name.split("_")[0]
    for file_name in os.listdir("../scraped_csv/team_stats/")
    if file_name.endswith("_team_stats.csv")
)
stats_datasets_seasons

['20162017',
 '20172018',
 '20182019',
 '20192020',
 '20202021',
 '20212022',
 '20222023',
 '20232024']

In [140]:
# Clean every season's team stats dataset and key the result by season.
all_stats = {}
for season in stats_datasets_seasons:
    season_csv = f"../scraped_csv/team_stats/{season}_team_stats.csv"
    all_stats[season] = cleaning_dataset(season_csv)

# One column per season; the rows are the 'stats' and 'ranks' dicts.
season_columns = {key: pd.Series(value) for key, value in all_stats.items()}
df = pd.DataFrame(season_columns)
df

Unnamed: 0,20162017,20172018,20182019,20192020,20202021,20212022,20222023,20232024
stats,"{'gamesPlayed': 82, 'gamesWon': '40', 'gamesLo...","{'gamesPlayed': 82, 'gamesWon': '49', 'gamesLo...","{'gamesPlayed': 82, 'gamesWon': '46', 'gamesLo...","{'gamesPlayed': 70, 'gamesWon': '36', 'gamesLo...","{'gamesPlayed': 56, 'gamesWon': '35', 'gamesLo...","{'gamesPlayed': 82, 'gamesWon': '54', 'gamesLo...","{'gamesPlayed': 82, 'gamesWon': '50', 'gamesLo...","{'gamesPlayed': 0, 'gamesWon': '0', 'gamesLost..."
ranks,"{'gamesPlayed': 0, 'gamesWon': '18th', 'gamesL...","{'gamesPlayed': 0, 'gamesWon': '6th', 'gamesLo...","{'gamesPlayed': 0, 'gamesWon': '10th', 'gamesL...","{'gamesPlayed': 0, 'gamesWon': '14th', 'gamesL...","{'gamesPlayed': 0, 'gamesWon': '8th', 'gamesLo...","{'gamesPlayed': 0, 'gamesWon': '4th', 'gamesLo...","{'gamesPlayed': 0, 'gamesWon': '6th', 'gamesLo...","{'gamesPlayed': 0, 'gamesWon': '28th', 'gamesL..."


### Let's finally export the cleaned dataframe into a csv file

In [141]:
# Destination file for the combined per-season stats table.
title = '../cleaned_csv/teamAllStatsDetailsCleaned.csv'
# index=False leaves the 'stats' / 'ranks' row labels out of the file, so they
# have to be re-attached by whoever reads it back.
df.to_csv(title, index=False)

In [142]:
# how to get csv back into dataframe

# Every cell was written to the CSV as the text form of a dict, so read_csv
# hands back plain strings; ast.literal_eval safely turns each one back into
# a real dict.
a = pd.read_csv('../cleaned_csv/teamAllStatsDetailsCleaned.csv')
c = a.apply(lambda season_column: season_column.map(ast.literal_eval))
# The file holds no row labels, so restore them: row 0 is the stats, row 1 the ranks.
c = c.rename(index={0: 'stats', 1: 'ranks'})
c

Unnamed: 0,20162017,20172018,20182019,20192020,20202021,20212022,20222023,20232024
stats,"{'gamesPlayed': 82, 'gamesWon': '40', 'gamesLo...","{'gamesPlayed': 82, 'gamesWon': '49', 'gamesLo...","{'gamesPlayed': 82, 'gamesWon': '46', 'gamesLo...","{'gamesPlayed': 70, 'gamesWon': '36', 'gamesLo...","{'gamesPlayed': 56, 'gamesWon': '35', 'gamesLo...","{'gamesPlayed': 82, 'gamesWon': '54', 'gamesLo...","{'gamesPlayed': 82, 'gamesWon': '50', 'gamesLo...","{'gamesPlayed': 0, 'gamesWon': '0', 'gamesLost..."
ranks,"{'gamesPlayed': 0, 'gamesWon': '18th', 'gamesL...","{'gamesPlayed': 0, 'gamesWon': '6th', 'gamesLo...","{'gamesPlayed': 0, 'gamesWon': '10th', 'gamesL...","{'gamesPlayed': 0, 'gamesWon': '14th', 'gamesL...","{'gamesPlayed': 0, 'gamesWon': '8th', 'gamesLo...","{'gamesPlayed': 0, 'gamesWon': '4th', 'gamesLo...","{'gamesPlayed': 0, 'gamesWon': '6th', 'gamesLo...","{'gamesPlayed': 0, 'gamesWon': '28th', 'gamesL..."
