### Import Dependencies

In [1]:
import warnings, os
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
from pandas.errors import EmptyDataError 
import numpy as np
from datetime import date

### Read CSV file.

In [2]:
# Load one player's single-season stats file as a sample, to inspect its layout.
df = pd.read_csv("../scraped_csv/players_stats_seasonal/8466138_20202021_stats.csv")
df

Unnamed: 0,season,stat.timeOnIce,stat.assists,stat.goals,stat.pim,stat.shots,stat.games,stat.hits,stat.powerPlayGoals,stat.powerPlayPoints,...,stat.shortHandedPoints,stat.shortHandedTimeOnIce,stat.blocked,stat.plusMinus,stat.points,stat.shifts,stat.timeOnIcePerGame,stat.evenTimeOnIcePerGame,stat.shortHandedTimeOnIcePerGame,stat.powerPlayTimeOnIcePerGame
0,20202021,603:41,15,5,14,42,44,32,0,5,...,0,00:08,10,6,20,794,13:43,11:47,00:00,01:55


In [3]:
# Show every column name of the sample file as a plain Python list,
# so we can decide which columns to drop and which to rename.

list(df.columns)

['season',
 'stat.timeOnIce',
 'stat.assists',
 'stat.goals',
 'stat.pim',
 'stat.shots',
 'stat.games',
 'stat.hits',
 'stat.powerPlayGoals',
 'stat.powerPlayPoints',
 'stat.powerPlayTimeOnIce',
 'stat.evenTimeOnIce',
 'stat.penaltyMinutes',
 'stat.faceOffPct',
 'stat.shotPct',
 'stat.gameWinningGoals',
 'stat.overTimeGoals',
 'stat.shortHandedGoals',
 'stat.shortHandedPoints',
 'stat.shortHandedTimeOnIce',
 'stat.blocked',
 'stat.plusMinus',
 'stat.points',
 'stat.shifts',
 'stat.timeOnIcePerGame',
 'stat.evenTimeOnIcePerGame',
 'stat.shortHandedTimeOnIcePerGame',
 'stat.powerPlayTimeOnIcePerGame']

### Let's transpose to see all the columns and values more easily.

In [4]:
# Display the transposed view only. `df` is deliberately NOT reassigned:
# `df = df.transpose()` would flip the frame back every second time the cell
# is re-run, leaving `df` in a state that depends on execution history.
df.transpose()

Unnamed: 0,0
season,20202021
stat.timeOnIce,603:41
stat.assists,15
stat.goals,5
stat.pim,14
stat.shots,42
stat.games,44
stat.hits,32
stat.powerPlayGoals,0
stat.powerPlayPoints,5


### Looking at the dataset, there are a couple of things we need to do:
- Delete the columns not fitting in the database schema
- Rename column names to align with database schema

So let's go ahead and do these tasks.

#### However, also note that we need to do this for every single player stats dataset and then combine all those cleaned datasets into one dataset to fit the database schema. Thus, we will make a single function to accomplish all these tasks.

In [5]:
# making function

def cleaning_dataset(dataframe):
    """Read one player-season stats CSV and return its rows as cleaned dicts.

    Parameters
    ----------
    dataframe : str or path-like
        Despite the name, this is the location of the CSV file, passed
        straight to ``pd.read_csv``.

    Returns
    -------
    list of dict
        One dict per CSV row (``to_dict(orient='records')``), with the
        unwanted columns removed and the known ``stat.*`` columns renamed.

    Notes
    -----
    Columns that are neither dropped nor listed in the rename map keep their
    original names; the notebook output shows keys such as ``'stat.ot'``
    surviving for some players.
    NOTE(review): confirm whether those extra columns should also be mapped
    to database-schema names.

    Errors raised by ``pd.read_csv`` (e.g. ``EmptyDataError`` for an empty
    file) are not handled here and propagate to the caller.
    """
    # Columns that are not kept in the cleaned output.
    unwanted = ['season', 'stat.shotPct', 'stat.shifts', 'stat.pim']

    # Source column name -> cleaned column name.
    schema_names = {
        'stat.timeOnIce': 'timeOnIce',
        'stat.assists': 'assists',
        'stat.goals': 'goals',
        'stat.shots': 'shots',
        'stat.games': 'gamesPlayed',
        'stat.hits': 'hits',
        'stat.powerPlayGoals': 'powerPlayGoals',
        'stat.powerPlayPoints': 'powerPlayPoints',
        'stat.powerPlayTimeOnIce': 'powerPlayTimeOnIce',
        'stat.evenTimeOnIce': 'evenTimeOnIce',
        'stat.penaltyMinutes': 'penaltyMinutes',
        'stat.faceOffPct': 'faceOffPct',
        'stat.gameWinningGoals': 'gameWinningGoals',
        'stat.overTimeGoals': 'overTimeGoals',
        'stat.shortHandedGoals': 'shortHandedGoals',
        'stat.shortHandedPoints': 'shortHandedPoints',
        'stat.shortHandedTimeOnIce': 'shortHandedTimeOnIce',
        'stat.blocked': 'shotsBlocked',
        'stat.plusMinus': 'plusMinus',
        'stat.points': 'points',
        'stat.timeOnIcePerGame': 'timeOnIcePerGame',
        'stat.evenTimeOnIcePerGame': 'evenTimeOnIcePerGame',
        'stat.shortHandedTimeOnIcePerGame': 'shortHandedTimeOnIcePerGame',
        'stat.powerPlayTimeOnIcePerGame': 'powerPlayTimeOnIcePerGame',
    }

    cleaned = (
        pd.read_csv(dataframe)
        # errors='ignore': not every file has every unwanted column.
        .drop(columns=unwanted, errors='ignore')
        .rename(columns=schema_names)
    )
    return cleaned.to_dict(orient='records')


In [6]:
# Try the function on the sample file; it returns a list holding one dict per CSV row.
x = cleaning_dataset('../scraped_csv/players_stats_seasonal/8466138_20202021_stats.csv')
x

[{'timeOnIce': '603:41',
  'assists': 15,
  'goals': 5,
  'shots': 42,
  'gamesPlayed': 44,
  'hits': 32,
  'powerPlayGoals': 0,
  'powerPlayPoints': 5,
  'powerPlayTimeOnIce': '85:03',
  'evenTimeOnIce': '518:30',
  'penaltyMinutes': 14,
  'faceOffPct': 57.35,
  'gameWinningGoals': 0,
  'overTimeGoals': 0,
  'shortHandedGoals': 0,
  'shortHandedPoints': 0,
  'shortHandedTimeOnIce': '00:08',
  'shotsBlocked': 10,
  'plusMinus': 6,
  'points': 20,
  'timeOnIcePerGame': '13:43',
  'evenTimeOnIcePerGame': '11:47',
  'shortHandedTimeOnIcePerGame': '00:00',
  'powerPlayTimeOnIcePerGame': '01:55'}]

### Now, we can make a loop to go through all of the stats datasets, apply the function to all of them, and then concatenate all the results into one csv file.

#### This will be done by looping through all the datasets such that the function:
- Applies modifications to each dataset
- Merges all the datasets together with their year number to fit database schema.

In [7]:
# Build the season labels (e.g. '20162017') for every season that starts in
# 2016 or later and before the current calendar year.
year = date.today().year
years = list(range(2016, year))
seasons = [f'{start}{start + 1}' for start in years]
print(seasons)

# Player ids: the text before the first underscore of each file name found
# in the players_details directory.
stats_datasets_players = [
    file_name.split("_")[0]
    for file_name in os.listdir("../scraped_csv/players_details/")
]
print(stats_datasets_players)

['20162017', '20172018', '20182019', '20192020', '20202021', '20212022', '20222023']
['8466138', '8466139', '8468493', '8468575', '8469455', '8469521', '8470147', '8470599', '8470611', '8470619', '8470966', '8471392', '8471436', '8471817', '8473415', '8473422', '8473463', '8473523', '8474037', '8474062', '8474162', '8474190', '8474567', '8474568', '8474581', '8474589', '8474636', '8474673', '8474709', '8474727', '8474818', '8474889', '8475098', '8475158', '8475160', '8475166', '8475172', '8475180', '8475197', '8475278', '8475714', '8475716', '8475717', '8475718', '8475786', '8475789', '8475844', '8475852', '8475857', '8475883', '8475906', '8476278', '8476289', '8476302', '8476329', '8476343', '8476406', '8476410', '8476474', '8476495', '8476851', '8476853', '8476857', '8476879', '8476899', '8476918', '8476931', '8476941', '8476979', '8477015', '8477018', '8477021', '8477149', '8477312', '8477341', '8477464', '8477472', '8477479', '8477503', '8477512', '8477939', '8477941', '8477953', '

In [8]:
# Clean every available player/season stats file and gather the results as
# {player id: {season: cleaned stats dict}}.
all_players_stats = {}
for player in stats_datasets_players:
    current_player_stats = {}
    for season in seasons:
        stats_path = f"../scraped_csv/players_stats_seasonal/{player}_{season}_stats.csv"
        # No file for this player/season: leave the season out entirely.
        if not os.path.isfile(stats_path):
            continue
        try:
            # Only the first record of the file is kept.
            current_player_stats[season] = cleaning_dataset(stats_path)[0]
        except EmptyDataError:
            # The file exists but is empty: store a placeholder instead.
            current_player_stats[season] = {'stats': 'no stats for this year'}
    all_players_stats[player] = current_player_stats

# One column per player, one row per season; seasons a player lacks become NaN.
df = pd.DataFrame({
    player_id: pd.Series(season_stats)
    for player_id, season_stats in all_players_stats.items()
})
df

Unnamed: 0,8466138,8466139,8468493,8468575,8469455,8469521,8470147,8470599,8470611,8470619,...,8482222,8482241,8482247,8482259,8482634,8482720,8482815,8483183,8483489,8484158
20162017,,,,,,,"{'timeOnIce': '1134:39', 'stat.ot': 2, 'stat.s...","{'timeOnIce': '71:21', 'assists': 1, 'goals': ...","{'timeOnIce': '579:11', 'assists': 5, 'goals':...","{'timeOnIce': '977:29', 'assists': 12, 'goals'...",...,,,,,,,,,,
20172018,,"{'timeOnIce': '1393:00', 'assists': 20, 'goals...","{'timeOnIce': '1749:11', 'assists': 19, 'goals...","{'timeOnIce': '514:52', 'assists': 6, 'goals':...",,"{'timeOnIce': '1165:29', 'assists': 20, 'goals...","{'timeOnIce': '979:07', 'stat.ot': 1, 'stat.sh...",,"{'timeOnIce': '212:04', 'assists': 1, 'goals':...",,...,,,,,,,,,,
20182019,,"{'timeOnIce': '1340:15', 'assists': 21, 'goals...","{'timeOnIce': '1640:44', 'assists': 18, 'goals...",,,,,,,,...,,,,,,,,,,
20192020,,,,,"{'timeOnIce': '628:05', 'assists': 16, 'goals'...",,,,,,...,,,,,,,,,,
20202021,"{'timeOnIce': '603:41', 'assists': 15, 'goals'...",,,,"{'timeOnIce': '595:35', 'assists': 20, 'goals'...",,,,,,...,"{'timeOnIce': '270:52', 'assists': 5, 'goals':...",,"{'timeOnIce': '412:54', 'assists': 6, 'goals':...",,,,,,,
20212022,,,,,"{'timeOnIce': '762:58', 'assists': 13, 'goals'...",,,,,,...,,,,,"{'timeOnIce': '23:55', 'assists': 1, 'goals': ...",,"{'timeOnIce': '30:07', 'assists': 0, 'goals': ...",,,
20222023,,,,,,,,,,,...,,"{'timeOnIce': '85:45', 'assists': 0, 'goals': ...",,"{'timeOnIce': '105:43', 'assists': 1, 'goals':...","{'timeOnIce': '23:32', 'assists': 0, 'goals': ...","{'timeOnIce': '39:20', 'assists': 1, 'goals': ...",,"{'timeOnIce': '01:10', 'stat.ot': 0, 'stat.shu...",,


In [9]:
# Replace NaN (seasons with no stats for a player) with the 'No Stats' placeholder.
# Uses np.nan: the np.NaN alias was removed in NumPy 2.0 and raises AttributeError there.
df = df.replace(np.nan, 'No Stats')
df

Unnamed: 0,8466138,8466139,8468493,8468575,8469455,8469521,8470147,8470599,8470611,8470619,...,8482222,8482241,8482247,8482259,8482634,8482720,8482815,8483183,8483489,8484158
20162017,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,"{'timeOnIce': '1134:39', 'stat.ot': 2, 'stat.s...","{'timeOnIce': '71:21', 'assists': 1, 'goals': ...","{'timeOnIce': '579:11', 'assists': 5, 'goals':...","{'timeOnIce': '977:29', 'assists': 12, 'goals'...",...,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats
20172018,No Stats,"{'timeOnIce': '1393:00', 'assists': 20, 'goals...","{'timeOnIce': '1749:11', 'assists': 19, 'goals...","{'timeOnIce': '514:52', 'assists': 6, 'goals':...",No Stats,"{'timeOnIce': '1165:29', 'assists': 20, 'goals...","{'timeOnIce': '979:07', 'stat.ot': 1, 'stat.sh...",No Stats,"{'timeOnIce': '212:04', 'assists': 1, 'goals':...",No Stats,...,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats
20182019,No Stats,"{'timeOnIce': '1340:15', 'assists': 21, 'goals...","{'timeOnIce': '1640:44', 'assists': 18, 'goals...",No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,...,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats
20192020,No Stats,No Stats,No Stats,No Stats,"{'timeOnIce': '628:05', 'assists': 16, 'goals'...",No Stats,No Stats,No Stats,No Stats,No Stats,...,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats
20202021,"{'timeOnIce': '603:41', 'assists': 15, 'goals'...",No Stats,No Stats,No Stats,"{'timeOnIce': '595:35', 'assists': 20, 'goals'...",No Stats,No Stats,No Stats,No Stats,No Stats,...,"{'timeOnIce': '270:52', 'assists': 5, 'goals':...",No Stats,"{'timeOnIce': '412:54', 'assists': 6, 'goals':...",No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats
20212022,No Stats,No Stats,No Stats,No Stats,"{'timeOnIce': '762:58', 'assists': 13, 'goals'...",No Stats,No Stats,No Stats,No Stats,No Stats,...,No Stats,No Stats,No Stats,No Stats,"{'timeOnIce': '23:55', 'assists': 1, 'goals': ...",No Stats,"{'timeOnIce': '30:07', 'assists': 0, 'goals': ...",No Stats,No Stats,No Stats
20222023,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,...,No Stats,"{'timeOnIce': '85:45', 'assists': 0, 'goals': ...",No Stats,"{'timeOnIce': '105:43', 'assists': 1, 'goals':...","{'timeOnIce': '23:32', 'assists': 0, 'goals': ...","{'timeOnIce': '39:20', 'assists': 1, 'goals': ...",No Stats,"{'timeOnIce': '01:10', 'stat.ot': 0, 'stat.shu...",No Stats,No Stats


### Let's finally export the cleaned dataframe into a csv file

In [10]:
# Destination file for the combined, cleaned player stats.
title = '../cleaned_csv/playersAllStatsCleaned.csv'
# index=False: the row index (the season labels) is not written to the file,
# so anything reading the CSV back has to restore those labels itself.
df.to_csv(title, index=False)

In [11]:
# how to get csv back into dataframe
#
# The file was exported with index=False, so the rows come back numbered
# 0..n-1 and the season labels have to be re-attached by position.

a = pd.read_csv('../cleaned_csv/playersAllStatsCleaned.csv')
b = a.to_dict(orient='list')
c = pd.DataFrame.from_dict(b, orient='columns')

# Row position -> season label, derived from `seasons` instead of a hard-coded
# 0..6 mapping, so it keeps working when the number of seasons changes.
# NOTE(review): assumes the exported rows are in the same order as `seasons`
# and that every season has a row -- confirm against the export step.
rows = dict(enumerate(seasons))
c = c.rename(index=rows)
c

Unnamed: 0,8466138,8466139,8468493,8468575,8469455,8469521,8470147,8470599,8470611,8470619,...,8482222,8482241,8482247,8482259,8482634,8482720,8482815,8483183,8483489,8484158
20162017,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,"{'timeOnIce': '1134:39', 'stat.ot': 2, 'stat.s...","{'timeOnIce': '71:21', 'assists': 1, 'goals': ...","{'timeOnIce': '579:11', 'assists': 5, 'goals':...","{'timeOnIce': '977:29', 'assists': 12, 'goals'...",...,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats
20172018,No Stats,"{'timeOnIce': '1393:00', 'assists': 20, 'goals...","{'timeOnIce': '1749:11', 'assists': 19, 'goals...","{'timeOnIce': '514:52', 'assists': 6, 'goals':...",No Stats,"{'timeOnIce': '1165:29', 'assists': 20, 'goals...","{'timeOnIce': '979:07', 'stat.ot': 1, 'stat.sh...",No Stats,"{'timeOnIce': '212:04', 'assists': 1, 'goals':...",No Stats,...,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats
20182019,No Stats,"{'timeOnIce': '1340:15', 'assists': 21, 'goals...","{'timeOnIce': '1640:44', 'assists': 18, 'goals...",No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,...,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats
20192020,No Stats,No Stats,No Stats,No Stats,"{'timeOnIce': '628:05', 'assists': 16, 'goals'...",No Stats,No Stats,No Stats,No Stats,No Stats,...,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats
20202021,"{'timeOnIce': '603:41', 'assists': 15, 'goals'...",No Stats,No Stats,No Stats,"{'timeOnIce': '595:35', 'assists': 20, 'goals'...",No Stats,No Stats,No Stats,No Stats,No Stats,...,"{'timeOnIce': '270:52', 'assists': 5, 'goals':...",No Stats,"{'timeOnIce': '412:54', 'assists': 6, 'goals':...",No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats
20212022,No Stats,No Stats,No Stats,No Stats,"{'timeOnIce': '762:58', 'assists': 13, 'goals'...",No Stats,No Stats,No Stats,No Stats,No Stats,...,No Stats,No Stats,No Stats,No Stats,"{'timeOnIce': '23:55', 'assists': 1, 'goals': ...",No Stats,"{'timeOnIce': '30:07', 'assists': 0, 'goals': ...",No Stats,No Stats,No Stats
20222023,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,No Stats,...,No Stats,"{'timeOnIce': '85:45', 'assists': 0, 'goals': ...",No Stats,"{'timeOnIce': '105:43', 'assists': 1, 'goals':...","{'timeOnIce': '23:32', 'assists': 0, 'goals': ...","{'timeOnIce': '39:20', 'assists': 1, 'goals': ...",No Stats,"{'timeOnIce': '01:10', 'stat.ot': 0, 'stat.shu...",No Stats,No Stats
