### Import Dependencies

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
import os

### Read CSV file.

In [2]:
# Load a single player's details file as a sample to inspect the raw columns.
sample_details_csv = "../scraped_csv/players_details/8466138_player_details.csv"
df = pd.read_csv(sample_details_csv)
df

Unnamed: 0,id,fullName,link,firstName,lastName,primaryNumber,birthDate,birthCity,birthStateProvince,birthCountry,...,height,weight,active,rookie,shootsCatches,rosterStatus,primaryPosition.code,primaryPosition.name,primaryPosition.type,primaryPosition.abbreviation
0,8466138,Joe Thornton,/api/v1/people/8466138,Joe,Thornton,19,1979-07-02,London,ON,CAN,...,"6' 4""",220,False,False,L,Y,C,Center,Forward,C


### Let's transpose the dataframe to see all the columns and values more easily.

In [3]:
# Swap rows and columns so each field of the single player row is listed vertically.
# Note: this rebinds `df`, so running the cell a second time flips it back.
df = df.T
df

Unnamed: 0,0
id,8466138
fullName,Joe Thornton
link,/api/v1/people/8466138
firstName,Joe
lastName,Thornton
primaryNumber,19
birthDate,1979-07-02
birthCity,London
birthStateProvince,ON
birthCountry,CAN


### Looking at the dataset, there are a couple of things we need to do:
- Delete the columns not fitting in the database schema
- Rename column names to align with database schema

So let's go ahead and do these tasks.

#### However, also note that we need to do this for every single player details dataset and then combine all those cleaned datasets into one dataset to fit the database schema. Thus, we will make a single function to accomplish all these tasks.

In [4]:
# making function

def cleaning_dataset(dataframe):
    """Load one player-details CSV and return its cleaned rows.

    Parameters
    ----------
    dataframe : str or path-like
        Despite the name, this value is handed straight to ``pd.read_csv``,
        so it is the location of the CSV file rather than a loaded DataFrame.

    Returns
    -------
    list of dict
        One dict per CSV row (``to_dict(orient='records')``), with the
        unwanted columns removed and the remaining ones renamed.
    """
    # columns that do not fit the database schema
    unwanted = [
        'link',
        'birthCity',
        'birthStateProvince',
        'birthCountry',
        'rookie',
        'primaryPosition.code',
        'primaryPosition.type',
        'primaryPosition.abbreviation',
    ]

    # scraped column name -> database column name
    schema_names = {
        'id': 'apiID',
        'fullName': 'name',
        'primaryNumber': 'jerseyNumber',
        'primaryPosition.name': 'position',
    }

    cleaned = (
        pd.read_csv(dataframe)
        # errors='ignore': a file lacking some of these columns is not an error
        .drop(columns=unwanted, errors='ignore')
        .rename(columns=schema_names)
    )
    return cleaned.to_dict(orient='records')


### Now, we can make a loop to go through all of the player details datasets, apply the function to all of them, and then combine all the results into one csv file.

#### This will be done by looping through all the datasets such that the function:
- Applies modifications to each dataset
- Merges all the datasets together, keyed by player ID, to fit the database schema.

In [5]:
# Get the list of all player IDs that have a details file.
# Only names ending in "_player_details.csv" are considered, so stray directory
# entries (e.g. checkpoint folders or OS metadata files) cannot yield a bogus ID
# that the loading cell would then fail to open. os.listdir returns entries in
# arbitrary order, so the IDs are sorted to keep the result reproducible.
details_datasets_players = sorted(
    file_name.split("_")[0]
    for file_name in os.listdir("../scraped_csv/players_details/")
    if file_name.endswith("_player_details.csv")
)
details_datasets_players

['8466138',
 '8466139',
 '8468493',
 '8468575',
 '8469455',
 '8469521',
 '8470147',
 '8470599',
 '8470611',
 '8470619',
 '8470966',
 '8471392',
 '8471436',
 '8471817',
 '8473415',
 '8473422',
 '8473463',
 '8473523',
 '8474037',
 '8474062',
 '8474162',
 '8474190',
 '8474567',
 '8474568',
 '8474581',
 '8474589',
 '8474636',
 '8474673',
 '8474709',
 '8474727',
 '8474818',
 '8474889',
 '8475098',
 '8475158',
 '8475160',
 '8475166',
 '8475172',
 '8475180',
 '8475197',
 '8475278',
 '8475714',
 '8475716',
 '8475717',
 '8475718',
 '8475786',
 '8475789',
 '8475844',
 '8475852',
 '8475857',
 '8475883',
 '8475906',
 '8476278',
 '8476289',
 '8476302',
 '8476329',
 '8476343',
 '8476406',
 '8476410',
 '8476474',
 '8476495',
 '8476851',
 '8476853',
 '8476857',
 '8476879',
 '8476899',
 '8476918',
 '8476931',
 '8476941',
 '8476979',
 '8477015',
 '8477018',
 '8477021',
 '8477149',
 '8477312',
 '8477341',
 '8477464',
 '8477472',
 '8477479',
 '8477503',
 '8477512',
 '8477939',
 '8477941',
 '8477953',
 '84

In [6]:
# Clean every player details dataset and collect the records per player ID.
all_players = {}
for player in details_datasets_players:
    player_csv = f"../scraped_csv/players_details/{player}_player_details.csv"
    all_players[f'{player}'] = cleaning_dataset(player_csv)

# One column per player ID; each column is a Series built from that player's
# list of record dicts, so every cell holds a whole record dict.
player_columns = {player_id: pd.Series(records) for player_id, records in all_players.items()}
df = pd.DataFrame(player_columns)
df

Unnamed: 0,8466138,8466139,8468493,8468575,8469455,8469521,8470147,8470599,8470611,8470619,...,8482222,8482241,8482247,8482259,8482634,8482720,8482815,8483183,8483489,8484158
0,"{'apiID': 8466138, 'name': 'Joe Thornton', 'fi...","{'apiID': 8466139, 'name': 'Patrick Marleau', ...","{'apiID': 8468493, 'name': 'Ron Hainsey', 'fir...","{'apiID': 8468575, 'name': 'Dominic Moore', 'f...","{'apiID': 8469455, 'name': 'Jason Spezza', 'fi...","{'apiID': 8469521, 'name': 'Tomas Plekanec', '...","{'apiID': 8470147, 'name': 'Curtis McElhinney'...","{'apiID': 8470599, 'name': 'Milan Michalek', '...","{'apiID': 8470611, 'name': 'Eric Fehr', 'first...","{'apiID': 8470619, 'name': 'Brian Boyle', 'fir...",...,"{'apiID': 8482222, 'name': 'Alexander Barabano...","{'apiID': 8482241, 'name': 'Radim Zohorna', 'f...","{'apiID': 8482247, 'name': 'Mikko Lehtonen', '...","{'apiID': 8482259, 'name': 'Bobby McMann', 'fi...","{'apiID': 8482634, 'name': 'Alex Steeves', 'fi...","{'apiID': 8482720, 'name': 'Matthew Knies', 'f...","{'apiID': 8482815, 'name': 'Kirill Semyonov', ...","{'apiID': 8483183, 'name': 'Jett Alexander', '...","{'apiID': 8483489, 'name': 'Fraser Minten', 'f...","{'apiID': 8484158, 'name': 'Easton Cowan', 'fi..."


### Let's finally export the cleaned dataframe into a csv file

In [7]:
title = '../cleaned_csv/playersAllDetailsCleaned.csv'
# Make sure the output folder exists so the export also works on a fresh
# checkout; exist_ok=True leaves an already-present folder untouched.
os.makedirs(os.path.dirname(title), exist_ok=True)
# index=False: do not write the row index as an extra column.
df.to_csv(title, index=False)

In [8]:
# How to get the exported csv back into a dataframe.
# NOTE(review): the cells were written from dict objects, so they presumably
# come back as their string representation rather than as dicts — confirm
# before relying on the values.
cleaned_csv_path = '../cleaned_csv/playersAllDetailsCleaned.csv'
a = pd.read_csv(cleaned_csv_path)
# column name -> list of that column's values
b = a.to_dict(orient='list')
# rebuild a frame from that mapping, one column per key
c = pd.DataFrame.from_dict(b, orient='columns')
c

Unnamed: 0,8466138,8466139,8468493,8468575,8469455,8469521,8470147,8470599,8470611,8470619,...,8482222,8482241,8482247,8482259,8482634,8482720,8482815,8483183,8483489,8484158
0,"{'apiID': 8466138, 'name': 'Joe Thornton', 'fi...","{'apiID': 8466139, 'name': 'Patrick Marleau', ...","{'apiID': 8468493, 'name': 'Ron Hainsey', 'fir...","{'apiID': 8468575, 'name': 'Dominic Moore', 'f...","{'apiID': 8469455, 'name': 'Jason Spezza', 'fi...","{'apiID': 8469521, 'name': 'Tomas Plekanec', '...","{'apiID': 8470147, 'name': 'Curtis McElhinney'...","{'apiID': 8470599, 'name': 'Milan Michalek', '...","{'apiID': 8470611, 'name': 'Eric Fehr', 'first...","{'apiID': 8470619, 'name': 'Brian Boyle', 'fir...",...,"{'apiID': 8482222, 'name': 'Alexander Barabano...","{'apiID': 8482241, 'name': 'Radim Zohorna', 'f...","{'apiID': 8482247, 'name': 'Mikko Lehtonen', '...","{'apiID': 8482259, 'name': 'Bobby McMann', 'fi...","{'apiID': 8482634, 'name': 'Alex Steeves', 'fi...","{'apiID': 8482720, 'name': 'Matthew Knies', 'f...","{'apiID': 8482815, 'name': 'Kirill Semyonov', ...","{'apiID': 8483183, 'name': 'Jett Alexander', '...","{'apiID': 8483489, 'name': 'Fraser Minten', 'f...","{'apiID': 8484158, 'name': 'Easton Cowan', 'fi..."
