### Import Dependencies

In [1]:
import pandas as pd
import numpy as np
import os, csv, ast

### Read CSV file.

In [2]:
df = pd.read_csv("../scraped_csv/team_rosters/20182019_team_roster.csv")
df

Unnamed: 0,team_id,season,jerseyNumber,person.id,person.fullName,person.link,position.code,position.name,position.type,position.abbreviation
0,1,2018,11.0,8470619,Brian Boyle,/api/v1/people/8470619,C,Center,Forward,C
1,1,2018,18.0,8471226,Drew Stafford,/api/v1/people/8471226,R,Right Wing,Forward,RW
2,1,2018,14.0,8471233,Travis Zajac,/api/v1/people/8471233,C,Center,Forward,C
3,1,2018,4.0,8472382,Andy Greene,/api/v1/people/8472382,D,Defenseman,Defenseman,D
4,1,2018,2.0,8473468,Eric Gryba,/api/v1/people/8473468,D,Defenseman,Defenseman,D
...,...,...,...,...,...,...,...,...,...,...
1094,54,2018,89.0,8477949,Alex Tuch,/api/v1/people/8477949,R,Right Wing,Forward,RW
1095,54,2018,2.0,8481486,Jimmy Schuldt,/api/v1/people/8481486,D,Defenseman,Defenseman,D
1096,54,2018,29.0,8470594,Marc-Andre Fleury,/api/v1/people/8470594,G,Goalie,Goalie,G
1097,54,2018,33.0,8476509,Maxime Lagace,/api/v1/people/8476509,G,Goalie,Goalie,G


### Looking at dataset, there are a couple of things we need to do:
- Find and remove null values
- Remove the .0 at the end of the jersey numbers
- Delete the columns not fitting in the database schema
- Rename column names to align with database schema

So let's go ahead and do these tasks.

#### Find and remove null values

In [3]:
# find null values
df.isna().sum()

team_id                   0
season                    0
jerseyNumber             13
person.id                 0
person.fullName           0
person.link               0
position.code             0
position.name             0
position.type             0
position.abbreviation     0
dtype: int64

In [4]:
df[df['jerseyNumber'].isna()]

Unnamed: 0,team_id,season,jerseyNumber,person.id,person.fullName,person.link,position.code,position.name,position.type,position.abbreviation
21,1,2018,,8477463,Steven Santini,/api/v1/people/8477463,D,Defenseman,Defenseman,D
25,1,2018,,8477972,Josh Jacobs,/api/v1/people/8477972,D,Defenseman,Defenseman,D
107,4,2018,,8474161,Jakub Voracek,/api/v1/people/8474161,R,Right Wing,Forward,RW
232,7,2018,,8477461,Remi Elie,/api/v1/people/8477461,L,Left Wing,Forward,LW
320,9,2018,,8479366,Logan Brown,/api/v1/people/8479366,C,Center,Forward,C
329,9,2018,,8475195,Anders Nilsson,/api/v1/people/8475195,G,Goalie,Goalie,G
494,16,2018,,8470607,Brent Seabrook,/api/v1/people/8470607,D,Defenseman,Defenseman,D
551,17,2018,,8477994,Dominic Turgeon,/api/v1/people/8477994,C,Center,Forward,C
553,17,2018,,8478176,Joe Hicketts,/api/v1/people/8478176,D,Defenseman,Defenseman,D
704,22,2018,,8474520,Jason Garrison,/api/v1/people/8474520,D,Defenseman,Defenseman,D


Not significant amount of null values, thus it is acceptable to use the `dropna()` method.

In [5]:
df = df.dropna()
df.isna().sum()

team_id                  0
season                   0
jerseyNumber             0
person.id                0
person.fullName          0
person.link              0
position.code            0
position.name            0
position.type            0
position.abbreviation    0
dtype: int64

In [6]:
df[df['jerseyNumber'].isna()]

Unnamed: 0,team_id,season,jerseyNumber,person.id,person.fullName,person.link,position.code,position.name,position.type,position.abbreviation


#### Remove the .0 at the end of jersey numbers

In [7]:
# Removing the .0 at the end of the jersey numbers
df['jerseyNumber'].dtype

dtype('float64')

The .0 exists as it is a float. We need to change the type to int.

In [8]:
df = df.astype({'jerseyNumber':np.int64})

#### Delete the columns not fitting in the database schema

In [9]:
df

Unnamed: 0,team_id,season,jerseyNumber,person.id,person.fullName,person.link,position.code,position.name,position.type,position.abbreviation
0,1,2018,11,8470619,Brian Boyle,/api/v1/people/8470619,C,Center,Forward,C
1,1,2018,18,8471226,Drew Stafford,/api/v1/people/8471226,R,Right Wing,Forward,RW
2,1,2018,14,8471233,Travis Zajac,/api/v1/people/8471233,C,Center,Forward,C
3,1,2018,4,8472382,Andy Greene,/api/v1/people/8472382,D,Defenseman,Defenseman,D
4,1,2018,2,8473468,Eric Gryba,/api/v1/people/8473468,D,Defenseman,Defenseman,D
...,...,...,...,...,...,...,...,...,...,...
1094,54,2018,89,8477949,Alex Tuch,/api/v1/people/8477949,R,Right Wing,Forward,RW
1095,54,2018,2,8481486,Jimmy Schuldt,/api/v1/people/8481486,D,Defenseman,Defenseman,D
1096,54,2018,29,8470594,Marc-Andre Fleury,/api/v1/people/8470594,G,Goalie,Goalie,G
1097,54,2018,33,8476509,Maxime Lagace,/api/v1/people/8476509,G,Goalie,Goalie,G


In [10]:
df.columns

Index(['team_id', 'season', 'jerseyNumber', 'person.id', 'person.fullName',
       'person.link', 'position.code', 'position.name', 'position.type',
       'position.abbreviation'],
      dtype='object')

In [11]:
dropped_columns = [
    'person.link', 
    'position.code', 
]
df = df.drop(columns=dropped_columns)
df

Unnamed: 0,team_id,season,jerseyNumber,person.id,person.fullName,position.name,position.type,position.abbreviation
0,1,2018,11,8470619,Brian Boyle,Center,Forward,C
1,1,2018,18,8471226,Drew Stafford,Right Wing,Forward,RW
2,1,2018,14,8471233,Travis Zajac,Center,Forward,C
3,1,2018,4,8472382,Andy Greene,Defenseman,Defenseman,D
4,1,2018,2,8473468,Eric Gryba,Defenseman,Defenseman,D
...,...,...,...,...,...,...,...,...
1094,54,2018,89,8477949,Alex Tuch,Right Wing,Forward,RW
1095,54,2018,2,8481486,Jimmy Schuldt,Defenseman,Defenseman,D
1096,54,2018,29,8470594,Marc-Andre Fleury,Goalie,Goalie,G
1097,54,2018,33,8476509,Maxime Lagace,Goalie,Goalie,G


In [12]:
list(df.columns)

['team_id',
 'season',
 'jerseyNumber',
 'person.id',
 'person.fullName',
 'position.name',
 'position.type',
 'position.abbreviation']

In [13]:
df = df[[
    'team_id',
    'season',
    'person.id',
    'person.fullName',
    'jerseyNumber',
    'position.name',
    'position.abbreviation',
    'position.type',
]]

df

Unnamed: 0,team_id,season,person.id,person.fullName,jerseyNumber,position.name,position.abbreviation,position.type
0,1,2018,8470619,Brian Boyle,11,Center,C,Forward
1,1,2018,8471226,Drew Stafford,18,Right Wing,RW,Forward
2,1,2018,8471233,Travis Zajac,14,Center,C,Forward
3,1,2018,8472382,Andy Greene,4,Defenseman,D,Defenseman
4,1,2018,8473468,Eric Gryba,2,Defenseman,D,Defenseman
...,...,...,...,...,...,...,...,...
1094,54,2018,8477949,Alex Tuch,89,Right Wing,RW,Forward
1095,54,2018,8481486,Jimmy Schuldt,2,Defenseman,D,Defenseman
1096,54,2018,8470594,Marc-Andre Fleury,29,Goalie,G,Goalie
1097,54,2018,8476509,Maxime Lagace,33,Goalie,G,Goalie


#### Rename column names to align with database schema

In [14]:
renamed_columns = {
    'jerseyNumber': 'jersey_number', 
    'person.id': 'player_id',
    'person.fullName': 'full_name', 
    'position.name': 'position', 
    'position.abbreviation': 'position_code',
    'position.type': 'role'

}
df = df.rename(columns=renamed_columns)
df

Unnamed: 0,team_id,season,player_id,full_name,jersey_number,position,position_code,role
0,1,2018,8470619,Brian Boyle,11,Center,C,Forward
1,1,2018,8471226,Drew Stafford,18,Right Wing,RW,Forward
2,1,2018,8471233,Travis Zajac,14,Center,C,Forward
3,1,2018,8472382,Andy Greene,4,Defenseman,D,Defenseman
4,1,2018,8473468,Eric Gryba,2,Defenseman,D,Defenseman
...,...,...,...,...,...,...,...,...
1094,54,2018,8477949,Alex Tuch,89,Right Wing,RW,Forward
1095,54,2018,8481486,Jimmy Schuldt,2,Defenseman,D,Defenseman
1096,54,2018,8470594,Marc-Andre Fleury,29,Goalie,G,Goalie
1097,54,2018,8476509,Maxime Lagace,33,Goalie,G,Goalie


### Creating extra columns

To make querying easier, we want to create two extra fields: 
- First Name
- Last Name

We will achieve this by:
- Getting full name list from dataframe
- Splitting full name into two lists: `firstName` and `lastName`
- Assigning the two lists as columns in the dataframe

Let's get started!

#### Getting full name list from dataframe

In [15]:
fullName = df['full_name'].tolist()
fullName

['Brian Boyle',
 'Drew Stafford',
 'Travis Zajac',
 'Andy Greene',
 'Eric Gryba',
 'Ben Lovejoy',
 'Eric Tangradi',
 'Marcus Johansson',
 'Kyle Palmieri',
 'Sami Vatanen',
 'Taylor Hall',
 'Kenny Agostino',
 'Blake Pietila',
 'Blake Coleman',
 'Ryan Murphy',
 'Stefan Noesen',
 'Kurtis Gabriel',
 'Damon Severson',
 'Connor Carrick',
 'Will Butcher',
 'Miles Wood',
 'Mirco Mueller',
 'Jean-Sebastien Dea',
 'John Quenneville',
 'Pavel Zacha',
 'Colton White',
 'Brett Seney',
 'Nick Lappin',
 'Kevin Rooney',
 'Joey Anderson',
 'Brandon Gignac',
 'Jesper Bratt',
 'Nathan Bastian',
 'Michael McLeod',
 'Nico Hischier',
 'Egor Yakovlev',
 'Cory Schneider',
 'Keith Kinkaid',
 'Mackenzie Blackwood',
 'Valtteri Filppula',
 'Johnny Boychuk',
 'Andrew Ladd',
 'Stephen Gionta',
 'Leo Komarov',
 'Cal Clutterbuck',
 'Thomas Hickey',
 'Josh Bailey',
 'Luca Sbisa',
 'Jordan Eberle',
 'Matt Martin',
 'Nick Leddy',
 'Casey Cizikas',
 'Anders Lee',
 'Brock Nelson',
 'Tom Kuhnhackl',
 'Scott Mayfield',
 'Ad

#### Splitting full name into two lists: `firstName` and `lastName`

In [16]:
# making the two lists
firstName = list(map(lambda x: x.split(" ")[0], fullName))
print(firstName)
lastName = list(map(lambda x: x.split(" ")[1], fullName))
print(lastName)

['Brian', 'Drew', 'Travis', 'Andy', 'Eric', 'Ben', 'Eric', 'Marcus', 'Kyle', 'Sami', 'Taylor', 'Kenny', 'Blake', 'Blake', 'Ryan', 'Stefan', 'Kurtis', 'Damon', 'Connor', 'Will', 'Miles', 'Mirco', 'Jean-Sebastien', 'John', 'Pavel', 'Colton', 'Brett', 'Nick', 'Kevin', 'Joey', 'Brandon', 'Jesper', 'Nathan', 'Michael', 'Nico', 'Egor', 'Cory', 'Keith', 'Mackenzie', 'Valtteri', 'Johnny', 'Andrew', 'Stephen', 'Leo', 'Cal', 'Thomas', 'Josh', 'Luca', 'Jordan', 'Matt', 'Nick', 'Casey', 'Anders', 'Brock', 'Tom', 'Scott', 'Adam', 'Ryan', 'Ross', 'Michael', 'Joshua', 'Devon', 'Mathew', 'Anthony', 'Tanner', 'Thomas', 'Robin', 'Christopher', 'Cody', 'Marc', 'Adam', 'Matt', 'Kevin', 'Brendan', 'Chris', 'Mats', 'Ryan', 'Connor', 'Kevin', 'Jesper', 'Fredrik', 'Steven', 'Ryan', 'Mika', 'Vladislav', 'Brady', 'Jimmy', 'Boo', 'John', 'Pavel', 'Tony', 'Brendan', 'Ryan', 'Libor', 'Brett', 'Tim', 'Vinni', 'Lias', 'Filip', 'Neal', 'Henrik', 'Alexandar', 'Claude', 'Andrew', 'James', 'Wayne', 'Jori', 'Dale', 'Corb

In [17]:
# assigning the two lists as columns in the dataframe
df.insert(loc=4, column='first_name', value=firstName)
df.insert(loc=5, column='last_name', value=lastName)
df

Unnamed: 0,team_id,season,player_id,full_name,first_name,last_name,jersey_number,position,position_code,role
0,1,2018,8470619,Brian Boyle,Brian,Boyle,11,Center,C,Forward
1,1,2018,8471226,Drew Stafford,Drew,Stafford,18,Right Wing,RW,Forward
2,1,2018,8471233,Travis Zajac,Travis,Zajac,14,Center,C,Forward
3,1,2018,8472382,Andy Greene,Andy,Greene,4,Defenseman,D,Defenseman
4,1,2018,8473468,Eric Gryba,Eric,Gryba,2,Defenseman,D,Defenseman
...,...,...,...,...,...,...,...,...,...,...
1094,54,2018,8477949,Alex Tuch,Alex,Tuch,89,Right Wing,RW,Forward
1095,54,2018,8481486,Jimmy Schuldt,Jimmy,Schuldt,2,Defenseman,D,Defenseman
1096,54,2018,8470594,Marc-Andre Fleury,Marc-Andre,Fleury,29,Goalie,G,Goalie
1097,54,2018,8476509,Maxime Lagace,Maxime,Lagace,33,Goalie,G,Goalie


### Lets check if the csv shows up as we want to in the format it will be inserted into the database in

In [18]:
payload = df.to_dict(orient='records')
payload

[{'team_id': 1,
  'season': 2018,
  'player_id': 8470619,
  'full_name': 'Brian Boyle',
  'first_name': 'Brian',
  'last_name': 'Boyle',
  'jersey_number': 11,
  'position': 'Center',
  'position_code': 'C',
  'role': 'Forward'},
 {'team_id': 1,
  'season': 2018,
  'player_id': 8471226,
  'full_name': 'Drew Stafford',
  'first_name': 'Drew',
  'last_name': 'Stafford',
  'jersey_number': 18,
  'position': 'Right Wing',
  'position_code': 'RW',
  'role': 'Forward'},
 {'team_id': 1,
  'season': 2018,
  'player_id': 8471233,
  'full_name': 'Travis Zajac',
  'first_name': 'Travis',
  'last_name': 'Zajac',
  'jersey_number': 14,
  'position': 'Center',
  'position_code': 'C',
  'role': 'Forward'},
 {'team_id': 1,
  'season': 2018,
  'player_id': 8472382,
  'full_name': 'Andy Greene',
  'first_name': 'Andy',
  'last_name': 'Greene',
  'jersey_number': 4,
  'position': 'Defenseman',
  'position_code': 'D',
  'role': 'Defenseman'},
 {'team_id': 1,
  'season': 2018,
  'player_id': 8473468,
  'fu

Clearly it does.

In [58]:
len(df['full_name'].unique())

991

In [59]:
role_df = df[['team_id', 'player_id', 'full_name', 'role']].drop_duplicates(['full_name'], keep=False) 
role_df

Unnamed: 0,team_id,player_id,full_name,role
1,1,8471226,Drew Stafford,Forward
2,1,8471233,Travis Zajac,Forward
3,1,8472382,Andy Greene,Defenseman
4,1,8473468,Eric Gryba,Defenseman
6,1,8474025,Eric Tangradi,Forward
...,...,...,...,...
1094,54,8477949,Alex Tuch,Forward
1095,54,8481486,Jimmy Schuldt,Defenseman
1096,54,8470594,Marc-Andre Fleury,Goalie
1097,54,8476509,Maxime Lagace,Goalie


### Now, we need to apply the above modifications to all the other roster datasets and then combine them all into one csv file to fit the database schema.

#### This will be done by making a general function and then looping through it to connect all the datasets that:
- Applies modifications to each dataset
- Merges all the datasets together with their year number to fit database schema.

In [59]:
# making function

def cleaning_dataset(dataframe): 
    df = pd.read_csv(dataframe)

    df = df.dropna()
    
    df = df.astype({'jerseyNumber':np.int64})

    dropped_columns = [
        'person.link', 
        'position.code', 
    ]
    df = df.drop(columns=dropped_columns)
    df = df[[
        'team_id',
        'season',
        'person.id',
        'person.fullName',
        'jerseyNumber',
        'position.name',
        'position.abbreviation',
        'position.type',
    ]]

    renamed_columns = {
        'jerseyNumber': 'jersey_number', 
        'person.id': 'player_id',
        'person.fullName': 'full_name', 
        'position.name': 'position', 
        'position.abbreviation': 'position_code',
        'position.type': 'role'

    }
    df = df.rename(columns=renamed_columns)

    fullName = df['full_name'].tolist()
    firstName = list(map(lambda x: x.split(" ")[0], fullName))
    lastName = list(map(lambda x: x.split(" ")[1], fullName))
    df.insert(loc=4, column='first_name', value=firstName)
    df.insert(loc=5, column='last_name', value=lastName)

    role_df = df[['team_id', 'player_id', 'full_name', 'role']].drop_duplicates(['player_id'], keep='first') 
    df.drop(columns=['role'], inplace=True)
    return df, role_df



In [60]:
# get list of all roster datasets
roster_datasets_seasons = list(map(lambda x: x.split("_")[0], os.listdir("../scraped_csv/team_rosters/")))
roster_datasets_seasons

['20162017',
 '20172018',
 '20182019',
 '20192020',
 '20202021',
 '20212022',
 '20222023',
 '20232024']

In [61]:
# do this for every roster dataset
all_rosters = pd.DataFrame()
all_rosters_roles = pd.DataFrame()
for season in roster_datasets_seasons:
    df_temp, role_df_temp = cleaning_dataset(f"../scraped_csv/team_rosters/{season}_team_roster.csv")
    all_rosters = pd.concat([all_rosters, df_temp], ignore_index=True)
    all_rosters_roles = pd.concat([all_rosters_roles, role_df_temp], ignore_index=True)

In [62]:
all_rosters

Unnamed: 0,team_id,season,player_id,full_name,first_name,last_name,jersey_number,position,position_code
0,1,2016,8469500,Michael Cammalleri,Michael,Cammalleri,13,Left Wing,LW
1,1,2016,8469707,PA Parenteau,PA,Parenteau,11,Right Wing,RW
2,1,2016,8469759,Vernon Fiddler,Vernon,Fiddler,83,Center,C
3,1,2016,8470724,Kyle Quincey,Kyle,Quincey,27,Defenseman,D
4,1,2016,8471233,Travis Zajac,Travis,Zajac,14,Center,C
...,...,...,...,...,...,...,...,...,...
8385,55,2023,8478916,Joey Daccord,Joey,Daccord,35,Goalie,G
8386,55,2023,8479977,Kailer Yamamoto,Kailer,Yamamoto,56,Right Wing,RW
8387,55,2023,8480009,Eeli Tolvanen,Eeli,Tolvanen,20,Right Wing,RW
8388,55,2023,8481789,Tye Kartye,Tye,Kartye,52,Left Wing,LW


In [63]:
all_rosters_roles

Unnamed: 0,team_id,player_id,full_name,role
0,1,8469500,Michael Cammalleri,Forward
1,1,8469707,PA Parenteau,Forward
2,1,8469759,Vernon Fiddler,Forward
3,1,8470724,Kyle Quincey,Defenseman
4,1,8471233,Travis Zajac,Forward
...,...,...,...,...
7818,55,8478916,Joey Daccord,Goalie
7819,55,8479977,Kailer Yamamoto,Forward
7820,55,8480009,Eeli Tolvanen,Forward
7821,55,8481789,Tye Kartye,Forward


In [64]:
unique_list = all_rosters_roles['player_id'].unique()

In [65]:
all_rosters_roles_wo_duplicates = all_rosters_roles.drop_duplicates(['player_id'])
all_rosters_roles_wo_duplicates

Unnamed: 0,team_id,player_id,full_name,role
0,1,8469500,Michael Cammalleri,Forward
1,1,8469707,PA Parenteau,Forward
2,1,8469759,Vernon Fiddler,Forward
3,1,8470724,Kyle Quincey,Defenseman
4,1,8471233,Travis Zajac,Forward
...,...,...,...,...
7675,28,8480834,Ty Emberson,Defenseman
7699,29,8481716,Dmitri Voronkov,Forward
7703,29,8484166,Adam Fantilli,Forward
7726,30,8482094,Daemon Hunt,Defenseman


In [66]:
all_rosters_roles_wo_duplicates.isna().sum()

team_id      0
player_id    0
full_name    0
role         0
dtype: int64

In [67]:
len(all_rosters_roles_wo_duplicates['player_id'].unique())

1864

In [68]:
all_rosters[all_rosters['player_id']==8469707]

Unnamed: 0,team_id,season,player_id,full_name,first_name,last_name,jersey_number,position,position_code
1,1,2016,8469707,PA Parenteau,PA,Parenteau,11,Right Wing,RW
551,18,2016,8469707,PA Parenteau,PA,Parenteau,11,Right Wing,RW


In [69]:
all_rosters_roles_wo_duplicates[all_rosters_roles_wo_duplicates['player_id']==8469707]

Unnamed: 0,team_id,player_id,full_name,role
1,1,8469707,PA Parenteau,Forward


### Lets finally export the cleansed dataframe into a csv file

In [70]:
title = '../cleaned_csv/6_playersAllCleaned.csv'
all_rosters_roles_wo_duplicates.to_csv(title, index=True)

In [71]:
title = '../cleaned_csv/7_rostersAllCleaned.csv'
all_rosters.to_csv(title, index=True)

In [32]:
# how to get csv back into dataframe

a = pd.read_csv('../cleaned_csv/rostersAllCleaned.csv', index_col='Unnamed: 0')
teamRosters = a.to_dict(orient='list')
c = pd.DataFrame.from_dict(teamRosters)
c


Unnamed: 0,team_id,season,player_id,full_name,first_name,last_name,jersey_number,position,position_code
0,1,2016,8469500,Michael Cammalleri,Michael,Cammalleri,13,Left Wing,LW
1,1,2016,8469707,PA Parenteau,PA,Parenteau,11,Right Wing,RW
2,1,2016,8469759,Vernon Fiddler,Vernon,Fiddler,83,Center,C
3,1,2016,8470724,Kyle Quincey,Kyle,Quincey,27,Defenseman,D
4,1,2016,8471233,Travis Zajac,Travis,Zajac,14,Center,C
...,...,...,...,...,...,...,...,...,...
8385,55,2023,8478916,Joey Daccord,Joey,Daccord,35,Goalie,G
8386,55,2023,8479977,Kailer Yamamoto,Kailer,Yamamoto,56,Right Wing,RW
8387,55,2023,8480009,Eeli Tolvanen,Eeli,Tolvanen,20,Right Wing,RW
8388,55,2023,8481789,Tye Kartye,Tye,Kartye,52,Left Wing,LW
