### Import Dependencies

In [None]:
import pandas as pd
import numpy as np
import os

### Read CSV file.

In [None]:
# Load the scraped 2016-2017 team roster and display it for a first look.
df = pd.read_csv("../scraped_csv/team_rosters/20162017_team_roster.csv")
df

### Looking at the dataset, there are a few things we need to do:
- Find and remove null values
- Remove the .0 at the end of the jersey numbers
- Delete the columns not fitting in the database schema
- Rename column names to align with database schema

So let's go ahead and do these tasks.

#### Find and remove null values

In [None]:
# find null values
# Count the nulls per column: the raw boolean mask returned by isna() has one
# cell per value and cannot be checked by eye.
df.isna().sum()

There is only one null value, so it is acceptable to drop the affected row with the `dropna()` method.

In [None]:
# Drop every row that contains at least one null value.
df = df.dropna()

#### Remove the .0 at the end of jersey numbers

In [None]:
# Check the dtype of the jersey numbers to see why they show a trailing .0
df['jerseyNumber'].dtype

The `.0` appears because the column is stored as a float. We need to change its type to int.

In [None]:
# Cast the jersey numbers to 64-bit integers so they no longer carry a trailing .0
df = df.astype({'jerseyNumber':np.int64})

#### Delete the columns not fitting in the database schema

In [None]:
# Display the dataframe to see which columns are present.
df

In [None]:
# Columns that are removed from the dataframe before it is loaded into the database.
dropped_columns = [
    'person.link', 
    'position.code', 
    'position.type',
    'position.abbreviation' 
]
df = df.drop(columns=dropped_columns)
df

#### Rename column names to align with database schema

In [None]:
# Mapping from the scraped column names (keys) to the new column names (values).
renamed_columns = {
    'person.id': 'apiID', 
    'person.fullName': 'name', 
    'position.name': 'position', 
}
df = df.rename(columns=renamed_columns)
df

### Creating extra columns

To make querying easier, we want to create two extra fields: 
- First Name
- Last Name

We will achieve this by:
- Getting full name list from dataframe
- Splitting full name into two lists: `firstName` and `lastName`
- Assigning the two lists as columns in the dataframe

Let's get started!

#### Getting full name list from dataframe

In [None]:
# Pull the full names out of the dataframe as a plain Python list.
fullName = df['name'].tolist()
fullName

#### Splitting full name into two lists: `firstName` and `lastName`

In [None]:
# making the two lists
# Split each name once, at the first space: everything before it is the first
# name and everything after it is the last name. This keeps multi-word surnames
# intact (e.g. "James van Riemsdyk" -> "van Riemsdyk") and yields an empty last
# name, instead of an IndexError, for a single-word name.
name_parts = [name.strip().partition(" ") for name in fullName]
firstName = [first for first, _, _ in name_parts]
print(firstName)
lastName = [rest.strip() for _, _, rest in name_parts]
print(lastName)

In [None]:
# attach both name lists to the dataframe as new columns
df = df.assign(firstName=firstName, lastName=lastName)
df

### Let's check that the data has the format in which it will be inserted into the database

In [None]:
# Convert the dataframe into a list with one dict per row.
payload = df.to_dict(orient='records')
payload

Clearly it does.

### Now, we need to apply the above modifications to all the other roster datasets and then combine them all into one csv file to fit the database schema.

#### This will be done by writing a general cleaning function and calling it in a loop over all the datasets. Together, these steps:
- Apply the modifications above to each dataset
- Merge all the datasets together, labelled with their year number, to fit the database schema.

In [None]:
# making function

def _split_full_name(full_name):
    """Split a full name into (first name, last name) at the first space."""
    first, _, rest = full_name.strip().partition(" ")
    return first, rest.strip()


def cleaning_dataset(dataframe):
    """Load a roster CSV and return its rows as cleaned record dicts.

    The cleaning steps are:
      1. drop every row that contains a null value,
      2. cast ``jerseyNumber`` to ``int64``,
      3. drop ``person.link``, ``position.code``, ``position.type`` and
         ``position.abbreviation``,
      4. rename ``person.id`` -> ``apiID``, ``person.fullName`` -> ``name``
         and ``position.name`` -> ``position``,
      5. add ``firstName`` and ``lastName`` columns derived from ``name``.

    Args:
        dataframe: Path of the roster CSV file; it is passed straight to
            ``pd.read_csv``. Despite its name this is not a DataFrame.

    Returns:
        A list with one dict per remaining row, keyed by column name.

    Raises:
        KeyError: If one of the expected columns is missing from the CSV.
    """
    df = pd.read_csv(dataframe)

    df = df.dropna()

    df = df.astype({'jerseyNumber': np.int64})

    dropped_columns = [
        'person.link',
        'position.code',
        'position.type',
        'position.abbreviation',
    ]
    df = df.drop(columns=dropped_columns)

    renamed_columns = {
        'person.id': 'apiID',
        'person.fullName': 'name',
        'position.name': 'position',
    }
    df = df.rename(columns=renamed_columns)

    # Split each name only once so multi-word surnames stay whole and a
    # single-word name gets an empty last name instead of raising IndexError.
    split_names = [_split_full_name(full_name) for full_name in df['name']]
    df['firstName'] = [first for first, _ in split_names]
    df['lastName'] = [last for _, last in split_names]

    return df.to_dict(orient='records')



In [None]:
# get list of all roster datasets
# Only files named "<season>_team_roster.csv" are considered, so stray directory
# entries (e.g. .DS_Store or .ipynb_checkpoints) cannot produce a bogus season.
# The result is sorted because os.listdir returns entries in arbitrary order.
roster_datasets_seasons = sorted(
    filename[: -len("_team_roster.csv")]
    for filename in os.listdir("../scraped_csv/team_rosters/")
    if filename.endswith("_team_roster.csv")
)
roster_datasets_seasons

In [None]:
# Clean every season's roster file and collect the records keyed by season.
all_rosters = {}
for season in roster_datasets_seasons:
    roster_path = f"../scraped_csv/team_rosters/{season}_team_roster.csv"
    all_rosters[season] = cleaning_dataset(roster_path)
# One column per season; each record list is wrapped in a Series before the
# frame is built.
df = pd.DataFrame({k: pd.Series(v) for k, v in all_rosters.items()})
df

In [None]:
# replace NaN with 0
# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0
# and raises AttributeError there.
df = df.replace(np.nan, 0)
df

### Finally, let's export the cleaned dataframe to a csv file

In [None]:
title = '../cleaned_csv/teamAllRosterDetailsCleaned.csv'
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(title), exist_ok=True)
df.to_csv(title, index=False)

In [None]:
# how to get csv back into dataframe
# NOTE(review): the exported CSV stores each roster record as text, so the
# cells presumably come back as strings rather than dicts - confirm before
# treating them as records.

a = pd.read_csv('../cleaned_csv/teamAllRosterDetailsCleaned.csv')
# Round-trip through a {column: [values]} dict and rebuild a dataframe from it.
b = a.to_dict(orient='list')
c = pd.DataFrame(b)
c