# Notebook 2:  Data Cleaning 

Sometimes through the scraping or manual download from the website, unnecessary columns or cells with no value occur.  These should be deleted before using them in the model. 
Additonally the salaries need to be cleaned. For that we transform them into a numerical format and normalize them in respect to the salary limit of each year. (The limit is usually increased every year by the NBA.)

## Imports

In [206]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup, Comment
import requests
import lxml
import unicodedata
import json, pickle

## Setting up the cleaning of the players data

In [72]:
def strip_accents_and_punctuation(text):
    '''Normalize player name spellings'''
    try:
        text = unicode(text, 'utf-8')
    except NameError: # unicode is a default on python 3 
        pass
    text = unicodedata.normalize('NFD', text)\
           .encode('ascii', 'ignore')\
           .decode("utf-8")
    return str(text).replace('.','').replace(',','').replace("'",'')

# (1) Merge Scraped Data

Merging datasets

In [184]:
#loading all the different dataframes, which we want to combine into one

df2022 = pd.read_csv('./data/df2022_raw.csv');
df2023salaries = pd.read_csv('./data/dfcurrentsalaries.csv');
df2023stats = pd.read_csv('./data/dfcurrentstats.csv');
df2023urls = pd.read_csv('./data/dfplayer_to_url.csv');
df2021_merged = pd.read_csv('./data/df2021_merged.csv');
df2023_salary = pd.read_csv('./data\df2023_salary.csv'); 


## a. Update df2022 with 2023 salaries

In [185]:
# Merge df2022_merged with df2021_merged to get "PrevSal"
df2022 = df2022.merge(df2021_merged[['Name', 'Salary']], on='Name', how='left')

df2022.rename(columns={'Salary': 'PrevSal'}, inplace=True)
df2022.rename(columns = {'CurrentSalary':'Salary'})

# Merge the merged_df with df2023_salary to get "NextSal"
df2022 = df2022.merge(df2023_salary[['Player', '2023-24']], left_on='Name', right_on='Player', how='left')

df2022.rename(columns={'2023-24': 'NextSal'}, inplace=True)

# Adding new columns, which will be used later on, for differentiating between years.
df2022['NameYear'] = '2022'
df2022['PrevYear'] = '2021'
df2022['NextYear'] = '2023'

# Dropping redundant columns, which are generated through the merging.
df2022.drop('Player', axis=1, inplace=True)
df2022.drop('Unnamed: 0.1', axis=1, inplace=True)
df2022.drop('Unnamed: 0', axis=1, inplace=True)

df2022.head(5)

Unnamed: 0,Name,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,WS,WS/48,CurrentSalary,CurrentTeam,ID,PrevSal,NextSal,NameYear,PrevYear,NextYear
0,Precious Achiuwa,C,22,TOR,73,28,23.6,3.6,8.3,0.439,...,2.5,0.07,"$4,379,527",TOR,/players/a/achiupr01.html,"$2,711,280",$4379527,2022,2021,2023
1,Steven Adams,C,28,MEM,76,75,26.3,2.8,5.1,0.547,...,6.8,0.163,"$12,600,000",MEM,/players/a/adamsst01.html,"$17,073,171",$12600000,2022,2021,2023
2,Bam Adebayo,C,24,MIA,56,56,32.6,7.3,13.0,0.557,...,7.2,0.188,"$32,600,060",MIA,/players/a/adebaba01.html,"$28,103,550",$32600060,2022,2021,2023
3,Santi Aldama,PF,21,MEM,32,0,11.3,1.7,4.1,0.402,...,0.3,0.044,"$2,194,200",MEM,/players/a/aldamsa01.html,"$1,994,520",$2194200,2022,2021,2023
4,LaMarcus Aldridge,C,36,BRK,47,12,22.3,5.4,9.7,0.55,...,3.1,0.141,,,,"$2,641,691",,2022,2021,2023


## b. Update df2022 with 2023 salaries and player urls

In [186]:
df2022_merged = pd.merge(df2023stats, df2023salaries, left_on="Player", right_on="Name", how="left")

df2022_merged = pd.merge(df2022, df2023urls, left_on="Name", right_on="Name", how="left")


df2022_merged = df2022_merged.rename(columns={'Player': 'Name'})

# Dropping different columns due to the merging.
df2022_merged.drop('Unnamed: 0', axis=1, inplace=True)
df2022_merged.drop('ID_x', axis=1, inplace=True)
df2022_merged.drop('ID_y', axis=1, inplace=True)
#df2022_merged.drop('Unnamed: 0_x', axis=1, inplace=True)
#df2022_merged.drop('Unnamed: 0_y', axis=1, inplace=True)

df2022_merged.head(5)

Unnamed: 0,Name,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,VORP,WS,WS/48,CurrentSalary,CurrentTeam,PrevSal,NextSal,NameYear,PrevYear,NextYear
0,Precious Achiuwa,C,22,TOR,73,28,23.6,3.6,8.3,0.439,...,-0.2,2.5,0.07,"$4,379,527",TOR,"$2,711,280",$4379527,2022,2021,2023
1,Steven Adams,C,28,MEM,76,75,26.3,2.8,5.1,0.547,...,2.0,6.8,0.163,"$12,600,000",MEM,"$17,073,171",$12600000,2022,2021,2023
2,Bam Adebayo,C,24,MIA,56,56,32.6,7.3,13.0,0.557,...,2.7,7.2,0.188,"$32,600,060",MIA,"$28,103,550",$32600060,2022,2021,2023
3,Santi Aldama,PF,21,MEM,32,0,11.3,1.7,4.1,0.402,...,-0.3,0.3,0.044,"$2,194,200",MEM,"$1,994,520",$2194200,2022,2021,2023
4,LaMarcus Aldridge,C,36,BRK,47,12,22.3,5.4,9.7,0.55,...,0.7,3.1,0.141,,,"$2,641,691",,2022,2021,2023


## Save merged data

In [187]:
df2022_merged.to_csv(r'C:\Users\Vincenzo\Documents\Master 2. Semester\GutHub_Sofi\BigDataNBA\data\df2022_merged.csv')

# (2) Clean salary data

## Load files

In [216]:
df2020_merged = pd.read_csv('./data/df2020_merged.csv')
df2021_merged = pd.read_csv('./data/df2021_merged.csv')
df2022 = pd.read_csv('./data/df2022_merged.csv')

# Dropping one column, which is being generated every time one loads the dataframe. (Unknown why that is the case.)
df2022.drop('Unnamed: 0', axis=1, inplace=True)

In [217]:
df2022.head(5)

Unnamed: 0,Name,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,VORP,WS,WS/48,CurrentSalary,CurrentTeam,PrevSal,NextSal,NameYear,PrevYear,NextYear
0,Precious Achiuwa,C,22,TOR,73,28,23.6,3.6,8.3,0.439,...,-0.2,2.5,0.07,"$4,379,527",TOR,"$2,711,280",$4379527,2022,2021,2023
1,Steven Adams,C,28,MEM,76,75,26.3,2.8,5.1,0.547,...,2.0,6.8,0.163,"$12,600,000",MEM,"$17,073,171",$12600000,2022,2021,2023
2,Bam Adebayo,C,24,MIA,56,56,32.6,7.3,13.0,0.557,...,2.7,7.2,0.188,"$32,600,060",MIA,"$28,103,550",$32600060,2022,2021,2023
3,Santi Aldama,PF,21,MEM,32,0,11.3,1.7,4.1,0.402,...,-0.3,0.3,0.044,"$2,194,200",MEM,"$1,994,520",$2194200,2022,2021,2023
4,LaMarcus Aldridge,C,36,BRK,47,12,22.3,5.4,9.7,0.55,...,0.7,3.1,0.141,,,"$2,641,691",,2022,2021,2023


## Delete unnecessary entries

Some data should be deleted, as they are not representative or not complete enough to receive a proper result. 

These would include players, who have not played a single game in the season and those who not received a salary for the upcoming season. This is because sometimes players receive a huge salary increasing by signing a new contract, due to the salary limitations set by the NBA. (E.g. if a player is out of his first contract with around $1-2 million ,he can negotiate a contract, which is tenfold.)
Meaning that players at the end of a contract, are much more dificult to analyse. First we would need to accurately forecast what the salary of a player will be, which we could only base on performance indicators. (Due to the scope of the project and data availability.) The same indicators we would then also use to determine if the salary is justified by the performance. 

Consequently we are discarding those players.


In [218]:
# We are only looking into rows, which have a value for "CurrentSalary".
df2022 = df2022[df2022['CurrentSalary'].notnull()]

In [219]:
#  1. Mask for players who played at least one game.

df2022['G'] = df2022['G'].apply(lambda s: pd.to_numeric(s, errors='coerce'))
mask1 = df2022['G'].isna()
df2022 = df2022[~mask1]

#  2. Mask for players who have a salary for next year.

mask2  = (df2022['NextSal'] == '0') | df2022['NextSal'].isna()  
df2022 = df2022[~mask2]
df2022.sample(5)

Unnamed: 0,Name,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,VORP,WS,WS/48,CurrentSalary,CurrentTeam,PrevSal,NextSal,NameYear,PrevYear,NextYear
289,Montrezl Harrell,C,28,WAS,46,3,24.3,5.4,8.3,0.645,...,1.9,5.1,0.217,"$2,760,026",PHI,"$9,720,900",$2019706,2022,2021,2023
139,Pat Connaughton,SF,29,MIL,65,19,26.0,3.5,7.6,0.458,...,1.4,4.4,0.125,"$9,423,869",MIL,"$5,333,333",$9423869,2022,2021,2023
494,T.J. McConnell,PG,29,IND,27,8,24.1,3.7,7.7,0.481,...,0.5,1.3,0.098,"$8,700,000",IND,"$7,500,000",$8700000,2022,2021,2023
477,Cody Martin,SF,26,CHO,71,11,26.3,2.9,6.0,0.482,...,0.7,3.9,0.101,"$7,560,000",CHO,"$1,782,621",$7560000,2022,2021,2023
503,Jordan McLaughlin,PG,25,MIN,62,3,14.5,1.4,3.2,0.44,...,0.6,2.3,0.123,"$2,320,000",MIN,"$1,669,178",$2320000,2022,2021,2023


### Clean salries and normalize by salary cap

In order to fully normalize our data in regards to salary, we use the limitation for the salary, set by the NBA, from 2023 for all the years. This is a simpliflying approach.
We had to use a lot of transformations and with many hours of try and error, we managed to make the code work. It is definetely improvable, but we did not dare to touch it again.
Following the slogan "never touch a running system".

In [235]:
# Convert 'PrevYear' and 'NextYear' columns to integers
df2022['PrevYear'] = df2022['PrevYear'].astype(int)
df2022['NextYear'] = df2022['NextYear'].astype(int)

def clean_salary(sal, year):
    # Adds all salaries earned per year into one salary (in millions of dollars)
    #   "< $Minimum" is treated as $0

    if isinstance(sal, float):
        return 0
    sal = str(sal)  # Convert sal to string if it's not already    

    newlist = [0]
    for elem in sal.split('$'):
        try:
            elem = elem.replace('<','').replace(',','').replace('(TW)','').replace('Minimum','0')
            #elem =  float(elem)
            newlist.append(float(elem))
        except:
            continue
            
    salary = sum(newlist)/1000000
    
    return salary

# Salary Cap Data taken from spotrac.com/nba/cba
salarycap = {2016:  94.143, 2017: 99.093,   2018: 101.869,  2019:  109.14,   
             2020:  109.14,    2021: 112.414,   2022: 123.655,  2023: 136.021}   

def clean_and_norm_salary(sal, year):
    #Adds all salaries earned per year into one salary (in millions of dollars)
    # < $Minimum" is treated as $0
    # Then normalizes by salary cap (in 2023 salary cap dollars)

    if isinstance(sal, float):  # Check if sal is not a string
        return 0  # Return 0 for non-string values

    sal = str(sal)  # Convert sal to string if it's not already

    newlist = [0]
    for elem in sal.split('$'):
        try:
            elem = elem.replace('<','').replace(',','').replace('Minimum','0')
            elem =  float(elem)
            newlist.append(elem)
        except:
            continue
            
    salary = sum(newlist)/1000000
    salary = salary*salarycap[2023]/salarycap[year]
        
    return salary

    df2022_merged = df2022_merged.reset_index(drop=True)

# Calculate the previous and next years
df2022['PrevYear'] = df2022['PrevYear']
df2022['NextYear'] = df2022['PrevYear'] + 1

# Apply the functions to entire columns using vectorized operations
df2022['PrevSalClean'] = df2022.apply(lambda row: clean_salary(row['PrevSal'], row['PrevYear']), axis=1)
df2022['NextSalClean'] = df2022.apply(lambda row: clean_salary(row['NextSal'], row['NextYear']), axis=1)
df2022['PrevSalNorm'] = df2022.apply(lambda row: clean_and_norm_salary(row['PrevSal'], row['PrevYear']), axis=1)
df2022['NextSalNorm'] = df2022.apply(lambda row: clean_and_norm_salary(row['NextSal'], row['NextYear']), axis=1)

cols = ['NextSal', 'NextSalClean', 'NextSalNorm']
print(df2022[cols].sample(8))

       NextSal  NextSalClean  NextSalNorm
160  $40600080     40.600080    44.660252
790  $11571429     11.571429    12.728619
777  $18357143     18.357143    20.192932
32   $10900635     10.900635    11.990743
812  $10933333     10.933333    12.026711
491  $35802469     35.802469    39.382861
584   $6500000      6.500000     7.150026
477   $7560000      7.560000     8.316031


"for index, row in df.iterrows():\n    prevyear = df2022.loc[index, 'PrevYear']\n    nextyear = df2022.loc[index, 'PrevYear'] + 1 \n\n    prevsal = df2022.loc[index, 'PrevSal']\n    nextsal = df2022.loc[index, 'NextSal']\n    \n    df2022.loc[index, 'PrevSalClean'] = clean_salary(prevsal, prevyear)\n    df2022.loc[index, 'NextSalClean'] = clean_salary(nextsal, nextyear)\n    df2022.loc[index, 'PrevSalNorm'] = clean_and_norm_salary(prevsal, prevyear)\n    df2022.loc[index, 'NextSalNorm'] = clean_and_norm_salary(nextsal, nextyear)\n   \ncols = ['NextSal', 'NextSalClean','NextSalNorm']\ndf2022[cols].sample(8)"

In [236]:
for index, row in df2022.iterrows():
    
    salary = df2022.loc[index, 'CurrentSalary']
    try:
        df2022.loc[index, 'SalClean'] = clean_salary(CurrentSalary, 2023)
    except:
        df2022.loc[index, 'SalClean'] = np.nan
        
    try:
        df2022.loc[index, 'SalNorm'] = norm_salary(CurrentSalary, 2023)
    except:
        df2022.loc[index, 'SalNorm'] = np.nan

df2022.sample(3)

Unnamed: 0,Name,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,NextSal,NameYear,PrevYear,NextYear,SalClean,SalNorm,PrevSalClean,NextSalClean,PrevSalNorm,NextSalNorm
772,Trendon Watford,SF,21,POR,48,10,18.1,3.0,5.6,0.532,...,$1836096,2022,2021,2022,,,0.260561,1.836096,0.315279,2.019713
292,Joe Harris,SF,30,BRK,14,14,30.2,4.0,8.9,0.452,...,$19928571,2022,2021,2022,,,17.357143,19.928571,21.002152,21.921509
207,Joel Embiid,C,27,PHI,68,68,33.8,9.8,19.6,0.499,...,$47607350,2022,2021,2022,,,31.57939,47.60735,38.211079,52.368278


## Save cleaned data

df.to_csv(r'/Users/richardsihombing/Documents/BigDataNBA/data/cleaned_pastyears.csv')
df2021.to_csv(r'/Users/richardsihombing/Documents/BigDataNBA/data/cleaned_currentyear.csv')



In [237]:
#df.to_csv(r'C:\Users\Vincenzo\Documents\Master 2. Semester\GutHub_Sofi\BigDataNBA\data\cleaned_pastyears.csv')
df2022.to_csv(r'C:\Users\Vincenzo\Documents\Master 2. Semester\GutHub_Sofi\BigDataNBA\data\cleaned_currentyear.csv')
