In [1]:
# Mount Google Drive at /content/drive; the CSV inputs read below live under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import spacy
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from collections import Counter
from datetime import datetime

In [None]:
# Load the four raw CSV inputs; the paths are listed in the same order as the names
# they are unpacked into.
award, player, team_rebound, team_stat = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/SI670Final/awards_data.csv',
        '/content/drive/MyDrive/SI670Final/player_stats.csv',
        '/content/drive/MyDrive/SI670Final/team_rebounding_data_22.csv',
        '/content/drive/MyDrive/SI670Final/team_stats.csv',
    )
]

### Find players who were All-Stars

In [None]:
# Keep only the award rows whose all_star_game flag is True.
is_all_star_row = award['all_star_game'] == True
all_star_players = award.loc[is_all_star_row]

In [None]:
 # Attach player names to the all-star rows.
 # `player` has a 'season' column, so one person can appear on many rows. De-duplicate
 # the (player, nbapersonid) lookup first; otherwise the left merge repeats every
 # all-star row once per matching stats row.
 player_names = player[['player', 'nbapersonid']].drop_duplicates()
 player_all_star = pd.merge(all_star_players, player_names, on='nbapersonid', how='left')

In [None]:
# Restrict the stats table to the 2018 draft class.
# .copy() makes player18 an independent frame, so adding columns to it later does
# not raise SettingWithCopyWarning or risk writing into a view of `player`.
player18 = player[player['draftyear'] == 2018].copy()

In [None]:
# Number of stats rows per player name within the 2018 class.
player18['player'].value_counts()

### Create a binary `is_all_star` column (1 = All-Star, 0 = not)

In [None]:
# Flag all-stars by nbapersonid rather than by display name: the ID is the key used for
# every merge in this notebook, whereas names can collide or be missing.
# `isin` is vectorised and replaces a per-row lambda that rebuilt the lookup set on
# every call. assign() returns a new frame, so no slice is mutated in place.
all_star_ids = player_all_star['nbapersonid']
player18 = player18.assign(
    is_all_star=player18['nbapersonid'].isin(all_star_ids).astype(int)
)


### Keep each player's first four seasons and number them 1 to 4

In [None]:
# Keep rows whose season is at most 3 years after the draft year.
# .copy() detaches the result from player18 so that a column can be added to it
# afterwards without SettingWithCopyWarning.
seasons_since_draft = player18['season'] - player18['draftyear']
player18_filtered2 = player18[seasons_since_draft <= 3].copy()

In [None]:
# Year is the 1-based season number relative to the draft year (draft season -> 1).
# assign() builds a new frame instead of writing a column into a filtered slice,
# which avoids SettingWithCopyWarning.
player18_filtered2 = player18_filtered2.assign(
    Year=player18_filtered2['season'] - player18_filtered2['draftyear'] + 1
)

### Remove unused columns

In [None]:
# Drop the name, draft-year, season and team columns; nbapersonid and Year remain as
# the row keys from here on.
unused_columns = ['player', 'draftyear', 'season', 'nbateamid', 'team']
df = player18_filtered2.drop(columns=unused_columns)

### Impute missing values

In [None]:
# Count missing values per column before imputation.
df.isnull().sum()

In [None]:
from sklearn.impute import SimpleImputer

# Constant-fill imputer: every missing entry is replaced with 100.
imputer = SimpleImputer(fill_value=100, strategy='constant')

In [None]:
# Undrafted players have no draft pick; the imputer gives them the placeholder 100.
filled_draftpick = imputer.fit_transform(df[['draftpick']])
df['draftpick'] = filled_draftpick

In [None]:
# Re-check missing values per column now that draftpick has been filled.
df.isnull().sum()

In [None]:
# Fill every other missing value with 0, on the basis that those stats are truly zero.
df = df.fillna(value=0)

In [None]:
# Replace the index inherited from the filtered source frame with a fresh 0..n-1 range.
df = df.reset_index(drop=True)

In [None]:
# Rate and advanced stats: combined across a player's rows within one Year as a
# games-weighted average.
columns_to_aggregate = ['fgp3', 'fgp2', 'efg', 'ftp', 'PER', 'FTr', 'off_reb_pct', 'def_reb_pct', 'tot_reb_pct', 'ast_pct', 'stl_pct',
                        'blk_pct', 'tov_pct', 'usg', 'OWS', 'DWS', 'WS', 'OBPM', 'DBPM', 'BPM', 'VORP']

# Counting stats: summed across a player's rows within one Year.
columns_to_sum = ['games', 'games_start', 'mins', 'fgm', 'fga', 'fgm3', 'fga3', 'fgm2', 'fga2', 'ftm',
                  'fta', 'off_reb', 'def_reb', 'tot_reb', 'ast', 'steals', 'blocks', 'tov', 'tot_fouls', 'points']


def weighted_average(x, weight_column):
    """Return the mean of ``x`` weighted by ``df[weight_column]`` on the same rows.

    Parameters
    ----------
    x : pandas.Series
        Values of one column for one group. Its index labels are looked up in the
        module-level ``df`` to fetch the matching weights.
    weight_column : str
        Name of the column in ``df`` that supplies the weights.

    Returns
    -------
    float
        The weighted mean, or NaN when the weights sum to zero.
    """
    weights = df.loc[x.index, weight_column]
    total_weight = weights.sum()
    if total_weight == 0:
        # No games to weight by: report "undefined" explicitly instead of dividing by zero.
        return float('nan')
    return (x * weights).sum() / total_weight


def games_weighted_average(x):
    """Games-weighted mean of one stat column within a group."""
    return weighted_average(x, 'games')


# Map every output column to its aggregation rule. 'games' is only in columns_to_sum,
# so the rate-stat entries never need a special case for it.
aggregations = {column: games_weighted_average for column in columns_to_aggregate}
aggregations.update({column: 'sum' for column in columns_to_sum})

# One row per (player, Year).
df_aggregated = df.groupby(['nbapersonid', 'Year']).agg(aggregations).reset_index()

# Preview the result instead of printing the whole frame.
df_aggregated.head()


In [None]:
# Build the wide table from the per-(player, Year) aggregates, so that several rows
# for the same player and Year are combined with the intended rules (counting stats
# summed, rate stats games-weighted) rather than pivot_table's default mean.
key_columns = ['nbapersonid', 'Year']

# Columns the aggregation step does not cover (e.g. is_all_star, draftpick) keep the
# per-group mean, which is what the default pivot_table aggfunc produced for them.
leftover_columns = [column for column in df.columns if column not in df_aggregated.columns]
leftovers = df.groupby(key_columns, as_index=False)[leftover_columns].mean()
season_rows = df_aggregated.merge(leftovers, on=key_columns, how='left')

# Name the value columns explicitly instead of relying on column positions.
value_columns = [column for column in season_rows.columns if column not in key_columns]
pivoted_df = season_rows.pivot_table(index='nbapersonid', columns='Year', values=value_columns, fill_value=0)

# Flatten the (stat, Year) MultiIndex columns into names such as 'points_1'.
pivoted_df.columns = [f'{stat}_{year}' for stat, year in pivoted_df.columns]

# Make 'nbapersonid' a regular column again.
pivoted_df = pivoted_df.reset_index()

# Preview the result instead of printing the whole frame.
pivoted_df.head()

In [None]:
### merge true label to the dataset
# One row per distinct (nbapersonid, is_all_star) pair. Chaining drop_duplicates()
# avoids mutating a column slice of player18_filtered2 in place, which raised
# SettingWithCopyWarning.
player18_filtered3 = player18_filtered2[['nbapersonid', 'is_all_star']].drop_duplicates()

# Drop the per-year copies of is_all_star and draftpick; both are merged back in once
# per player further down. Rebinding with errors='ignore' (rather than inplace=True)
# keeps this cell re-runnable: a second run no longer raises KeyError.
per_year_copies = [
    f'{name}_{year}'
    for name in ('is_all_star', 'draftpick')
    for year in (1, 2, 3, 4)
]
pivoted_df = pivoted_df.drop(columns=per_year_copies, errors='ignore')

In [None]:
# Sanity check: number of missing draftpick values left in df.
df['draftpick'].isna().sum()

In [None]:
# Lookup table with one row per distinct (draftpick, nbapersonid) pair.
df_dp = df.loc[:, ['draftpick', 'nbapersonid']].drop_duplicates()

In [None]:
# Display the draft-pick lookup table.
df_dp

In [None]:
# Left-join the all-star label onto the wide feature table by player ID.
label_lookup = player18_filtered3[['is_all_star', 'nbapersonid']]
pivoted_df2 = pivoted_df.merge(label_lookup, how='left', on='nbapersonid')

In [None]:
# Left-join each player's draft pick onto the labelled table by player ID.
pivoted_df2 = pivoted_df2.merge(df_dp, how='left', on='nbapersonid')

In [None]:
# Count missing values per column in the merged table.
pivoted_df2.isna().sum()

In [None]:
# Write the final table to the working directory.
# NOTE(review): the row index is written as an unnamed first column; confirm that
# downstream readers expect it.
output_path = '2018data.csv'
pivoted_df2.to_csv(output_path)