# Data Cleaning & Loading SQL

In [1]:
import pandas as pd
import datetime
import csv

from DATA225utils import make_connection, dataframe_query 

In [2]:
# make_connection (project helper) is given nba.ini as its configuration file;
# the resulting connection and cursor are used further down to create the
# database tables and insert the cleaned data from Python.
conn = make_connection(config_file = 'nba.ini')
cursor = conn.cursor()

In [3]:
# Read the five raw CSV exports in one pass; low_memory=False makes pandas
# parse each file in a single chunk so column dtypes are inferred consistently.
csv_sources = (
    "data/games_details.csv",
    "data/games.csv",
    "data/players.csv",
    "data/ranking.csv",
    "data/teams.csv",
)
games_details, games, players, ranking, teams_df = [
    pd.read_csv(csv_path, low_memory=False) for csv_path in csv_sources
]

In [4]:
# (rows, columns) of every raw table, one per line.
print(*(frame.shape for frame in (games_details, games, players, ranking, teams_df)), sep="\n")

(668628, 29)
(26651, 21)
(7228, 4)
(210342, 13)
(30, 14)


In [5]:
# Keep only the first row for each intended primary key of the SQL tables.
# Note that players keeps a single row (team/season) per PLAYER_ID.
games_details = games_details.drop_duplicates(subset=["GAME_ID", "TEAM_ID", "PLAYER_ID"], keep="first")
games = games.drop_duplicates(subset=["GAME_ID"], keep="first")
players = players.drop_duplicates(subset=["PLAYER_ID"], keep="first")
ranking = ranking.drop_duplicates(subset=["TEAM_ID", "STANDINGSDATE"], keep="first")
teams_df = teams_df.drop_duplicates(subset=["TEAM_ID"], keep="first")

In [6]:
# (rows, columns) of every table after de-duplication, one per line.
print(*(frame.shape for frame in (games_details, games, players, ranking, teams_df)), sep="\n")

(668339, 29)
(26622, 21)
(1769, 4)
(210313, 13)
(30, 14)


In [7]:
# Percentage of missing values in each games_details column.
games_details.isna().sum()/games_details.shape[0]*100

GAME_ID               0.000000
TEAM_ID               0.000000
TEAM_ABBREVIATION     0.000000
TEAM_CITY             0.000000
PLAYER_ID             0.000000
PLAYER_NAME           0.000000
NICKNAME             92.064357
START_POSITION       61.749352
COMMENT              83.590962
MIN                  16.409188
FGM                  16.409188
FGA                  16.409188
FG_PCT               16.409188
FG3M                 16.409188
FG3A                 16.409188
FG3_PCT              16.409188
FTM                  16.409188
FTA                  16.409188
FT_PCT               16.409188
OREB                 16.409188
DREB                 16.409188
REB                  16.409188
AST                  16.409188
STL                  16.409188
BLK                  16.409188
TO                   16.409188
PF                   16.409188
PTS                  16.409188
PLUS_MINUS           19.949457
dtype: float64

### The missing stats mean that the player simply didn't play in the match. So we will drop NICKNAME, START_POSITION and COMMENT, which have a lot of missing values and no importance in the analysis (PLUS_MINUS is dropped as well)

In [8]:
# Remove the sparse / unused columns; rebinding the name instead of mutating in place.
games_details = games_details.drop(columns=["NICKNAME", "START_POSITION", "COMMENT", "PLUS_MINUS"])

In [9]:
# Fraction (not percentage) of missing values in each games column.
games.isna().sum()/games.shape[0]

GAME_DATE_EST       0.000000
GAME_ID             0.000000
GAME_STATUS_TEXT    0.000000
HOME_TEAM_ID        0.000000
VISITOR_TEAM_ID     0.000000
SEASON              0.000000
TEAM_ID_home        0.000000
PTS_home            0.003719
FG_PCT_home         0.003719
FT_PCT_home         0.003719
FG3_PCT_home        0.003719
AST_home            0.003719
REB_home            0.003719
TEAM_ID_away        0.000000
PTS_away            0.003719
FG_PCT_away         0.003719
FT_PCT_away         0.003719
FG3_PCT_away        0.003719
AST_away            0.003719
REB_away            0.003719
HOME_TEAM_WINS      0.000000
dtype: float64

In [10]:
# Number of games per date whose away-team points are missing.
games[games.PTS_away.isna()].GAME_DATE_EST.value_counts()

2003-10-14    10
2003-10-24     9
2003-10-11     9
2003-10-17     9
2003-10-22     8
2003-10-18     7
2003-10-16     6
2003-10-23     6
2003-10-21     5
2003-10-19     5
2003-10-10     5
2003-10-09     4
2003-10-15     4
2003-10-20     4
2003-10-12     3
2003-10-13     2
2003-10-08     2
2003-10-07     1
Name: GAME_DATE_EST, dtype: int64

### There is no data even on the official NBA site for these data points. As they are all concentrated in October 2003, we will just drop the 2003 data and start from 2004.

In [11]:
# Absolute number of missing values in each games column.
games.isna().sum()

GAME_DATE_EST        0
GAME_ID              0
GAME_STATUS_TEXT     0
HOME_TEAM_ID         0
VISITOR_TEAM_ID      0
SEASON               0
TEAM_ID_home         0
PTS_home            99
FG_PCT_home         99
FT_PCT_home         99
FG3_PCT_home        99
AST_home            99
REB_home            99
TEAM_ID_away         0
PTS_away            99
FG_PCT_away         99
FT_PCT_away         99
FG3_PCT_away        99
AST_away            99
REB_away            99
HOME_TEAM_WINS       0
dtype: int64

In [12]:
# Keep games from 1 January 2004 onwards. GAME_DATE_EST holds ISO-formatted
# (YYYY-MM-DD) strings, so plain string comparison orders them chronologically.
# `>=` keeps games played on 2004-01-01 itself (a strict `>` would drop them,
# contradicting "start from 2004"); .copy() detaches the result from the
# original frame so later modifications never act on a slice.
games = games[games.GAME_DATE_EST >= '2004-01-01'].copy()

In [13]:
# Percentage of missing values in each players column.
players.isna().sum()/players.shape[0]*100

PLAYER_NAME    0.0
TEAM_ID        0.0
PLAYER_ID      0.0
SEASON         0.0
dtype: float64

### Doing the same for the ranking table, as games and ranking are the only two tables with a date column

In [14]:
# Keep standings from 1 January 2004 onwards. STANDINGSDATE holds ISO-formatted
# (YYYY-MM-DD) strings, so string comparison orders them chronologically.
# `>=` keeps the 2004-01-01 standings (a strict `>` would drop them).
# .copy() is required because this frame is modified later in the notebook
# (a column is dropped and CONFERENCE is overwritten); without it those
# writes would target a slice of the original frame.
ranking = ranking[ranking.STANDINGSDATE >= '2004-01-01'].copy()

In [15]:
# Percentage of missing values in each ranking column.
ranking.isna().sum()/ranking.shape[0]*100

TEAM_ID           0.000000
LEAGUE_ID         0.000000
SEASON_ID         0.000000
STANDINGSDATE     0.000000
CONFERENCE        0.000000
TEAM              0.000000
G                 0.000000
W                 0.000000
L                 0.000000
W_PCT             0.000000
HOME_RECORD       0.000000
ROAD_RECORD       0.000000
RETURNTOPLAY     98.078183
dtype: float64

In [16]:
# RETURNTOPLAY is almost entirely missing, so it is removed from the table.
ranking = ranking.drop(columns=["RETURNTOPLAY"])

In [17]:
# Percentage of missing values in each teams column.
teams_df.isna().sum()/teams_df.shape[0]*100

LEAGUE_ID              0.000000
TEAM_ID                0.000000
MIN_YEAR               0.000000
MAX_YEAR               0.000000
ABBREVIATION           0.000000
NICKNAME               0.000000
YEARFOUNDED            0.000000
CITY                   0.000000
ARENA                  0.000000
ARENACAPACITY         13.333333
OWNER                  0.000000
GENERALMANAGER         0.000000
HEADCOACH              0.000000
DLEAGUEAFFILIATION     0.000000
dtype: float64

Let ArenaCapacity be null

## Loading Players

In [18]:
# Preview the first rows of the de-duplicated players table.
players.head()

Unnamed: 0,PLAYER_NAME,TEAM_ID,PLAYER_ID,SEASON
0,Royce O'Neale,1610612762,1626220,2019
1,Bojan Bogdanovic,1610612762,202711,2019
2,Rudy Gobert,1610612762,203497,2019
3,Donovan Mitchell,1610612762,1628378,2019
4,Mike Conley,1610612762,201144,2019


In [19]:
# Per-season player records; used below to look up each player's height and
# draft year by name.
df = pd.read_csv("data/all_seasons.csv")

In [20]:
# Preview the first rows of the all_seasons data.
df.head()

Unnamed: 0.1,Unnamed: 0,player_name,team_abbreviation,age,player_height,player_weight,college,country,draft_year,draft_round,...,pts,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct,season
0,0,Dennis Rodman,CHI,36.0,198.12,99.79024,Southeastern Oklahoma State,USA,1986,2,...,5.7,16.1,3.1,16.1,0.186,0.323,0.1,0.479,0.113,1996-97
1,1,Dwayne Schintzius,LAC,28.0,215.9,117.93392,Florida,USA,1990,1,...,2.3,1.5,0.3,12.3,0.078,0.151,0.175,0.43,0.048,1996-97
2,2,Earl Cureton,TOR,39.0,205.74,95.25432,Detroit Mercy,USA,1979,3,...,0.8,1.0,0.4,-2.1,0.105,0.102,0.103,0.376,0.148,1996-97
3,3,Ed O'Bannon,DAL,24.0,203.2,100.697424,UCLA,USA,1995,1,...,3.7,2.3,0.6,-8.7,0.06,0.149,0.167,0.399,0.077,1996-97
4,4,Ed Pinckney,MIA,34.0,205.74,108.86208,Villanova,USA,1985,1,...,2.4,2.4,0.2,-11.2,0.109,0.179,0.127,0.611,0.04,1996-97


In [21]:
def _first_player_value(player_name, column, seasons=None):
    """Return the first value of `column` recorded for `player_name`.

    Parameters
    ----------
    player_name : str
        Name to match exactly against the ``player_name`` column.
    column : str
        Column of the reference table to read the value from.
    seasons : pandas.DataFrame, optional
        Reference table with a ``player_name`` column. Defaults to the
        notebook-level ``df`` (the all_seasons data).

    Returns
    -------
    The value from the first matching row, or ``None`` when the player does
    not appear in the table.

    Raises
    ------
    KeyError
        If the table lacks ``player_name`` or ``column``. A missing column is
        a real error and is no longer silently turned into ``None``.
    """
    table = df if seasons is None else seasons
    matches = table.loc[table["player_name"] == player_name, column]
    if matches.empty:
        # An unknown player is the only expected "no result" case.
        return None
    return matches.tolist()[0]


def get_height(player_name, seasons=None):
    """Return the player's ``player_height`` from the seasons table, or None if unknown."""
    return _first_player_value(player_name, "player_height", seasons)


def get_draft_year(player_name, seasons=None):
    """Return the player's ``draft_year`` from the seasons table, or None if unknown."""
    return _first_player_value(player_name, "draft_year", seasons)

In [22]:
# Enrich players with height and draft year looked up by player name.
player_names = players["PLAYER_NAME"]
players["HEIGHT"] = player_names.apply(get_height)
players["DRAFTYEAR"] = player_names.apply(get_draft_year)

In [23]:
players.isna().sum()

PLAYER_NAME      0
TEAM_ID          0
PLAYER_ID        0
SEASON           0
HEIGHT         347
DRAFTYEAR      347
dtype: int64

In [24]:
# from nba_api.stats.endpoints import CommonPlayerInfo
# import time

# new_list = []

# for i in players[players.HEIGHT.isna()].PLAYER_ID:
#     time.sleep(.5)
#     try:
#         new_list.append((i, CommonPlayerInfo(i).get_data_frames()[0][["HEIGHT", "DRAFT_YEAR"]]))
#     except:
#         new_list.append((i, None))

In [25]:
# len(new_list)

In [26]:
# Players for whom the name lookup found no height.
players[players.HEIGHT.isna()]

Unnamed: 0,PLAYER_NAME,TEAM_ID,PLAYER_ID,SEASON,HEIGHT,DRAFTYEAR
428,Charles Brown Jr.,1610612737,1629718,2019,,
456,Michael Frazier,1610612745,1626187,2019,,
478,Christ Koumadje,1610612755,1629746,2019,,
479,Isaiah Miles,1610612755,1627776,2019,,
485,CJ Massinburg,1610612751,1629747,2019,,
...,...,...,...,...,...,...
7217,David Monds,1610612747,201776,2009,,
7223,Lanny Smith,1610612758,201831,2009,,
7224,Warren Carter,1610612752,201999,2009,,
7225,Bennet Davis,1610612751,201834,2009,,


In [27]:
# Create the players table and load one row per player.
# height is FLOAT (not INT): the looked-up heights are fractional values such
# as 198.12, which an integer column would round.
sql_create = """
    CREATE TABLE players (player_name VARCHAR(255) NOT NULL,
                          team_id BIGINT NOT NULL,
                          player_id BIGINT NOT NULL,
                          season INT NOT NULL,
                          height FLOAT,
                          draftyear VARCHAR(20),
                          PRIMARY KEY (player_id))
"""

cursor.execute(sql_create)

sql_insert = """
    INSERT INTO players
    VALUES (%s, %s, %s, %s, %s, %s)
"""

# Column order must match the table definition above.
player_columns = ['PLAYER_NAME', 'TEAM_ID', 'PLAYER_ID', 'SEASON', 'HEIGHT', 'DRAFTYEAR']

# Players without a height / draft year hold NaN; convert those to None so the
# driver stores SQL NULL instead of receiving a float NaN it cannot insert.
player_rows = players[player_columns].astype(object)
player_rows = player_rows.where(player_rows.notna(), None)

for values in player_rows.itertuples(index=False, name=None):
    cursor.execute(sql_insert, values)

conn.commit()

## Loading Teams

In [28]:
# Preview the first rows of the teams table.
teams_df.head()

Unnamed: 0,LEAGUE_ID,TEAM_ID,MIN_YEAR,MAX_YEAR,ABBREVIATION,NICKNAME,YEARFOUNDED,CITY,ARENA,ARENACAPACITY,OWNER,GENERALMANAGER,HEADCOACH,DLEAGUEAFFILIATION
0,0,1610612737,1949,2019,ATL,Hawks,1949,Atlanta,State Farm Arena,18729.0,Tony Ressler,Travis Schlenk,Lloyd Pierce,Erie Bayhawks
1,0,1610612738,1946,2019,BOS,Celtics,1946,Boston,TD Garden,18624.0,Wyc Grousbeck,Danny Ainge,Brad Stevens,Maine Red Claws
2,0,1610612740,2002,2019,NOP,Pelicans,2002,New Orleans,Smoothie King Center,,Tom Benson,Trajan Langdon,Alvin Gentry,No Affiliate
3,0,1610612741,1966,2019,CHI,Bulls,1966,Chicago,United Center,21711.0,Jerry Reinsdorf,Gar Forman,Jim Boylen,Windy City Bulls
4,0,1610612742,1980,2019,DAL,Mavericks,1980,Dallas,American Airlines Center,19200.0,Mark Cuban,Donnie Nelson,Rick Carlisle,Texas Legends


### Changing New Orleans Pelicans conference to West (it changed from East to West in the 2004 season)

In [29]:
# TEAM_ID 1610612740 is the New Orleans Pelicans; label all of their standings rows as West.
is_pelicans = ranking["TEAM_ID"] == 1610612740
ranking.loc[is_pelicans, "CONFERENCE"] = "West"

In [30]:
# Attach each team's conference, taken from the distinct (TEAM_ID, CONFERENCE)
# pairs present in the standings; teams without standings keep a missing value.
team_conferences = ranking[["TEAM_ID", "CONFERENCE"]].drop_duplicates()
teams_df = teams_df.merge(team_conferences, how="left", on="TEAM_ID")

In [31]:
# nba_api's static team data is used to look up the state of each team.
from nba_api.stats.static import teams

# Take the "state" field of the first team returned for each nickname.
# NOTE(review): indexing [0] presumably assumes every nickname resolves to at
# least one team; an unmatched nickname would raise IndexError - confirm.
teams_df["STATE"] = teams_df.NICKNAME.apply(lambda x: teams.find_teams_by_nickname(x)[0]["state"])

In [32]:
# List the columns available in the all_seasons data.
df.columns

Index(['Unnamed: 0', 'player_name', 'team_abbreviation', 'age',
       'player_height', 'player_weight', 'college', 'country', 'draft_year',
       'draft_round', 'draft_number', 'gp', 'pts', 'reb', 'ast', 'net_rating',
       'oreb_pct', 'dreb_pct', 'usg_pct', 'ts_pct', 'ast_pct', 'season'],
      dtype='object')

In [33]:
# Preview the teams table with the added CONFERENCE and STATE columns.
teams_df.head()

Unnamed: 0,LEAGUE_ID,TEAM_ID,MIN_YEAR,MAX_YEAR,ABBREVIATION,NICKNAME,YEARFOUNDED,CITY,ARENA,ARENACAPACITY,OWNER,GENERALMANAGER,HEADCOACH,DLEAGUEAFFILIATION,CONFERENCE,STATE
0,0,1610612737,1949,2019,ATL,Hawks,1949,Atlanta,State Farm Arena,18729.0,Tony Ressler,Travis Schlenk,Lloyd Pierce,Erie Bayhawks,East,Georgia
1,0,1610612738,1946,2019,BOS,Celtics,1946,Boston,TD Garden,18624.0,Wyc Grousbeck,Danny Ainge,Brad Stevens,Maine Red Claws,East,Massachusetts
2,0,1610612740,2002,2019,NOP,Pelicans,2002,New Orleans,Smoothie King Center,,Tom Benson,Trajan Langdon,Alvin Gentry,No Affiliate,West,Louisiana
3,0,1610612741,1966,2019,CHI,Bulls,1966,Chicago,United Center,21711.0,Jerry Reinsdorf,Gar Forman,Jim Boylen,Windy City Bulls,East,Illinois
4,0,1610612742,1980,2019,DAL,Mavericks,1980,Dallas,American Airlines Center,19200.0,Mark Cuban,Donnie Nelson,Rick Carlisle,Texas Legends,West,Texas


In [34]:
# Create the teams table and load one row per team.
sql_create = """
    CREATE TABLE teams (league_id int NOT NULL,
                        team_id BIGINT NOT NULL,
                        min_year INT NOT NULL,
                        max_year INT NOT NULL,
                        abbreviation VARCHAR(5) NOT NULL,
                        nickname VARCHAR(25) NOT NULL,
                        year_founded INT NOT NULL,
                        city VARCHAR(15) NOT NULL,
                        arena VARCHAR(50) NOT NULL,
                        arena_capacity BIGINT,
                        owner VARCHAR(40) NOT NULL,
                        general_manager VARCHAR(40) NOT NULL,
                        head_coach VARCHAR(40) NOT NULL,
                        d_league_affiliation VARCHAR(40) NOT NULL,
                        state VARCHAR(25) NOT NULL,
                        conference VARCHAR(4) NOT NULL,
                        PRIMARY KEY (team_id))
"""

cursor.execute(sql_create)

sql_insert = """
    INSERT INTO teams
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""

# Column order must match the table definition above (STATE before CONFERENCE).
team_columns = ['LEAGUE_ID', 'TEAM_ID', 'MIN_YEAR', 'MAX_YEAR',
                'ABBREVIATION', 'NICKNAME', 'YEARFOUNDED', 'CITY',
                'ARENA', 'ARENACAPACITY', 'OWNER', 'GENERALMANAGER',
                'HEADCOACH', 'DLEAGUEAFFILIATION', 'STATE', 'CONFERENCE']

# ARENACAPACITY is missing for some teams and is meant to stay NULL; convert
# NaN to None so the driver stores SQL NULL instead of receiving a float NaN.
team_rows = teams_df[team_columns].astype(object)
team_rows = team_rows.where(team_rows.notna(), None)

for values in team_rows.itertuples(index=False, name=None):
    cursor.execute(sql_insert, values)

conn.commit()

## Loading Ranking

In [36]:
# Display the all_seasons data loaded earlier.
df

Unnamed: 0.1,Unnamed: 0,player_name,team_abbreviation,age,player_height,player_weight,college,country,draft_year,draft_round,...,pts,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct,season
0,0,Dennis Rodman,CHI,36.0,198.12,99.790240,Southeastern Oklahoma State,USA,1986,2,...,5.7,16.1,3.1,16.1,0.186,0.323,0.100,0.479,0.113,1996-97
1,1,Dwayne Schintzius,LAC,28.0,215.90,117.933920,Florida,USA,1990,1,...,2.3,1.5,0.3,12.3,0.078,0.151,0.175,0.430,0.048,1996-97
2,2,Earl Cureton,TOR,39.0,205.74,95.254320,Detroit Mercy,USA,1979,3,...,0.8,1.0,0.4,-2.1,0.105,0.102,0.103,0.376,0.148,1996-97
3,3,Ed O'Bannon,DAL,24.0,203.20,100.697424,UCLA,USA,1995,1,...,3.7,2.3,0.6,-8.7,0.060,0.149,0.167,0.399,0.077,1996-97
4,4,Ed Pinckney,MIA,34.0,205.74,108.862080,Villanova,USA,1985,1,...,2.4,2.4,0.2,-11.2,0.109,0.179,0.127,0.611,0.040,1996-97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12300,12300,Markieff Morris,MIA,32.0,205.74,111.130040,Kansas,USA,2011,1,...,7.6,2.6,1.4,4.5,0.059,0.089,0.197,0.547,0.116,2021-22
12301,12301,Markelle Fultz,ORL,24.0,193.04,94.800728,Washington,USA,2017,1,...,10.8,2.7,5.5,-5.3,0.010,0.116,0.265,0.517,0.448,2021-22
12302,12302,Marcus Smart,BOS,28.0,193.04,99.790240,Oklahoma State,USA,2014,1,...,12.1,3.8,5.9,9.3,0.018,0.093,0.179,0.540,0.245,2021-22
12303,12303,Marcus Garrett,MIA,23.0,195.58,92.986360,Kansas,USA,Undrafted,Undrafted,...,1.1,1.9,0.6,5.8,0.072,0.108,0.086,0.280,0.069,2021-22


In [37]:
# Preview the first rows of the cleaned ranking table.
ranking.head()

Unnamed: 0,TEAM_ID,LEAGUE_ID,SEASON_ID,STANDINGSDATE,CONFERENCE,TEAM,G,W,L,W_PCT,HOME_RECORD,ROAD_RECORD
0,1610612743,0,22022,2022-12-22,West,Denver,30,19,11,0.633,10-3,9-8
1,1610612763,0,22022,2022-12-22,West,Memphis,30,19,11,0.633,13-2,6-9
2,1610612740,0,22022,2022-12-22,West,New Orleans,31,19,12,0.613,13-4,6-8
3,1610612756,0,22022,2022-12-22,West,Phoenix,32,19,13,0.594,14-4,5-9
4,1610612746,0,22022,2022-12-22,West,LA Clippers,33,19,14,0.576,11-7,8-7


In [38]:
# Create the ranking table (one row per team per standings date) and fill it.
sql_create = """
                    CREATE TABLE ranking (team_id BIGINT NOT NULL,
                                          league_id INT NOT NULL,
                                          season_id INT NOT NULL,
                                          standings_date DATE NOT NULL,
                                          conference VARCHAR(4) NOT NULL,
                                          games_played INT NOT NULL,
                                          games_won INT NOT NULL,
                                          games_lost INT NOT NULL,
                                          win_pct FLOAT NOT NULL,
                                          home_record VARCHAR(10) NOT NULL,
                                          road_record VARCHAR(10) NOT NULL,
                                          PRIMARY KEY(team_id, standings_date)
                    )
                """
cursor.execute(sql_create)

sql_insert = """
            INSERT INTO ranking
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            """

# Source columns in table-column order; the TEAM name column is not loaded.
ranking_fields = ['TEAM_ID', 'LEAGUE_ID', 'SEASON_ID', 'STANDINGSDATE',
                  'CONFERENCE', 'G', 'W',
                  'L', 'W_PCT', 'HOME_RECORD', 'ROAD_RECORD']

for _, standing in ranking.iterrows():
    cursor.execute(sql_insert, tuple(standing[field] for field in ranking_fields))

conn.commit()

## Loading Games

In [39]:
# Preview the first rows of the cleaned games table.
games.head()

Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,...,AST_home,REB_home,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,2022-12-22,22200477,Final,1610612740,1610612759,2022,1610612740,126.0,0.484,0.926,...,25.0,46.0,1610612759,117.0,0.478,0.815,0.321,23.0,44.0,1
1,2022-12-22,22200478,Final,1610612762,1610612764,2022,1610612762,120.0,0.488,0.952,...,16.0,40.0,1610612764,112.0,0.561,0.765,0.333,20.0,37.0,1
2,2022-12-21,22200466,Final,1610612739,1610612749,2022,1610612739,114.0,0.482,0.786,...,22.0,37.0,1610612749,106.0,0.47,0.682,0.433,20.0,46.0,1
3,2022-12-21,22200467,Final,1610612755,1610612765,2022,1610612755,113.0,0.441,0.909,...,27.0,49.0,1610612765,93.0,0.392,0.735,0.261,15.0,46.0,1
4,2022-12-21,22200468,Final,1610612737,1610612741,2022,1610612737,108.0,0.429,1.0,...,22.0,47.0,1610612741,110.0,0.5,0.773,0.292,20.0,47.0,0


In [40]:
# Create the games table (one row per game) and fill it; the per-team box
# score columns of the games frame are not part of this table.
sql_create = """
                    CREATE TABLE games (game_date DATE NOT NULL,
                                          game_id BIGINT NOT NULL,
                                          game_status_text VARCHAR(6) NOT NULL,
                                          home_team_id BIGINT NOT NULL,
                                          visitor_team_id BIGINT NOT NULL,
                                          season INT NOT NULL,
                                          home_team_wins INT NOT NULL,
                                          PRIMARY KEY(game_id)
                    )
                """
cursor.execute(sql_create)

sql_insert = """
            INSERT INTO games
            VALUES (%s, %s, %s, %s, %s, %s, %s)
            """

# Source columns in table-column order.
game_fields = ['GAME_DATE_EST', 'GAME_ID', 'GAME_STATUS_TEXT', 'HOME_TEAM_ID',
               'VISITOR_TEAM_ID', 'SEASON', 'HOME_TEAM_WINS']

for _, game in games.iterrows():
    cursor.execute(sql_insert, tuple(game[field] for field in game_fields))

conn.commit()

In [41]:
# Create team_stats (one row per team per game) from the home/away columns of
# the games frame.
# fg_pct / ft_pct / fg3_pct are FLOAT: the source values are fractions such as
# 0.484, which an INT column would round to 0 or 1.
sql_create = """
    CREATE TABLE team_stats (game_date DATE NOT NULL,
                             game_id BIGINT NOT NULL,
                             team_id BIGINT NOT NULL,
                             pts INT NOT NULL,
                             fg_pct FLOAT NOT NULL,
                             ft_pct FLOAT NOT NULL,
                             fg3_pct FLOAT NOT NULL,
                             ast INT NOT NULL,
                             reb INT NOT NULL,
                             PRIMARY KEY(game_id, team_id)
    )
"""

cursor.execute(sql_create)

sql_insert = """
    INSERT INTO team_stats
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
"""

# Per-team statistics, in table-column order; each exists in the games frame
# with a "_home" and an "_away" suffix.
stat_fields = ('PTS', 'FG_PCT', 'FT_PCT', 'FG3_PCT', 'AST', 'REB')

# Same insertion order as before: every home-team row first, then every
# away-team row.
for side in ('home', 'away'):
    for index, row in games.iterrows():
        values = (row['GAME_DATE_EST'], row['GAME_ID'], row['TEAM_ID_' + side],
                  *(row[field + '_' + side] for field in stat_fields))
        cursor.execute(sql_insert, values)

conn.commit()

## Loading Games Details

In [42]:
# Preview the first rows of the cleaned games_details table.
games_details.head()

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,MIN,FGM,FGA,FG_PCT,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS
0,22200477,1610612759,SAS,San Antonio,1629641,Romeo Langford,18:06,1.0,1.0,1.0,...,0.0,1.0,1.0,2.0,0.0,1.0,0.0,2.0,5.0,2.0
1,22200477,1610612759,SAS,San Antonio,1631110,Jeremy Sochan,31:01,7.0,14.0,0.5,...,0.7,6.0,3.0,9.0,6.0,1.0,0.0,2.0,1.0,23.0
2,22200477,1610612759,SAS,San Antonio,1627751,Jakob Poeltl,21:42,6.0,9.0,0.667,...,1.0,1.0,3.0,4.0,1.0,1.0,0.0,2.0,4.0,13.0
3,22200477,1610612759,SAS,San Antonio,1630170,Devin Vassell,30:20,4.0,13.0,0.308,...,1.0,0.0,9.0,9.0,5.0,3.0,0.0,2.0,1.0,10.0
4,22200477,1610612759,SAS,San Antonio,1630200,Tre Jones,27:44,7.0,12.0,0.583,...,1.0,0.0,2.0,2.0,3.0,0.0,0.0,2.0,2.0,19.0


In [43]:
def convert_to_sec(x):
    """Convert a playing-time string to whole seconds.

    Accepts "MM:SS" (each part may carry a decimal fraction, which is
    truncated) or a bare minutes value. Missing values are returned unchanged.
    """
    if pd.isnull(x):
        return x

    parts = x.split(":")
    minutes = int(float(parts[0]))
    if len(parts) == 1:
        # No seconds component: the value is minutes only.
        return minutes * 60
    return minutes * 60 + int(float(parts[1]))

In [44]:
# Playing time in seconds, derived from the MIN column.
games_details["seconds"] = games_details["MIN"].apply(convert_to_sec)

In [45]:
# Create player_game_stats (one row per player per game) and fill it from
# games_details.
# FG_PCT / FG3_PCT / FT_PCT are FLOAT: the source values are fractions such as
# 0.667, which an INT column would round to 0 or 1.
sql_create = """
    CREATE TABLE player_game_stats (game_id BIGINT NOT NULL,
                                    team_id BIGINT NOT NULL,
                                    player_id BIGINT NOT NULL,
                                    seconds BIGINT,
                                    FGM INT,
                                    FGA INT,
                                    FG_PCT FLOAT,
                                    FG3M INT,
                                    FG3A INT,
                                    FG3_PCT FLOAT,
                                    FTM INT,
                                    FTA INT,
                                    FT_PCT FLOAT,
                                    OREB INT,
                                    DREB INT,
                                    REB INT,
                                    AST INT,
                                    STL INT,
                                    BLK INT,
                                    TURNOVERS INT,
                                    PF INT,
                                    PTS INT,
                                    PRIMARY KEY(game_id, player_id)
    )
"""

cursor.execute(sql_create)

sql_insert = """
    INSERT INTO player_game_stats
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""

# Source columns in table-column order; the frame's TO column feeds TURNOVERS.
stat_columns = ['GAME_ID', 'TEAM_ID', 'PLAYER_ID', 'seconds',
                'FGM', 'FGA', 'FG_PCT', 'FG3M',
                'FG3A', 'FG3_PCT', 'FTM', 'FTA',
                'FT_PCT', 'OREB', 'DREB', 'REB',
                'AST', 'STL', 'BLK', 'TO',
                'PF', 'PTS']

# Players who did not play have NaN statistics; convert those to None so the
# driver stores SQL NULL instead of receiving a float NaN it cannot insert.
stat_rows = games_details[stat_columns].astype(object)
stat_rows = stat_rows.where(stat_rows.notna(), None)

# itertuples avoids building a Series per row, which matters for a table of
# this size.
for values in stat_rows.itertuples(index=False, name=None):
    cursor.execute(sql_insert, values)

conn.commit()

In [46]:
# All tables are loaded and committed; release the cursor and the connection.
cursor.close()
conn.close()