In [None]:
import numpy as np
import pandas as pd
import psycopg
from sqlalchemy import create_engine, text
import dotenv
import os
import sqlite3
import mysql.connector
import pymongo


In [None]:
# Folder that holds both the .env file and the raw CSV. Set the
# DS6001_PROJECT_DIR environment variable to run this notebook on another
# machine; when it is unset, the original local path is used.
project_dir = os.getenv(
    'DS6001_PROJECT_DIR',
    "C:\\Users\\qaism\\OneDrive - University of Virginia\\Documents\\GitHub\\MSDS\\ds6001databases",
)

# Read the .env file so the database password never has to be typed into the notebook.
dotenv.load_dotenv(os.path.join(project_dir, '.env'))
postgres_password = os.getenv('POSTGRES_PASSWORD')
csv_path = os.path.join(project_dir, 'ASA All NBA Raw Data.csv')

# low_memory=False makes pandas read the file in one pass rather than in
# chunks, which avoids mixed-dtype guesses within a column.
nba = pd.read_csv(csv_path, low_memory=False)

In [None]:
# Allow up to 81 rows in displayed output (presumably enough for every column
# of `nba` once the preview below is transposed — confirm against the data).
pd.options.display.max_rows = 81

# First three records, flipped so each original column is shown as a row.
nba.iloc[:3].transpose()

## Database Normalization
### First normal form:

1. **All tables must have a primary key**: In this table, `game_id` and `player_id` together are unique on every row, and so they form the primary key.

2. **All the data must be atomic**: `Inactives` is non-atomic, because a single cell can list several inactive players.

3. **No repeating groups problem**: We can't solve the non-atomicity problem by creating separate columns if this leads to arbitrary ordering language in the column names (for example, `Inactive1`, `Inactive2`, etc.) and if it leads to a lot of missing data (there would be an `Inactive7` which would be missing any time a team has fewer than 7 inactive players). Instead, we drop `Inactives` from the table.

In [None]:
# `Inactives` is removed first, before anything is split off the main table.
nba = nba.drop(columns=['Inactives'])

# Pull the game-level and player-level attributes into their own tables,
# keeping one row per distinct combination of values.
game_columns = ['game_id', 'game_date', 'OT', 'season']
player_columns = ['player_id', 'player']
games = nba.loc[:, game_columns].drop_duplicates()
players = nba.loc[:, player_columns].drop_duplicates()

# The main table keeps only the keys (`game_id`, `player_id`) for those attributes.
nba = nba.drop(columns=['game_date', 'OT', 'season', 'player'])
nba.head()

### Functional Dependence
Let X and Y be columns in a data table. Y is functionally dependent on X if each value of X is associated with exactly one value of Y.

That's pretty abstract. So here are some guidelines that help me:

1. This use of "function" is the exact same as the concept of a function from algebra and pre-calculus. A correspondence f(x)=y is a function if each value of x has only one associated value of y.

2. X is either a primary key, or something that should be a primary key in another table.

For example, `game_date` (Y) is functionally dependent on `game_id` (X) because one `game_id` takes place on exactly one date.

### Second normal form:
In this table the primary key is a composite key consisting of two columns: `game_id` and `player_id`.

2NF is violated if any columns are functionally dependent on part of the primary key but not the entire primary key. This can only happen if the primary key is a composite key, that is, a key made up of more than one column.

In [None]:
# Attributes moved out of the player-level table into `team_game`.
team_stat_cols = ['H_A', 'Team_Score', 'Team_pace', 'Team_efg_pct',
                  'Team_tov_pct', 'Team_orb_pct', 'Team_ft_rate', 'Team_off_rtg',
                  'Opponent_Abbrev']

# Opponent statistics are removed from the player-level table and are not
# copied into `team_game` (presumably recoverable from the opponent's own
# `team_game` row via `Opponent_Abbrev` — confirm against the data).
opponent_stat_cols = ['Opponent_Score', 'Opponent_pace', 'Opponent_efg_pct',
                      'Opponent_tov_pct', 'Opponent_orb_pct', 'Opponent_ft_rate',
                      'Opponent_off_rtg']

# One row per distinct (game_id, Team_Abbrev) combination of team attributes.
team_game = nba[['game_id', 'Team_Abbrev'] + team_stat_cols].drop_duplicates()

# `Team_Abbrev` deliberately stays in `player_game`: together with `game_id` it
# is the foreign key into `team_game`. Without it no table would record which
# team a player appeared for in a given game, and the split would lose information.
player_game = nba.drop(columns=team_stat_cols + opponent_stat_cols)


In [None]:
# Give all four tables the same column-name convention: lower-case, with any
# '%' spelled out as '_pct'.
for table in (player_game, team_game, players, games):
    table.columns = [name.lower().replace('%', '_pct') for name in table.columns]

### Third normal form:
3NF is violated if there are "transitive dependencies", that is, functional dependence between columns when neither column is part of the primary key.

In [None]:
# Dialect and driver halves of the SQLAlchemy URL scheme.
# NOTE(review): the imports cell loads `psycopg`, not `psycopg2` — confirm the
# driver named here is the one actually installed.
dbms, connector = 'postgresql', 'psycopg2'

# Where the server is listening.
host, port = 'localhost', '5432'

# Credentials; the password was read from the environment in an earlier cell.
user, password = 'postgres', postgres_password

# Name of the database the normalized tables are written to.
database = 'nba'



In [None]:
# Connect to the default "postgres" database first: the target database
# cannot be connected to until it exists.
admin_engine_string = f'{dbms}+{connector}://{user}:{password}@{host}:{port}/postgres'

# CREATE DATABASE cannot run inside a transaction block, so this engine uses
# driver-level autocommit instead of issuing raw COMMIT statements.
admin_engine = create_engine(admin_engine_string, isolation_level='AUTOCOMMIT')
with admin_engine.connect() as conn:
    # Only create the database when it is missing, so re-running this cell
    # (or the whole notebook) does not fail with a "database already exists" error.
    already_exists = conn.execute(
        text("SELECT 1 FROM pg_database WHERE datname = :name"),
        {"name": database},
    ).scalar() is not None
    if not already_exists:
        # Quote the identifier so the created name matches the name used in
        # the connection URL below exactly, including letter case.
        conn.execute(text(f'CREATE DATABASE "{database}"'))

# Release the connections held against the "postgres" database.
admin_engine.dispose()

# Update engine string to use the nba database
engine_string = f'{dbms}+{connector}://{user}:{password}@{host}:{port}/{database}'
engine = create_engine(engine_string)

In [None]:
# Table name -> DataFrame, written in this order. Each table is replaced if it
# already exists, and rows are inserted in batches of 1000 without the index.
tables_to_load = {
    'player_game': player_game,
    'team_game': team_game,
    'players': players,
    'games': games,
}

for table_name, frame in tables_to_load.items():
    rows_written = frame.to_sql(
        table_name,
        con=engine,
        chunksize=1000,
        if_exists='replace',
        index=False,
    )

# Echo the return value of the final (games) write as the cell output.
rows_written



In [None]:
# Sanity check: read the `games` table back from the database and show its
# first rows. Ending the cell with the expression (rather than print) lets the
# notebook render the DataFrame as a formatted table.
myquery = 'SELECT * FROM games'
pd.read_sql_query(myquery, con=engine).head()