# Exploratory Data Analysis Notebook

## Setup

In [1]:
# Imports
import sys
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from category_encoders import OneHotEncoder

if '..' not in sys.path:
    sys.path.insert(0, '..')
from src.loaders.config import Config
from src.loaders.database_helpers import get_engine

In [4]:
# Create config object from the project's config directory (path is relative to this notebook)
# NOTE(review): `config` is not referenced again in the cells shown here — confirm it is still needed
config = Config('../config')

# Database connection: SQLite file addressed through the pysqlite driver
url = 'sqlite+pysqlite:///../data/final/ranked_boardgames.sqlite3' 
# Alternative database URL, currently disabled:
# url = 'sqlite+pysqlite:///../data/final/boardgames.sqlite3'
# get_engine is a project helper; presumably returns a SQLAlchemy engine for `url` — TODO confirm
db_connect = get_engine(url)

## Load Data

In [5]:
# Read the source tables into dataframes.
# The game table is indexed by its ID column; the mechanic and category
# tables are read with the default integer index.
df_games = pd.read_sql_table('game', db_connect, index_col='ID')
df_games_mech, df_games_cat = (
    pd.read_sql_table(table_name, db_connect)
    for table_name in ('game_mechanic', 'game_category')
)

## Transform Data

In [6]:
def encode_class(df: pd.DataFrame, name: str, key: str = 'GameID') -> pd.DataFrame:
    """One-hot encode a classification column and collapse to one row per game.

    Parameters
    ----------
    df : pd.DataFrame
        Table containing the ``key`` column and the ``name`` column, with
        possibly several rows per ``key`` value.
    name : str
        Column to one-hot encode (e.g. ``'MechanicID'``).
    key : str, default 'GameID'
        Column to group the encoded rows by.

    Returns
    -------
    pd.DataFrame
        Indexed by ``key``. Each column holds the per-group maximum, so an
        indicator column is 1 when any row of that group carried the class.
    """
    # category_encoders documents `cols` as a list of column names.
    ohe = OneHotEncoder(cols=[name], return_df=True)
    encoded = ohe.fit_transform(df)
    # Call the groupby method directly instead of agg(max): passing the builtin
    # `max` raises a FutureWarning on pandas >= 2.1 and is slated to change
    # behaviour, whereas .max() is the stable spelling of the same reduction.
    return encoded.groupby(key).max()

In [7]:
# Encode mechanics: one-hot encode MechanicID and reduce to one row per GameID
df_mech = encode_class(df_games_mech, 'MechanicID')

In [8]:
# Encode categories: one-hot encode CategoryID and reduce to one row per GameID
df_cat = encode_class(df_games_cat, 'CategoryID')

In [9]:
# Build the combined table in one chain: drop the game columns that are not
# carried forward, then inner-join the encoded mechanics and categories on
# the game ID (inner joins keep only games present in every table).
df = (
    df_games
    .drop(columns=['Title', 'ReleaseYear', 'BayesRating'])
    .merge(df_mech, how='inner', left_index=True, right_on='GameID')
    .merge(df_cat, how='inner', left_index=True, right_on='GameID')
)

In [10]:
# Display the merged dataframe (game attributes plus mechanic/category indicator columns)
df

Unnamed: 0_level_0,AvgRating,TotalRatings,StdRatings,MinPlayers,MaxPlayers,MinPlaytime,MaxPlaytime,Weight,OwnedCopies,MechanicID_1,...,CategoryID_75,CategoryID_76,CategoryID_77,CategoryID_78,CategoryID_79,CategoryID_80,CategoryID_81,CategoryID_82,CategoryID_83,CategoryID_84
GameID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,7.60903,5502,1.582160,3,5,240,240,4.3190,7708,0,...,0,0,0,0,0,0,0,0,0,0
2,6.63979,568,1.463000,3,4,30,30,1.9630,1317,0,...,0,0,0,0,0,0,0,0,0,0
3,7.45915,15507,1.186780,2,4,30,60,2.4823,15920,0,...,0,0,0,0,0,0,0,0,0,0
4,6.61971,348,1.241490,2,4,60,60,2.6667,654,0,...,0,0,0,0,0,0,0,0,0,0
5,7.33680,19103,1.337360,2,6,90,90,2.5006,24235,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362600,7.25648,54,0.991821,2,6,30,30,1.4000,129,0,...,0,0,0,0,0,0,0,0,0,0
363622,8.76610,100,1.713160,1,4,70,120,2.7778,305,0,...,0,0,0,0,0,0,0,0,0,0
364356,8.94667,30,1.752850,1,8,90,150,2.5714,32,0,...,0,0,0,0,0,0,0,0,0,0
365104,7.60772,159,1.042500,2,5,20,30,1.6250,631,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Mean AvgRating over the merged dataframe (AvgRating is the regression target further down)
df['AvgRating'].mean()

6.47135760748827

In [16]:
# Features: every column except the target (AvgRating) and TotalRatings
# NOTE(review): the reason for excluding TotalRatings is not stated here — confirm
X = df.drop(columns=['AvgRating', 'TotalRatings'])
# Target: the game's average rating
y = df['AvgRating']

In [17]:
# Pipeline: standardise every feature column, then fit an ordinary least-squares linear regression
model = make_pipeline(
    StandardScaler(),
    LinearRegression()
)

In [18]:
# Fit the pipeline on the full X/y (no train/test split is made in this notebook section)
model.fit(X, y)
# In-sample predictions: X is the same data the model was just fit on
y_pred = model.predict(X)



In [19]:
# R^2 of the fitted pipeline, evaluated on the same X/y used for fitting
# (an in-sample figure, not an estimate of performance on unseen games)
model.score(X, y)



0.4961503991051246