In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [17]:
df = pd.read_csv('../data/in_process/Popular_Games_expanded.csv', index_col = 'game_id')
dfgood = pd.read_csv('../data/in_process/Popular_Games.csv', index_col = 'game_id')

In [18]:
df.describe()

Unnamed: 0,min_players,max_players,avg_time,year,avg_rating,geek_rating,num_votes,age,owned,weight
count,4768.0,4768.0,4768.0,4768.0,4768.0,4768.0,4768.0,4768.0,4768.0,4768.0
mean,1.998951,5.466862,82.654308,1996.456586,6.950445,6.245841,3084.935612,10.657508,5277.25755,2.273101
std,0.699575,7.197982,246.712028,175.123777,0.771389,0.53975,6084.947744,2.741033,8597.807335,0.77621
min,0.0,0.0,1.0,-3500.0,2.26623,3.52609,500.0,0.0,263.0,1.0
25%,2.0,4.0,30.0,2004.0,6.485913,5.874825,750.0,8.0,1616.75,1.6734
50%,2.0,5.0,60.0,2011.0,6.96451,6.161935,1238.5,10.0,2778.5,2.21745
75%,2.0,6.0,90.0,2015.0,7.485098,6.550277,2663.0,13.0,5382.0,2.7863
max,8.0,100.0,12000.0,2020.0,9.1869,8.57686,93524.0,18.0,139881.0,4.7233


##Observations
Number of votes highly correlated with number of users who own the game. Makes sense.
Number of votes also associated with "geek rating." Makes sense because Geek Rating is a Bayesian average.
Number owned is correlated with both ratings. Makes sense that people would vote for stuff they own.
Average playing time and "weight" are correlated. Makes sense. Longer games are weightier.
Recommended age and wieght are correlated because games for kids are less weighty.


In [19]:
#Dropping irrelevant games
df = df.drop([18291, 21804, 23953], axis = 0)

In [20]:
#looking up these two on the web site to get correct player counts
df.loc[4149, 'max_players'] = 1
df.loc[25738, 'max_players'] = 4
df.loc[177497, 'max_players'] = 4
df.loc[177497, 'min_players'] = 1

In [21]:
#just drop them if we want minimum age to be a relevant factor
dfage=df[df['age'] == 0]
dfage = dfage[['age', 'weight']]
df = df.drop(dfage.index, axis = 0)

In [22]:
#limit examination to only more modern games
#will define that as games released since 1960
dfy = pd.DataFrame(df['year'])
dfy2 = dfy[dfy['year'] < 1960]
df = df.drop(dfy2.index, axis = 0)
df.describe()

Unnamed: 0,min_players,max_players,avg_time,year,avg_rating,geek_rating,num_votes,age,owned,weight
count,4630.0,4630.0,4630.0,4630.0,4630.0,4630.0,4630.0,4630.0,4630.0,4630.0
mean,1.999136,5.457451,83.016465,2008.392009,6.966894,6.258314,3087.955508,10.858963,5288.644276,2.278022
std,0.699658,7.024371,250.067805,10.061053,0.739821,0.529486,6112.668181,2.420251,8625.648781,0.774208
min,1.0,1.0,1.0,1960.0,3.33177,4.27258,500.0,2.0,263.0,1.0
25%,2.0,4.0,30.0,2005.0,6.496457,5.880187,751.0,9.0,1634.0,1.6801
50%,2.0,5.0,60.0,2011.0,6.97108,6.16929,1249.0,11.0,2794.0,2.2222
75%,2.0,6.0,90.0,2015.0,7.487145,6.560507,2679.75,13.0,5402.75,2.787725
max,8.0,100.0,12000.0,2020.0,9.1869,8.57686,93524.0,18.0,139881.0,4.7233


### Observations

##### Number of votes highly correlated with number of users who own the game. Makes sense.
##### Number of votes also associated with "geek rating." Makes sense because Geek Rating is a Bayesian average.
##### Number owned is correlated with both ratings. Makes sense that people would vote for stuff they own.
##### Average playing time and "weight" are correlated. Makes sense. Longer games are weightier.
##### Recommended age and wieght are correlated because games for kids are less weighty.
#####  Average rating and min age may be correlated. Do people like games that can be played with their kids?

In [23]:
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn import linear_model, preprocessing

In [24]:
df = df.drop(['num_votes', 'owned'], axis = 1)


In [25]:
#dropping expansions and just looking at original games
df = df[df['expands'] == 'Nothing']

In [26]:
df = df.drop(['names', 'designer', 'publisher', 'expands'], axis = 1)

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3836 entries, 1 to 287954
Data columns (total 19 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   min_players  3836 non-null   int64  
 1   max_players  3836 non-null   int64  
 2   avg_time     3836 non-null   float64
 3   year         3836 non-null   int64  
 4   avg_rating   3836 non-null   float64
 5   geek_rating  3836 non-null   float64
 6   age          3836 non-null   int64  
 7   mechanic     3836 non-null   object 
 8   category     3836 non-null   object 
 9   weight       3836 non-null   float64
 10  Category_1   3836 non-null   object 
 11  Category_2   3214 non-null   object 
 12  Category_3   2111 non-null   object 
 13  Category_4   1098 non-null   object 
 14  Mechanic_1   3836 non-null   object 
 15  Mechanic_2   3355 non-null   object 
 16  Mechanic_3   2595 non-null   object 
 17  Mechanic_4   1789 non-null   object 
 18  Mechanic_5   1135 non-null   object 
dtypes: f

In [28]:
dfexpandedC = df.iloc[:, 10:14]

In [29]:
dfexpandedM = df.iloc[:, 14:]

In [30]:
dfexpandedM.head()

Unnamed: 0_level_0,Mechanic_1,Mechanic_2,Mechanic_3,Mechanic_4,Mechanic_5
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Area Majority / Influence,Auction/Bidding,Dice Rolling,Hand Management,Simultaneous Action Selection
2,Trick-taking,,,,
3,Area Majority / Influence,Hand Management,Set Collection,Tile Placement,
5,Hand Management,Investment,Market,Ownership,Stock Holding
7,Enclosure,Pattern Building,Pattern Recognition,Tile Placement,


In [31]:
#This just gives us a list of all categories
Cats1 = df['category'].str.split(pat=', ', expand=False).tolist()
Cats = list()
for i in range(len(Cats1)):
    for j in range(len(Cats1[i])):
        Cats.append(Cats1[i][j])

In [32]:
#this gives us a list of all mechanics
Mech1 = df['mechanic'].str.split(pat=', ', expand=False).tolist()
Mech = list()
for i in range(len(Mech1)):
    for j in range(len(Mech1[i])):
        Mech.append(Mech1[i][j])

In [33]:
Cat_counts = dfexpandedC.apply(pd.Series.value_counts).sum(axis=1)

In [34]:
Mech_counts = dfexpandedM.apply(pd.Series.value_counts).sum(axis = 1)

In [35]:
df_C = pd.DataFrame()
for catg in range(len(Cats)):
    df_C[Cats[catg]] = df['category'].str.contains(Cats[catg]).astype(int)

In [36]:
for i in df_C.columns:
    df_C[i] = df_C[i]*Cat_counts[i]/len(Cats)

In [37]:
df_M = pd.DataFrame()
for mechs in range(len(Mech)):
    df_M[Mech[mechs]] = df['mechanic'].str.contains(Mech[mechs]).astype(int)


In [38]:
#When this was first run, df_M has a column "order counters"
#This value was not in the counts. It must not have been one of the first 5
#Mechanics listed in any one row. Rather than go back and look for it, I'm going
#to just drop it.
df_M = df_M.drop(['Order Counters'], axis = 1)

In [39]:
for j in df_M.columns:
    df_M[j] = df_M[j]*Mech_counts[j]/len(Mech)

In [40]:
df_M.head()

Unnamed: 0_level_0,Area Majority / Influence,Auction/Bidding,Dice Rolling,Hand Management,Simultaneous Action Selection,Trick-taking,Set Collection,Tile Placement,Investment,Market,...,Automatic Resource Growth,Prisoner's Dilemma,Narrative Choice / Paragraph,Contracts,Moving Multiple Units,King of the Hill,Force Commitment,Legacy Game,Bingo,Pattern Movement
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.039385,0.021578,0.073673,0.080517,0.017737,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.003911,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.039385,0.0,0.0,0.080517,0.0,0.0,0.048464,0.027863,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.080517,0.0,0.0,0.0,0.027863,0.000698,0.001466,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027863,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Calculated frequencies for all categories and Mechanics.
Need to take the average of each row and then merge it into the main frame.

In [41]:
#with the above columns for each element of Category and Mechanic
#we don't need the split out columns
df = df.drop(['Category_1', 'Category_2', 'Category_3', 'Category_4'], axis = 1)
df = df.drop(['Mechanic_1', 'Mechanic_2', 'Mechanic_3', 'Mechanic_4', 'Mechanic_5'], axis = 1)

In [42]:
df = df.drop(['mechanic', 'category'], axis = 1)

In [52]:
df_C = df_C.replace(0, np.NaN)

In [53]:
df_M = df_M.replace(0, np.NaN)

In [56]:
df['Category_average'] = df_C.mean(axis=1)

In [57]:
df['Mechanic_average'] = df_M.mean(axis=1)

In [59]:
df.describe()

Unnamed: 0,min_players,max_players,avg_time,year,avg_rating,geek_rating,age,weight,Category_average,Mechanic_average
count,3836.0,3836.0,3836.0,3836.0,3836.0,3836.0,3836.0,3836.0,3836.0,3836.0
mean,2.010688,5.436131,82.510167,2007.887904,6.834371,6.222357,10.722888,2.217682,0.031586,0.028713
std,0.681773,7.278325,273.188526,10.636066,0.698512,0.539138,2.427055,0.782007,0.02214,0.015334
min,1.0,1.0,1.0,1960.0,3.33177,4.27258,2.0,1.0,0.001537,0.00014
25%,2.0,4.0,30.0,2004.0,6.42188,5.844862,8.0,1.59,0.016214,0.017873
50%,2.0,5.0,60.0,2011.0,6.848635,6.10975,10.0,2.1509,0.024852,0.027807
75%,2.0,6.0,90.0,2015.0,7.300432,6.523722,12.0,2.73065,0.042138,0.037182
max,8.0,100.0,12000.0,2020.0,9.1869,8.57686,18.0,4.7233,0.108328,0.080517


In [61]:
#Using OLS without train/test splitting just to get an overall
#view of what a regression would look like.
X = df.drop(['avg_rating', 'geek_rating'], axis = 1)
y1 = df['avg_rating']
y2 = df['geek_rating']
X = sm.add_constant(X)

In [62]:
#Basic linear model on the average user rating
rModel1 = sm.OLS(y1, X)
rModel1_result = rModel1.fit()
rModel1_result.summary()

0,1,2,3
Dep. Variable:,avg_rating,R-squared:,0.492
Model:,OLS,Adj. R-squared:,0.491
Method:,Least Squares,F-statistic:,463.6
Date:,"Thu, 05 Aug 2021",Prob (F-statistic):,0.0
Time:,21:19:27,Log-Likelihood:,-2766.5
No. Observations:,3836,AIC:,5551.0
Df Residuals:,3827,BIC:,5607.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-44.6776,1.575,-28.371,0.000,-47.765,-41.590
min_players,-0.0382,0.012,-3.107,0.002,-0.062,-0.014
max_players,-0.0006,0.001,-0.499,0.618,-0.003,0.002
avg_time,4.793e-05,3.08e-05,1.557,0.119,-1.24e-05,0.000
year,0.0252,0.001,32.045,0.000,0.024,0.027
age,0.0053,0.004,1.302,0.193,-0.003,0.013
weight,0.4617,0.013,34.433,0.000,0.435,0.488
Category_average,0.3332,0.385,0.866,0.387,-0.421,1.087
Mechanic_average,-4.0208,0.541,-7.433,0.000,-5.081,-2.960

0,1,2,3
Omnibus:,186.843,Durbin-Watson:,1.727
Prob(Omnibus):,0.0,Jarque-Bera (JB):,310.582
Skew:,-0.402,Prob(JB):,3.61e-68
Kurtosis:,4.138,Cond. No.,393000.0


In [63]:
#Basic linear model on the geek rating
rModel2 = sm.OLS(y2, X)
rModel2_result = rModel2.fit()
rModel2_result.summary()

0,1,2,3
Dep. Variable:,geek_rating,R-squared:,0.286
Model:,OLS,Adj. R-squared:,0.285
Method:,Least Squares,F-statistic:,191.6
Date:,"Thu, 05 Aug 2021",Prob (F-statistic):,2.9000000000000003e-273
Time:,21:19:32,Log-Likelihood:,-2426.5
No. Observations:,3836,AIC:,4871.0
Df Residuals:,3827,BIC:,4927.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-21.4666,1.441,-14.895,0.000,-24.292,-18.641
min_players,-0.0081,0.011,-0.722,0.470,-0.030,0.014
max_players,-0.0002,0.001,-0.201,0.840,-0.002,0.002
avg_time,-5.604e-05,2.82e-05,-1.990,0.047,-0.000,-8.23e-07
year,0.0135,0.001,18.788,0.000,0.012,0.015
age,-0.0012,0.004,-0.316,0.752,-0.008,0.006
weight,0.2979,0.012,24.274,0.000,0.274,0.322
Category_average,0.4646,0.352,1.320,0.187,-0.226,1.155
Mechanic_average,-3.7856,0.495,-7.647,0.000,-4.756,-2.815

0,1,2,3
Omnibus:,301.916,Durbin-Watson:,1.916
Prob(Omnibus):,0.0,Jarque-Bera (JB):,374.984
Skew:,0.749,Prob(JB):,3.74e-82
Kurtosis:,3.321,Cond. No.,393000.0


In [64]:
df.to_csv('../data/in_process/Games_FreqEncodedCorrectly.csv')
