# Score & Feature Analysis

In [110]:
# import libraries 

import pandas as pd 
import numpy as np 
import statsmodels.api as sm
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor

In [89]:
# Load the CSV inputs used by every later cell.
# DATA_DIR is the single place to edit when the repository lives elsewhere.
DATA_DIR = R'C:\Users\Admin\Documents\GitHub\metacritic_text_analytics\data'


def _read_sentiment_csv(file_name, fill_missing=False):
    """Read one CSV from DATA_DIR and discard columns named 'Unnamed...'.

    Parameters
    ----------
    file_name : str
        Name of the file inside DATA_DIR.
    fill_missing : bool, default False
        When True, NaN values are replaced with 0 after loading.

    Returns
    -------
    pandas.DataFrame
        The loaded frame without any column whose name starts with 'Unnamed'.
    """
    frame = pd.read_csv(DATA_DIR + '\\' + file_name)
    frame = frame.loc[:, ~frame.columns.str.contains('^Unnamed')]
    return frame.fillna(0) if fill_missing else frame


# Per-game attribute sentiment (NaN -> 0).
sent = _read_sentiment_csv('Sentiment_Analysis_Per_Game_Per_attribute.csv', fill_missing=True)

# Per-review overall sentiment; NaN values are left untouched for this table.
sent_1 = _read_sentiment_csv('reviews_sentiment.csv')

# Per-review attribute sentiment (NaN -> 0).
sent_2 = _read_sentiment_csv('Sentiment_Analysis_Per_Review_Per_Attribute.csv', fill_missing=True)

# game_list is loaded as-is: no column filtering and no NaN filling.
game_list = pd.read_csv(DATA_DIR + '\\' + 'game_list.csv')



## Building Multivariate Regression Model (Standardized Data)

In [96]:
# Baseline OLS of score on the per-game attribute sentiments.
# Identifier columns and 'sound track' are removed before standardising.

game_numeric = sent.drop(columns=['game_id', 'game_name', 'sound track'])

scaler = StandardScaler()
scaled_data = pd.DataFrame(
    scaler.fit_transform(game_numeric),
    columns=game_numeric.columns,
)

# Target is the standardised score; every other column is a regressor.
y = scaled_data['score']
X = sm.add_constant(scaled_data.drop(columns='score'))
est = sm.OLS(y, X).fit()
print(est.summary())

                            OLS Regression Results                            
Dep. Variable:                  score   R-squared:                       0.081
Model:                            OLS   Adj. R-squared:                  0.033
Method:                 Least Squares   F-statistic:                     1.674
Date:                Sat, 12 Feb 2022   Prob (F-statistic):             0.0505
Time:                        15:35:23   Log-Likelihood:                -441.93
No. Observations:                 321   AIC:                             917.9
Df Residuals:                     304   BIC:                             982.0
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                   -1

  x = pd.concat(x[::order], 1)


In [97]:
# Single-regressor OLS: standardised score explained by standardised sentiment_value.

review_cols = ['sentiment_value', 'score']

scaler = StandardScaler()
scaled_data = pd.DataFrame(
    scaler.fit_transform(sent_1[review_cols]),
    columns=review_cols,
)

y = scaled_data['score']
X = sm.add_constant(scaled_data.drop(columns='score'))
est = sm.OLS(y, X).fit()
print(est.summary())


                            OLS Regression Results                            
Dep. Variable:                  score   R-squared:                       0.088
Model:                            OLS   Adj. R-squared:                  0.088
Method:                 Least Squares   F-statistic:                     532.9
Date:                Sat, 12 Feb 2022   Prob (F-statistic):          1.26e-112
Time:                        15:36:32   Log-Likelihood:                -7559.7
No. Observations:                5507   AIC:                         1.512e+04
Df Residuals:                    5505   BIC:                         1.514e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const           -7.112e-16      0.013  -5.

  x = pd.concat(x[::order], 1)


In [100]:
# OLS of score on the per-review attribute sentiments.
# Every column of sent_2 is standardised; 'score' is then split off as the target.
scaler = StandardScaler()
scaled_data = pd.DataFrame(
    scaler.fit_transform(sent_2),
    columns=sent_2.columns,
)

y = scaled_data['score']
X = sm.add_constant(scaled_data.drop(columns='score'))
est = sm.OLS(y, X).fit()
print(est.summary())



                            OLS Regression Results                            
Dep. Variable:                  score   R-squared:                       0.052
Model:                            OLS   Adj. R-squared:                  0.049
Method:                 Least Squares   F-statistic:                     17.83
Date:                Sat, 12 Feb 2022   Prob (F-statistic):           1.00e-52
Time:                        15:37:51   Log-Likelihood:                -7666.1
No. Observations:                5507   AIC:                         1.537e+04
Df Residuals:                    5489   BIC:                         1.549e+04
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                   -7

  x = pd.concat(x[::order], 1)


## Training Gradient Boosting Algorithm

In [151]:
# Training Gradient Boosting Regression Model on per game per attribute csv
# NOTE(review): unlike the OLS cell on this table, 'sound track' is kept as a
# feature here — confirm that the difference is intended.
X = sent.drop(columns=['game_id', 'game_name', 'score'])
y = sent['score']

# Hyper-parameter grid explored by the cross-validated search.
parameters = {'n_estimators': [100, 250, 500],
              'max_depth': [3, 5, 10]}

# A fixed random_state keeps the fitted trees, and therefore the selected model
# and its feature importances, repeatable across kernel restarts.
gbr = GradientBoostingRegressor(random_state=42)

clf = GridSearchCV(gbr, parameters)
results = clf.fit(X, y)


# Feature importances of the best model found by the search, largest first.
feature_importance = (
    pd.Series(results.best_estimator_.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
    .to_frame('Feature Importance')
)
feature_importance

Unnamed: 0,Feature Importance
ignore,0.174896
value,0.109737
strategy based gameplay,0.099623
skill based gameplay,0.093331
technical performance,0.085061
difficulty,0.070962
innovative,0.070217
game design,0.067191
playthrough time,0.065386
enjoyment,0.050741


In [152]:
# Training Gradient Boosting Regression Model on per review per attribute csv
X = sent_2.drop(columns='score')
y = sent_2['score']

# Hyper-parameter grid explored by the cross-validated search.
parameters = {'n_estimators': [100, 250, 500],
              'max_depth': [3, 5, 10]}

# A fixed random_state keeps the fitted trees, and therefore the selected model
# and its feature importances, repeatable across kernel restarts.
gbr = GradientBoostingRegressor(random_state=42)

clf = GridSearchCV(gbr, parameters)
results = clf.fit(X, y)


# Feature importances of the best model found by the search, largest first.
feature_importance = (
    pd.Series(results.best_estimator_.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
    .to_frame('Feature Importance')
)
feature_importance

Unnamed: 0,Feature Importance
game design,0.220935
ignore,0.174158
technical performance,0.115011
enjoyment,0.104216
narrative,0.099629
strategy based gameplay,0.051653
playthrough time,0.040416
difficulty,0.032548
skill based gameplay,0.030845
innovative,0.02823
