## ANOVA and Correlation

### Imports

In [1]:
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import statsmodels.api as sm
from statsmodels.formula.api import ols

### Read Data

In [2]:
# Location of the processed dataset, relative to this notebook.
# NOTE(review): the path ends in `.csv` but the file is loaded with `pd.read_parquet`
# below — confirm the file's actual on-disk format and align the extension or the reader.
INPUT_FILE = '../../data/processed/processed_data.csv'

df = pd.read_parquet(INPUT_FILE)

### Prepare the data

In [3]:
# Column the regression models are trained to predict.
target_col = 'copiesSold'

# Target vector, and the feature matrix with the target and the 'appid' column removed.
y = df[target_col]
X = df.drop(['appid', target_col], axis='columns')

# Partition the feature names by dtype: 64-bit numeric columns vs. string/categorical columns.
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['category', 'object']).columns

### Correlation with Numerical Features

In [4]:
# Correlation matrix over the numerical features plus the target.
corr_columns = [*numerical_cols, target_col]
numeric_corr = df[corr_columns].corr()

# Keep only each feature's correlation with the target (dropping the target's own entry)
# and rank by absolute value so strong negative correlations rank alongside positive ones.
target_corr = numeric_corr[target_col].drop(target_col)
correlations = target_corr.sort_values(key=abs, ascending=False)
correlations.head(10)

publisherClass_Indie    0.420674
publisherClass_Other    0.353768
has_metacritic          0.345550
genre_Free To Play      0.206684
steam_trading_cards     0.179513
steam_achievements      0.175463
has_dlc                 0.167505
dlc_count               0.167505
genre_Casual           -0.133536
is_sequel               0.097095
Name: copiesSold, dtype: float64

### ANOVA for Categorical Features

In [5]:
# One-way ANOVA of the target against each categorical feature; collects (feature, p-value) pairs.
anova_results = []

for col in categorical_cols:
    # choose categorical features with 2-100 unique values
    if not (1 < df[col].nunique() <= 100):
        continue

    # Q("...") quotes the column names so that names which are not valid Python
    # identifiers (e.g. containing spaces or '&') still parse as one variable.
    formula = f'Q("{target_col}") ~ C(Q("{col}"))'
    model = ols(formula, data=df[[col, target_col]]).fit()

    # The formula has a single term, so row 0 of the ANOVA table is that feature's row.
    p_value = sm.stats.anova_lm(model, typ=2).iloc[0]['PR(>F)']
    anova_results.append((col, p_value))

# Smallest p-value first.
anova_df = pd.DataFrame(anova_results, columns=["Feature", "P-Value"]).sort_values("P-Value")
anova_df.head()

Unnamed: 0,Feature,P-Value
0,supported_platforms,6.974636e-125


### Select Top 50 Features

In [6]:
# Feature budget: 49 numerical features + 1 categorical feature = 50 in total.
n_numeric, n_categorical = 49, 1

# `correlations` and `anova_df` are already sorted best-first, so slicing takes the top entries.
top_numeric = list(correlations.index[:n_numeric])
top_categorical = list(anova_df["Feature"].iloc[:n_categorical])  # only one categorical feature
selected_features = [*top_numeric, *top_categorical]
selected_features

['publisherClass_Indie',
 'publisherClass_Other',
 'has_metacritic',
 'genre_Free To Play',
 'steam_trading_cards',
 'steam_achievements',
 'has_dlc',
 'dlc_count',
 'genre_Casual',
 'is_sequel',
 'is_upcoming',
 'genre_Simulation',
 'genre_Massively Multiplayer',
 'genre_RPG',
 'reviewScore',
 'is_release_date_known',
 'price',
 'achievements_total',
 'name_words',
 'workshop_support',
 'year',
 'genre_Indie',
 'name_len',
 'mac',
 'metacritic_preprocessed',
 'genre_Early Access',
 'cos_day',
 'genre_Adventure',
 'name_has_edition',
 'genre_Strategy',
 'name_has_vr',
 'name_cap_ratio',
 'genre_Racing',
 'name_has_collection',
 'has_demo',
 'demo_count',
 'genre_Nudity',
 'linux',
 'name_has_remaster',
 'name_has_collector',
 'genre_Sexual Content',
 'genre_Sports',
 'genre_Gore',
 'genre_Game Development',
 'genre_Action',
 'windows',
 'genre_Design & Illustration',
 'genre_Audio Production',
 'sin_day',
 'supported_platforms']

### Split into Train and Test Sets

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Transformer and Pipeline

In [8]:
# Accumulates one (n_features, features, mse, r2) tuple per evaluated feature subset.
results = list()

# Both column lists start empty; the evaluation loop overwrites the entries of
# `transformer.transformers` in place, so this must remain a mutable list.
column_steps = [
    ('num', 'passthrough', []),
    ('cat', OneHotEncoder(), []),
]
transformer = ColumnTransformer(transformers=column_steps)

# Preprocess the columns, then fit an ordinary least-squares regressor on the result.
pipeline = Pipeline(steps=[('preprocessor', transformer), ('regressor', LinearRegression())])

### Evaluate Feature Subsets by Incrementally Adding Features

In [9]:
# Reset the accumulator here so that re-running this cell does not append a second
# copy of every row to results left over from a previous run.
results = []

# Hoisted out of the loop: the column-name sets do not change between iterations.
numeric_names = set(numerical_cols)
categorical_names = set(categorical_cols)

# Evaluate the first i selected features for i = 1..len(selected_features).
for i in range(1, len(selected_features) + 1):
    feature_subset = selected_features[:i]

    # update the transformer with the current subset of features
    # (the shared `transformer` inside `pipeline` is mutated in place, so after the
    # loop `pipeline` is left fitted on the full list of selected features)
    num_features = [feat for feat in feature_subset if feat in numeric_names]
    cat_features = [feat for feat in feature_subset if feat in categorical_names]
    transformer.transformers[0] = ('num', 'passthrough', num_features)
    # handle_unknown='ignore': categories unseen during fit do not raise at predict time
    transformer.transformers[1] = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)

    pipeline.fit(X_train[feature_subset], y_train)
    y_pred = pipeline.predict(X_test[feature_subset])
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results.append((i, feature_subset, mse, r2))

results_df = pd.DataFrame(results, columns=['Number of Features', 'Features', 'MSE', 'R²'])

# higher R² and lower MSE are better
# NOTE(review): subsets are ranked by their score on the test split, so the top row is
# an optimistic estimate — consider a separate validation split or cross-validation.
results_df_sorted = results_df.sort_values(by=['R²', 'MSE'], ascending=[False, True])

print(results_df_sorted)


    Number of Features                                           Features  \
49                  50  [publisherClass_Indie, publisherClass_Other, h...   
48                  49  [publisherClass_Indie, publisherClass_Other, h...   
44                  45  [publisherClass_Indie, publisherClass_Other, h...   
46                  47  [publisherClass_Indie, publisherClass_Other, h...   
43                  44  [publisherClass_Indie, publisherClass_Other, h...   
42                  43  [publisherClass_Indie, publisherClass_Other, h...   
47                  48  [publisherClass_Indie, publisherClass_Other, h...   
45                  46  [publisherClass_Indie, publisherClass_Other, h...   
41                  42  [publisherClass_Indie, publisherClass_Other, h...   
39                  40  [publisherClass_Indie, publisherClass_Other, h...   
40                  41  [publisherClass_Indie, publisherClass_Other, h...   
37                  38  [publisherClass_Indie, publisherClass_Other, h...   