In [55]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split 
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet

from sklearn.metrics import mean_squared_error, r2_score

from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer

## Summary

|Model| Comments|
|-----|---------|
| Simple LR w/o scaling| Test $R^2$ = 3.7%|
| Lasso w/o scaling| Test $R^2$ = 3.5%|
| Ridge w/o scaling| Test $R^2$ = 3.7%|
| ElasticNet w/o scaling| Test $R^2$ = 3.4%|
| LR with Minmax(0,1) scaling| Test $R^2$ = 3.7%|
| Lasso with Minmax(0,1) scaling| Test $R^2$ = 0.9%|
| Ridge with Minmax(0,1) scaling| Test $R^2$ = 3.6%|
| ElasticNet with Minmax(0,1) scaling| Test $R^2$ = 0.1%|
| LR with Standard scaling| Test $R^2$ = 3.7%|
| Lasso with Standard scaling| Test $R^2$ = 3.3%|
| Ridge with Standard scaling| Test $R^2$ = 3.7%|
| Lasso with Standard scaling and gridsearch| Test $R^2$ = 3.3%|
| Ridge with Standard scaling and gridsearch| Test $R^2$ = 3.8%|

> Simple LR and Ridge regression perform best (test $R^2 \approx 3.7\%$–$3.8\%$). No feature selection was applied, and missing values were only filled with zero (no other imputation).

In [3]:
# Load the training data from the zipped CSV (relative path) and preview the first rows.
df = pd.read_csv("../data/train_data.zip")
df.head()

Unnamed: 0,external_id,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,avg_wind_9_10,avg_wind_10_11,avg_wind_11_12,avg_wind_12_above,perfect_days,unacast_session_count,hpi,state_and_local_amount_per_capita,state_amount_per_capita,local_amount_per_capita
0,1900203,3,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,78.0,323.61,0.132207,0.018519,0.113688
1,1900203,6,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,111.0,323.61,0.132207,0.018519,0.113688
2,1900203,8,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,110.0,323.61,0.132207,0.018519,0.113688
3,MR00101775,1,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,110.38,0.076247,0.011966,0.064281
4,MR00101775,8,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,110.38,0.076247,0.011966,0.064281


In [19]:
# Assemble the modelling frame from two positional column ranges of the raw data
# plus the prediction target. No empty-DataFrame placeholder is needed: `pd.concat`
# already returns a new frame.
# NOTE(review): the ranges 3:132 and 673:793 are hard-coded column positions —
# confirm them against the CSV schema, since any upstream column reordering would
# silently change the feature set.
biba_games_df = pd.concat([df.iloc[:, 3:132], df.iloc[:, 673:793]], axis=1)
biba_games_df['target'] = df['unacast_session_count']

In [20]:
# Dimensions (rows, columns) of the assembled frame, including the target column.
biba_games_df.shape

(50120, 250)

In [21]:
# Drop every object-dtype (categorical) column so only numeric features remain.
categorical_features = biba_games_df.loc[:, biba_games_df.dtypes.eq("object")]
biba_games_df = biba_games_df.drop(columns=categorical_features.columns.tolist())
biba_games_df.head()

Unnamed: 0,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,monthly_count_slide_single,monthly_count_climber_rope,monthly_count_slide_covered,...,historic_ws_8_to_10,historic_ws_10_to_12,historic_ws_12_to_14,historic_ws_14_to_16,historic_ws_above_16,historic_rain,historic_clear,historic_foggy,historic_snow,target
0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,...,,,,,,0.0,1.0,0.0,0.0,78.0
1,0,0,0,0.0,0.0,0.0,0.0,0,0,0,...,,,,,,0.0,1.0,0.0,0.0,111.0
2,0,0,0,0.0,0.0,0.0,0.0,0,0,0,...,,,,,,0.0,1.0,0.0,0.0,110.0
3,0,0,0,0.0,0.0,0.0,0.0,0,0,0,...,4.0,,,,,0.0,0.0,0.0,0.0,10.0
4,0,0,0,0.0,0.0,0.0,0.0,0,0,0,...,4.0,,,,,0.0,0.0,0.0,0.0,11.0


In [22]:
# Replace every missing value with 0.
# NOTE(review): this also zero-fills the `target` column if it contains NaNs —
# confirm that is intended.
impute_biba_games_df = biba_games_df.fillna(value=0)

In [16]:
def show_scores(model, X, y, show = True):
    """
    Compute (and optionally print) regression scores for a fitted model.

    Parameters
    ----------
    model: fitted estimator
        Any object exposing ``predict(X)`` (e.g. an sklearn regressor or a
        fitted ``GridSearchCV``).
    X: array-like
        The feature matrix passed to ``model.predict``.
    y: array-like
        The true target values corresponding to ``X``.
    show: bool, default True
        When True, print the RMSE and R^2 as a one-line summary.

    Returns
    -------
        rmse: (float)
            Root mean squared error of the predictions.
        r2: (float)
            Coefficient of determination of the predictions.

    """
    y_preds = model.predict(X)
    # Take the square root of the MSE explicitly: the ``squared=False`` flag of
    # ``mean_squared_error`` was deprecated in scikit-learn 1.4 and removed in
    # 1.6, whereas this form works on every scikit-learn version.
    rmse = np.sqrt(mean_squared_error(y, y_preds))
    r2 = r2_score(y, y_preds)
    if show:
        print("Root mean squared error: %0.3f and r^2 score: %0.3f" % (rmse, r2))
    return rmse, r2

In [23]:
# Dimensions (rows, columns) of the zero-imputed frame used for modelling.
impute_biba_games_df.shape

(50120, 246)

## Modelling with all biba variables without any changes

In [38]:
# Separate the response vector from the feature matrix.
y = impute_biba_games_df['target']
X = impute_biba_games_df.drop(columns='target')

In [39]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2020,
)

### 1. Linear Regression

In [26]:
# Ordinary least-squares baseline on the unscaled features (`fit` returns the estimator).
lr = LinearRegression().fit(X_train, y_train)

print('Simple linear regression scores: ', 'Train error: ', sep='\n')
show_scores(lr, X_train, y_train)

print('Test error: ')
show_scores(lr, X_test, y_test)

Simple linear regression scores: 
Train error: 
Root mean squared error: 457.009 and r^2 score: 0.084
Test error: 
Root mean squared error: 700.995 and r^2 score: 0.037


(700.9948925817952, 0.03668113635437975)

### Observations 

- Very poor model with $R^2 = 3.7 \%$

### 2. SVR

In [27]:
# Support-vector regression with default hyper-parameters on the unscaled features.
svr = SVR()
svr.fit(X_train, y_train)
# The header names the model actually being scored (SVR, not linear regression).
print('SVR scores: ')
print('Train error: ')
show_scores(svr, X_train, y_train)

print('Test error: ')
show_scores(svr, X_test, y_test)

Simple linear regression scores: 
Train error: 
Root mean squared error: 483.367 and r^2 score: -0.024
Test error: 
Root mean squared error: 719.283 and r^2 score: -0.014


(719.2832836305175, -0.01423896628865462)

### Observations
- Very long train runtime. Not feasible on whole dataset
- Very Poor model with negative $R^2$


### 3. Lasso L1

In [30]:
# L1-regularised regression (default alpha) on the unscaled features.
lasso_lr = Lasso(max_iter=2000, random_state=2020).fit(X_train, y_train)

print('Lasso regression scores: ', 'Train error: ', sep='\n')
show_scores(lasso_lr, X_train, y_train)

print('Test error: ')
show_scores(lasso_lr, X_test, y_test)

Lasso regression scores: 
Train error: 
Root mean squared error: 458.633 and r^2 score: 0.078
Test error: 
Root mean squared error: 701.577 and r^2 score: 0.035


  positive)


(701.5773181102657, 0.03507971362012974)

### Observations
- Nearly the same performance as simple LR (test $R^2$ of 3.5% vs 3.7%)

### 4. Ridge L2

In [32]:
# L2-regularised regression (default alpha) on the unscaled features.
ridge_lr = Ridge(max_iter=2000, random_state=2020).fit(X_train, y_train)

print('Ridge regression scores: ', 'Train error: ', sep='\n')
show_scores(ridge_lr, X_train, y_train)

print('Test error: ')
show_scores(ridge_lr, X_test, y_test)

Ridge regression scores: 
Train error: 
Root mean squared error: 457.029 and r^2 score: 0.084
Test error: 
Root mean squared error: 700.883 and r^2 score: 0.037


  overwrite_a=True).T


(700.8831442036662, 0.03698824484081753)

### Observations

- Same performance as Simple LR

### 5. ElasticNet L1 and L2 Regression

In [33]:
# Combined L1/L2-regularised regression (default alpha and l1_ratio) on the unscaled features.
elastic_lr = ElasticNet(max_iter=2000, random_state=2020).fit(X_train, y_train)

print('Elastic regression scores: ', 'Train error: ', sep='\n')
show_scores(elastic_lr, X_train, y_train)

print('Test error: ')
show_scores(elastic_lr, X_test, y_test)

Elastic regression scores: 
Train error: 
Root mean squared error: 460.542 and r^2 score: 0.070
Test error: 
Root mean squared error: 702.101 and r^2 score: 0.034


  positive)


(702.100988553132, 0.03363870689591786)

### Observations

- Slightly below Simple LR (test $R^2$ of 3.4% vs 3.7%)

## Step 2: Scaling all columns with MinMaxScaler

In [40]:
# Rescale each feature to the [0, 1] range. The scaler is fitted on the training
# split only and then applied unchanged to the test split.
scaler = MinMaxScaler(feature_range=(0, 1))
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [42]:
# Ordinary least squares refitted on the scaled training features.
lr = LinearRegression().fit(X_train_scaled, y_train)

print('Simple linear regression scores after scaling: ', 'Train error: ', sep='\n')
show_scores(lr, X_train_scaled, y_train)

print('Test error: ')
show_scores(lr, X_test_scaled, y_test)

Simple linear regression scores after scaling: 
Train error: 
Root mean squared error: 457.015 and r^2 score: 0.084
Test error: 
Root mean squared error: 701.012 and r^2 score: 0.037


(701.011928209274, 0.03663431449908394)

In [43]:
# Lasso (default alpha) refitted on the scaled training features.
lasso_lr = Lasso(max_iter=2000, random_state=2020).fit(X_train_scaled, y_train)

print('Lasso regression scores: ', 'Train error: ', sep='\n')
show_scores(lasso_lr, X_train_scaled, y_train)

print('Test error: ')
show_scores(lasso_lr, X_test_scaled, y_test)

Lasso regression scores: 
Train error: 
Root mean squared error: 470.296 and r^2 score: 0.030
Test error: 
Root mean squared error: 710.827 and r^2 score: 0.009


(710.827175190754, 0.009468245728442737)

In [44]:
# Ridge (default alpha) refitted on the scaled training features.
ridge_lr = Ridge(max_iter=2000, random_state=2020).fit(X_train_scaled, y_train)

print('Ridge regression scores: ', 'Train error: ', sep='\n')
show_scores(ridge_lr, X_train_scaled, y_train)

print('Test error: ')
show_scores(ridge_lr, X_test_scaled, y_test)

Ridge regression scores: 
Train error: 
Root mean squared error: 458.814 and r^2 score: 0.077
Test error: 
Root mean squared error: 701.212 and r^2 score: 0.036


(701.2120412518988, 0.036084225272904)

In [45]:
# ElasticNet (default alpha and l1_ratio) refitted on the scaled training features.
elastic_lr = ElasticNet(max_iter=2000, random_state=2020).fit(X_train_scaled, y_train)

print('Elastic regression scores: ', 'Train error: ', sep='\n')
show_scores(elastic_lr, X_train_scaled, y_train)

print('Test error: ')
show_scores(elastic_lr, X_test_scaled, y_test)

Elastic regression scores: 
Train error: 
Root mean squared error: 476.868 and r^2 score: 0.003
Test error: 
Root mean squared error: 713.899 and r^2 score: 0.001


(713.8993048515317, 0.0008877694539464187)

### Observations:
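- Simple LR - essentially unchanged from the unscaled case (test $R^2$ still $\approx 3.7\%$)
- Lasso performs worse than in the unscaled case (test $R^2$ drops from 3.5% to 0.9%)
- Ridge performs about the same as w/o scaling (3.6% vs 3.7%)
- ElasticNet performs worst of the four linear models (test $R^2 \approx 0.1\%$)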


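## Step 3: Normalizing samples with Normalizer

> Note: unlike `MinMaxScaler` and `StandardScaler`, `Normalizer` rescales each sample (row) to unit norm rather than scaling each column.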

In [49]:
# Apply sklearn's Normalizer with its default settings to both splits.
# NOTE(review): Normalizer rescales each row (sample) to unit norm rather than
# scaling columns, so it is not directly comparable to the column scalers.
scaler = Normalizer()
X_train_scaled, X_test_scaled = (
    scaler.fit_transform(X_train),
    scaler.transform(X_test),
)

In [50]:
# Ordinary least squares refitted on the current `X_train_scaled`.
lr = LinearRegression().fit(X_train_scaled, y_train)

print('Simple linear regression scores after scaling: ', 'Train error: ', sep='\n')
show_scores(lr, X_train_scaled, y_train)

print('Test error: ')
show_scores(lr, X_test_scaled, y_test)

Simple linear regression scores after scaling: 
Train error: 
Root mean squared error: 461.569 and r^2 score: 0.066
Test error: 
Root mean squared error: 706.864 and r^2 score: 0.020


(706.8641109546479, 0.020482450309961098)

In [51]:
# Lasso (default alpha) refitted on the current `X_train_scaled`.
lasso_lr = Lasso(max_iter=2000, random_state=2020).fit(X_train_scaled, y_train)

print('Lasso regression scores: ', 'Train error: ', sep='\n')
show_scores(lasso_lr, X_train_scaled, y_train)

print('Test error: ')
show_scores(lasso_lr, X_test_scaled, y_test)

Lasso regression scores: 
Train error: 
Root mean squared error: 476.227 and r^2 score: 0.006
Test error: 
Root mean squared error: 713.379 and r^2 score: 0.002


(713.3790699039246, 0.0023433913043583443)

In [52]:
# Ridge (default alpha) refitted on the current `X_train_scaled`.
ridge_lr = Ridge(max_iter=2000, random_state=2020).fit(X_train_scaled, y_train)

print('Ridge regression scores: ', 'Train error: ', sep='\n')
show_scores(ridge_lr, X_train_scaled, y_train)

print('Test error: ')
show_scores(ridge_lr, X_test_scaled, y_test)

Ridge regression scores: 
Train error: 
Root mean squared error: 475.323 and r^2 score: 0.009
Test error: 
Root mean squared error: 712.642 and r^2 score: 0.004


(712.6424719569329, 0.004402583802823989)

In [54]:
# ElasticNet (default alpha and l1_ratio) on the current `X_train_scaled`.
# max_iter=2000 matches every other Lasso/Ridge/ElasticNet fit in this notebook,
# so all runs are compared under the same solver iteration budget.
elastic_lr = ElasticNet(max_iter=2000, random_state=2020)
elastic_lr.fit(X_train_scaled, y_train)
print('Elastic regression scores: ')
print('Train error: ')
show_scores(elastic_lr, X_train_scaled, y_train)

print('Test error: ')
show_scores(elastic_lr, X_test_scaled, y_test)

Elastic regression scores: 
Train error: 
Root mean squared error: 477.369 and r^2 score: 0.001
Test error: 
Root mean squared error: 714.124 and r^2 score: 0.000


(714.1238491561965, 0.0002591647153790122)

### Observation
- All models perform worse than with the MinMax scaler

## Step 4: Scaling all columns with StandardScaler

In [56]:
# Standardise the features. The scaler is fitted on the training split only,
# then the same fitted transform is applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [57]:
# Ordinary least squares refitted on the current `X_train_scaled`.
lr = LinearRegression().fit(X_train_scaled, y_train)

print('Simple linear regression scores after scaling: ', 'Train error: ', sep='\n')
show_scores(lr, X_train_scaled, y_train)

print('Test error: ')
show_scores(lr, X_test_scaled, y_test)

Simple linear regression scores after scaling: 
Train error: 
Root mean squared error: 457.013 and r^2 score: 0.084
Test error: 
Root mean squared error: 701.017 and r^2 score: 0.037


(701.0170479980357, 0.03662024270765596)

In [58]:
# Lasso (default alpha) refitted on the current `X_train_scaled`.
lasso_lr = Lasso(max_iter=2000, random_state=2020).fit(X_train_scaled, y_train)

print('Lasso regression scores: ', 'Train error: ', sep='\n')
show_scores(lasso_lr, X_train_scaled, y_train)

print('Test error: ')
show_scores(lasso_lr, X_test_scaled, y_test)

Lasso regression scores: 
Train error: 
Root mean squared error: 459.605 and r^2 score: 0.074
Test error: 
Root mean squared error: 702.504 and r^2 score: 0.033


(702.5041948820461, 0.03252845387281533)

In [59]:
# Ridge (default alpha) refitted on the current `X_train_scaled`.
ridge_lr = Ridge(max_iter=2000, random_state=2020).fit(X_train_scaled, y_train)

print('Ridge regression scores: ', 'Train error: ', sep='\n')
show_scores(ridge_lr, X_train_scaled, y_train)

print('Test error: ')
show_scores(ridge_lr, X_test_scaled, y_test)

Ridge regression scores: 
Train error: 
Root mean squared error: 457.019 and r^2 score: 0.084
Test error: 
Root mean squared error: 700.920 and r^2 score: 0.037


(700.9203403783774, 0.03688602721926115)

### Observation
- Of the scalers tried, StandardScaler works best for all models
- Moving on to grid search

In [62]:
# Candidate regularisation strengths (log-spaced) shared by the grid searches.
params = dict(alpha=[0.001, 0.01, 0.1, 1, 10, 100])


In [61]:
# 5-fold grid search over `params` for Lasso on the current `X_train_scaled`.
lasso_lr = Lasso(max_iter=2000, random_state=2020)
clf = GridSearchCV(estimator=lasso_lr, param_grid=params, cv=5)
clf.fit(X_train_scaled, y_train)

# Scoring goes through the search object's own `predict`.
print('Lasso regression scores: ', 'Train error: ', sep='\n')
show_scores(clf, X_train_scaled, y_train)

print('Test error: ')
show_scores(clf, X_test_scaled, y_test)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


Lasso regression scores: 
Train error: 
Root mean squared error: 459.605 and r^2 score: 0.074
Test error: 
Root mean squared error: 702.504 and r^2 score: 0.033


(702.5041948820461, 0.03252845387281533)

In [63]:
# 5-fold grid search over `params` for Ridge on the current `X_train_scaled`.
ridge_lr = Ridge(max_iter=2000, random_state=2020)
clf_ridge = GridSearchCV(estimator=ridge_lr, param_grid=params, cv=5)
clf_ridge.fit(X_train_scaled, y_train)

# Scoring goes through the search object's own `predict`.
print('Ridge regression scores: ', 'Train error: ', sep='\n')
show_scores(clf_ridge, X_train_scaled, y_train)

print('Test error: ')
show_scores(clf_ridge, X_test_scaled, y_test)

Ridge regression scores: 
Train error: 
Root mean squared error: 457.326 and r^2 score: 0.083
Test error: 
Root mean squared error: 700.609 and r^2 score: 0.038


(700.609065459416, 0.037741264650077655)

In [64]:
# Regularisation strength selected by the Lasso grid search (`clf`).
clf.best_params_

{'alpha': 1}