In [1]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict

from sklearn.model_selection import train_test_split 
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from lightgbm.sklearn import LGBMRegressor

from sklearn.metrics import mean_squared_error, r2_score

from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer

from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder

import preprocessing as pp

## Summary

|Model| Comments|
|-----|---------|
| Simple LR w/o scaling| Test $R^2$ = 4.1%|
| Lasso w/o scaling| Test $R^2$ = 3.5%|
| Ridge w/o scaling| Test $R^2$ = 3.7%|
| ElasticNet w/o scaling| Test $R^2$ = 3.4%|
| LR with Minmax(0,1) scaling| Test $R^2$ = 3.7%|
| Lasso with Minmax(0,1) scaling| Test $R^2$ = 0.9%|
| Ridge with Minmax(0,1) scaling| Test $R^2$ = 3.6%|
| ElasticNet with Minmax(0,1) scaling| Test $R^2$ = 0.1%|
| LR with Standard scaling| Test $R^2$ = 4.0%|
| Lasso with Standard scaling| Test $R^2$ = 3.3%|
| Ridge with Standard scaling| Test $R^2$ = 3.7%|
| Lasso with Standard scaling and gridsearch| Test $R^2$ = 3.3%|
| Ridge with Standard scaling and gridsearch| Test $R^2$ = 4.2%|
| Random Forest with Standard scaling | Test $R^2$ = 28.5%|


> Simple LR or Ridge regression is best without feature selection or any data imputation
> PCA will have reduced the number of columns from 260 to 190 with 99% variance.
> Random forest works better than others and can be improved with gridsearch but will take lot of time to train.


In [2]:
df = pd.read_csv("../data/train_data.zip")

In [3]:
df.head()

Unnamed: 0,external_id,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,avg_wind_9_10,avg_wind_10_11,avg_wind_11_12,avg_wind_12_above,perfect_days,unacast_session_count,hpi,state_and_local_amount_per_capita,state_amount_per_capita,local_amount_per_capita
0,1900203,3,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,78.0,323.61,0.132207,0.018519,0.113688
1,1900203,6,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,111.0,323.61,0.132207,0.018519,0.113688
2,1900203,8,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,110.0,323.61,0.132207,0.018519,0.113688
3,MR00101775,1,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,110.38,0.076247,0.011966,0.064281
4,MR00101775,8,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,110.38,0.076247,0.011966,0.064281


## Function for pre-processing data

In [4]:
clean_df = pp.preprocessing_na(df)
clean_df.shape

(50120, 815)

In [5]:
clean_df.head()

Unnamed: 0,external_id,month,year,B20004e10,B11016e1,B12001e12,B20004e11,B19125e1,B12001e13,B23008e22,...,monthly_regular,monthly_accessible,monthly_variety,monthly_Monday,monthly_Friday,monthly_Thursday,monthly_Sunday,monthly_Tuesday,monthly_Saturday,monthly_Wednesday
0,1900203,3,2019,51111,1868,688,0,78934,1342,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1900203,6,2018,51111,1868,688,0,78934,1342,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1900203,8,2018,51111,1868,688,0,78934,1342,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,MR00101775,1,2019,45484,2613,980,30417,45578,1097,66,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,MR00101775,8,2019,45484,2613,980,30417,45578,1097,66,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
clean_df2 = pp.clean_categorical(clean_df.drop(columns = ['external_id', 'state']))

In [7]:
clean_df2.shape

(50120, 819)

In [8]:
def show_scores(model, X, y, show = True):
    """
    Shows classification and regression scores
    
    Parameters
    ----------
    model: The sklearn model
    X: numpy.ndarray        
        The X part of the data
    y: numpy.ndarray
        The y part of the data
    Returns
    -------
        rmse: (float)
        r2: (float)
            
    """        
        
    y_preds = model.predict(X)                 
    rmse = mean_squared_error(y, y_preds, squared=False)
    r2 = r2_score(y, y_preds)
    if show: 
        print("Root mean squared error: %0.3f and r^2 score: %0.3f" % (rmse,r2))
    return rmse, r2

## Modelling with Linear regression

In [9]:
X = clean_df2.drop(columns = ['unacast_session_count'])
y = clean_df2['unacast_session_count']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2020)

### 1. Linear Regression

In [11]:
lr = LinearRegression()
lr.fit(X_train, y_train)
print('Simple linear regression scores: ')
print('Train error: ')
show_scores(lr, X_train, y_train)

print('Test error: ')    
show_scores(lr, X_test, y_test)

Simple linear regression scores: 
Train error: 
Root mean squared error: 412.405 and r^2 score: 0.254
Test error: 
Root mean squared error: 668.082 and r^2 score: 0.125


(668.0819799558959, 0.12501648107849783)

### Observations 

- Very poor model with $R^2 = 14.1 \%$

### 2. SVR

### Observations
- Very long train runtime. Not feasible on whole dataset
- Very Poor model with negative $R^2$


### 3. Lasso L1

### Observations
- Same performance as simple LR 

### 4. Ridge L2

In [14]:
ridge_lr = Ridge(max_iter=2000, random_state = 2020)
ridge_lr.fit(X_train, y_train)
print('Ridge regression scores: ')
print('Train error: ')
show_scores(ridge_lr, X_train, y_train)

print('Test error: ')    
show_scores(ridge_lr, X_test, y_test)

  overwrite_a=True).T


Ridge regression scores: 
Train error: 
Root mean squared error: 412.451 and r^2 score: 0.254
Test error: 
Root mean squared error: 667.956 and r^2 score: 0.125


(667.9557472618134, 0.12534710245549097)

### Observations

- Same performance as Simple LR

### 4. ElasticNet L1 and L2 Regression

### Observations

- Same performance as Simple LR
- Too slow

## Step 2: Scaling all columns with MinMaxScaler

### Observations:
- Simple LR - slight improvement from last case
- Lasso performs worse than previous case
- Ridge performs same as w/o scaling
- Elastic net performs worse than all models


### Step 3: Scaling all columns with Normalizer

### Observation
- All model performs worse than the minmax scaler

### Step 4: Scaling all columns with StandardScaler

In [15]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Observation
- Standard Scaler works best for all models among other scalers
- Moving on to grid search

In [16]:
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
print('Simple linear regression scores: ')
print('Train error: ')
show_scores(lr, X_train_scaled, y_train)

print('Test error: ')    
show_scores(lr, X_test_scaled, y_test)

Simple linear regression scores: 
Train error: 
Root mean squared error: 412.408 and r^2 score: 0.254
Test error: 
Root mean squared error: 668.081 and r^2 score: 0.125


(668.0809885995218, 0.12501907782521782)

In [17]:
params = {'alpha' : [0.001, 0.1, 1, 10, 100]}


In [18]:
ridge_lr = Ridge(max_iter=2000, random_state = 2020)

clf_ridge = GridSearchCV(ridge_lr, params, cv =5)

clf_ridge.fit(X_train_scaled, y_train)
print('Ridge regression scores: ')
print('Train error: ')
show_scores(clf_ridge, X_train_scaled, y_train)

print('Test error: ')
show_scores(clf_ridge, X_test_scaled, y_test)

Ridge regression scores: 
Train error: 
Root mean squared error: 413.193 and r^2 score: 0.251
Test error: 
Root mean squared error: 668.456 and r^2 score: 0.124


(668.4563791374494, 0.12403550893358262)

In [19]:
clf_ridge.best_params_

{'alpha': 100}

## LGBM on this data

In [20]:
lgbm = LGBMRegressor()

lgbm.fit(X_train_scaled, y_train)

print('LGBM scores: ')

print('Train error: ')
show_scores(lgbm, X_train_scaled, y_train)

print('Test error: ')
show_scores(lgbm, X_test_scaled, y_test)

LGBM scores: 
Train error: 
Root mean squared error: 260.506 and r^2 score: 0.702
Test error: 
Root mean squared error: 596.266 and r^2 score: 0.303


(596.266345373865, 0.3030189791757213)

In [23]:
#Including state column in OHE

clean_df2 = pp.clean_categorical(clean_df.drop(columns = ['external_id']), to_drop= ['income_class', 'density_class', 'climate', 'state'])
print(clean_df2.shape)
X = clean_df2.drop(columns = ['unacast_session_count'])
y = clean_df2['unacast_session_count']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2020)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

(50120, 870)


In [24]:
lgbm = LGBMRegressor()

lgbm.fit(X_train_scaled, y_train)

print('LGBM scores: ')

print('Train error: ')
show_scores(lgbm, X_train_scaled, y_train)

print('Test error: ')
show_scores(lgbm, X_test_scaled, y_test)

LGBM scores: 
Train error: 
Root mean squared error: 256.634 and r^2 score: 0.711
Test error: 
Root mean squared error: 594.535 and r^2 score: 0.307


(594.5348319879421, 0.3070610643989915)

## Observation 
- no improvement with state columns

In [26]:
clean_df2 = pp.clean_categorical(clean_df.drop(columns = ['external_id', 'state']))
print(clean_df2.shape)
X = clean_df2.drop(columns = ['unacast_session_count'])
y = clean_df2['unacast_session_count']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2020)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

(50120, 819)


In [25]:
params = {'learning_rate' : [0.001, 0.1, 1, 10, 100], 'max_depth' : [100, 300, 500], 'n_estimators' : [100, 500, 1000] }

lgbm = LGBMRegressor(random_state = 2020)

clf_lgbm = GridSearchCV(lgbm, params, cv =5)

clf_lgbm.fit(X_train_scaled, y_train)
print('Ridge regression scores: ')
print('Train error: ')
show_scores(clf_lgbm, X_train_scaled, y_train)

print('Test error: ')
show_scores(clf_lgbm, X_test_scaled, y_test)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\saura\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-25-cf0602da4a1c>", line 7, in <module>
    clf_lgbm.fit(X_train_scaled, y_train)
  File "C:\Users\saura\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 710, in fit
    self._run_search(evaluate_candidates)
  File "C:\Users\saura\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 1151, in _run_search
    evaluate_candidates(ParameterGrid(self.param_grid))
  File "C:\Users\saura\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 689, in evaluate_candidates
    cv.split(X, y, groups)))
  File "C:\Users\saura\Anaconda3\lib\site-packages\joblib\parallel.py", line 1004, in __call__
    if self.dispatch_one_batch(iterator):
  File "C:\Users\saura\Anaconda3\lib\site-packages\joblib\parallel.py", line 835, in dispatch_

KeyboardInterrupt: 