In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split 
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from lightgbm.sklearn import LGBMRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder

# import other functions
from imputer import *
from feature_eng import *
from drop import *

## Summary

|Model| Test MAE| Test MSE|
|-----|---------|---------|
| Simple LR | 
| Ridge with gridsearch|
| Simple LGBM | 
| LGBM with gridsearch | 
| LGBM with log transformed y | 


**Comments -** 


In [2]:
df = pd.read_csv("../data/train_data.zip")

In [3]:
df.head()

Unnamed: 0,external_id,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,avg_wind_9_10,avg_wind_10_11,avg_wind_11_12,avg_wind_12_above,perfect_days,unacast_session_count,hpi,state_and_local_amount_per_capita,state_amount_per_capita,local_amount_per_capita
0,1804425,8,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,90.0,244.2,0.157475,0.009783,0.147692
1,1812706,2,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
2,1812706,3,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
3,1812706,11,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,24.0,258.95,0.157475,0.009783,0.147692
4,1812706,9,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,12.0,258.95,0.157475,0.009783,0.147692


## Pre-processing data

In [4]:
# Remove playgrounds with 'external_id' == 'CA00070678'
df = df[df['external_id'] != 'CA00070678']

In [5]:
# create X and y
X = df.drop('unacast_session_count', axis=1)
y = df.loc[:, 'unacast_session_count']
# split the data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                    test_size=0.2,
                                                      random_state=2020)
# impute NaN values
result = impute_data(X_train, X_valid)
X_train = result[0]
X_valid = result[1] 
# perform feature eng
X_train = comb_cols(X_train)
X_valid = comb_cols(X_valid)
# perform dropping
X_train = drop_columns(X_train)
X_valid = drop_columns(X_valid)
# perform OHE (climate, density_class, income_class)
X_train = clean_categorical(X_train)
X_valid = clean_categorical(X_valid)

In [6]:
def show_scores(model, X, y, error = 'mse'):
    """
    Shows the mean squared error and mean absolute error for a given model
    and predictors and response
    
    Parameters
    ----------
    model: The sklearn model object
    X: numpy.ndarray        
        The predictors(independent variables) part of the data
    y: numpy.ndarray
        The response(target variable)of the data
    error: string,
        'mse' or 'mae' depending upon the type of error
        we are interested in
        
    Returns
    ------- 
    """            
    y_preds = model.predict(X)
    
    if error == 'mse':
        rmse = mean_squared_error(y, y_preds)
        print("Root mean squared error: %0.3f" % rmse)
    if error == 'mae':
        mae = mean_absolute_error(y, y_preds)
        print("Mean absolute error: %0.3f" % mae)
    else:
        print("Wrong choice")

## Modelling with Linear regression

In [7]:
print(X_train.shape)
print(X_valid.shape)

(40080, 632)
(10020, 632)


In [27]:
# counts = X_train.describe().loc['count', :]
X_train.fillna(0, inplace = True)

### 1. Linear Regression

In [28]:
lr = LinearRegression()
lr.fit(X_train, y_train)
print('Simple linear regression scores: ')
print('Train error: ')
show_scores(lr, X_train, y_train)

print('Test error: ')    
show_scores(lr, X_valid, y_valid)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

### Observations 



### 4. Ridge L2

In [25]:
ridge_lr = Ridge(max_iter=2000, random_state = 2020)
ridge_lr.fit(X_train, y_train)
print('Ridge regression scores: ')
print('Train error: ')
show_scores(ridge_lr, X_train, y_train)

print('Test error: ')    
show_scores(ridge_lr, X_valid, y_valid)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

### Observations


In [None]:
params = {'alpha' : [0.001, 0.1, 1, 10, 100]}

In [None]:
ridge_lr = Ridge(max_iter=2000, random_state = 2020)

clf_ridge = GridSearchCV(ridge_lr, params, cv =5)

clf_ridge.fit(X_train, y_train)
print('Ridge regression scores: ')
print('Train error: ')
show_scores(clf_ridge, X_train, y_train)

print('Test error: ')
show_scores(clf_ridge, X_valid, y_valid)

In [None]:
clf_ridge.best_params_

## LGBM on this data with Mean objective function

In [None]:
lgbm = LGBMRegressor(random_state = 2020)

lgbm.fit(X_train, y_train)

print('LGBM scores: ')

print('Train error: ')
show_scores(lgbm, X_train, y_train)

print('Test error: ')
show_scores(lgbm, X_valid, y_valid)

**performing grid search**

In [None]:
params = {'learning_rate' : [0.01, 0.1, 1, 10, 100], 'max_depth' : [100, 500], 'n_estimators' : [100, 500] }

lgbm = LGBMRegressor(random_state = 2020)

clf_lgbm = GridSearchCV(lgbm, params, cv =5)

clf_lgbm.fit(X_train, y_train)
print('LGBM scores: ')
print('Train error: ')
show_scores(clf_lgbm, X_train, y_train)

print('Test error: ')
show_scores(clf_lgbm, X_valid, y_valid)

In [None]:
clf_lgbm.best_params_

## LGBM on this data with Median objective function

In [None]:
#fitting lgbm with MAE without scaling
lgbm = LGBMRegressor(objective = 'mae', random_state = 2020)

lgbm.fit(X_train, y_train)
print('LGBM scores: ')
print('Train error: ')
show_scores(lgbm, X_train, y_train)

print('Test error: ')
show_scores(lgbm, X_valid, y_valid)

In [None]:
#fitting lgbm with MAE with scaling
lgbm = LGBMRegressor(objective = 'mae', random_state = 2020)

lgbm.fit(X_train, y_train)
print('LGBM regression scores: ')
print('Train error: ')
show_scores(lgbm, X_train, y_train)

print('Test error: ')
show_scores(lgbm, X_valid, y_valid)

## LGBM with log transformed target and Mean objective function

In [None]:
y_log = np.log(y[y>0])

plt.hist(y_log, bins = 50)
plt.show()

In [None]:
lgbm = LGBMRegressor(random_state = 2020)

## LGBM with log transformed target and Median objective function

In [None]:
lgbm = LGBMRegressor(objective = 'mae', random_state = 2020)