# Loading the data
Make sure the CSV files provided by Zindi are in the same folder as this notebook. If it's running on Google Colab, you can use the 'files' tab on the left to upload them. We load the training data from Train.csv, and print out the variable descriptions here for easy reference. 

In [43]:
import pandas as pd
import numpy as np
import os, random, math, glob
from IPython.display import Image as IM
from IPython.display import clear_output
from matplotlib import pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = [16, 10]
import seaborn as sns
from sklearn.model_selection import train_test_split
from scipy.stats import boxcox

from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [13]:
df = pd.read_csv('data/Train.csv')
df.head()

Unnamed: 0,ward,total_households,total_individuals,target,dw_00,dw_01,dw_02,dw_03,dw_04,dw_05,...,pw_03,pw_04,pw_05,pw_06,pw_07,pw_08,ADM4_PCODE,lat,lon,NL
0,41601001: Ward 1,1674.45058,5888.2075,16.773757,0.933841,0.000846,0.00549,0.000676,0.0,0.001372,...,0.002848,0.007537,0.0,0.012928,0,0,ZA4161001,-29.68227,24.734743,0.292039
1,41601002: Ward 2,1736.9923,6735.33812,21.496661,0.69694,0.001253,0.004402,0.0,0.002301,0.001323,...,0.014566,0.057127,0.019092,0.004131,0,0,ZA4161002,-29.119311,24.757737,3.207775
2,41601003: Ward 3,2403.57591,7273.04995,10.931425,0.810545,0.004517,0.008891,0.003986,0.007735,0.000956,...,0.05756,0.010358,0.001421,0.040881,0,0,ZA4161003,-29.142276,25.094093,0.0
3,41601004: Ward 4,1740.78737,5734.49046,23.119257,0.659914,0.0,0.006129,0.0,0.000813,0.037245,...,0.0,0.000669,0.0,0.005011,0,0,ZA4161004,-29.372052,24.942867,2.038778
4,41601005: Ward 5,1730.51451,6657.23835,13.652252,0.950575,0.000655,0.001473,0.000598,0.006999,0.000818,...,0.004859,0.00129,0.000673,0.017629,0,0,ZA4161005,-29.409381,25.290165,0.0


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2822 entries, 0 to 2821
Data columns (total 63 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ward               2822 non-null   object 
 1   total_households   2822 non-null   float64
 2   total_individuals  2822 non-null   float64
 3   target             2822 non-null   float64
 4   dw_00              2822 non-null   float64
 5   dw_01              2822 non-null   float64
 6   dw_02              2822 non-null   float64
 7   dw_03              2822 non-null   float64
 8   dw_04              2822 non-null   float64
 9   dw_05              2822 non-null   float64
 10  dw_06              2822 non-null   float64
 11  dw_07              2822 non-null   float64
 12  dw_08              2822 non-null   float64
 13  dw_09              2822 non-null   float64
 14  dw_10              2822 non-null   float64
 15  dw_11              2822 non-null   float64
 16  dw_12              2822 

In [15]:
df.describe()

Unnamed: 0,total_households,total_individuals,target,dw_00,dw_01,dw_02,dw_03,dw_04,dw_05,dw_06,...,pw_02,pw_03,pw_04,pw_05,pw_06,pw_07,pw_08,lat,lon,NL
count,2822.0,2822.0,2822.0,2822.0,2822.0,2822.0,2822.0,2822.0,2822.0,2822.0,...,2822.0,2822.0,2822.0,2822.0,2822.0,2822.0,2822.0,2822.0,2822.0,2822.0
mean,3665.281214,12869.130053,24.507554,0.712196,0.092616,0.032043,0.006057,0.008665,0.006289,0.022375,...,0.127555,0.041589,0.019655,0.011008,0.110818,0.0,0.0,-26.88074,28.666515,17.43756
std,3266.364522,9696.690518,10.294387,0.214035,0.182852,0.080253,0.019374,0.030697,0.024606,0.037127,...,0.1564,0.057331,0.032926,0.023672,0.185401,0.0,0.0,2.021279,2.373809,18.958621
min,1.0,402.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-32.490089,16.760022,0.0
25%,1778.858235,7071.205695,16.751556,0.594212,0.002895,0.002407,0.0,0.0,0.0,0.002716,...,0.008673,0.002099,0.000715,0.000159,0.005217,0.0,0.0,-28.569019,27.707932,3.033397
50%,2398.249935,9366.98968,24.15667,0.766841,0.010425,0.005762,0.000807,0.000607,0.000865,0.008639,...,0.069065,0.016496,0.005164,0.001459,0.025165,0.0,0.0,-26.549866,28.959679,9.205572
75%,3987.080563,14241.233015,32.226553,0.881708,0.068209,0.027913,0.002538,0.002225,0.003027,0.025218,...,0.183384,0.058626,0.025055,0.009432,0.116638,0.0,0.0,-25.57213,30.441505,26.890531
max,39684.94213,91716.74637,55.528423,0.994962,0.93149,0.951806,0.264239,0.392085,0.435912,0.412936,...,1.0,0.327393,0.306787,0.228261,0.961523,0.0,0.0,-22.331267,32.858249,63.0


['dw_13', 'lan_13', 'dw_12', 'pw_08', 'pw_07'] have only 0 in them --> we could drop these features

In [16]:
# drop features with only 0 in them
df = df.drop(['dw_13', 'lan_13', 'dw_12', 'pw_08', 'pw_07', 'lat', 'lon', 'ward', 'ADM4_PCODE'], axis=1)

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2822 entries, 0 to 2821
Data columns (total 54 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   total_households   2822 non-null   float64
 1   total_individuals  2822 non-null   float64
 2   target             2822 non-null   float64
 3   dw_00              2822 non-null   float64
 4   dw_01              2822 non-null   float64
 5   dw_02              2822 non-null   float64
 6   dw_03              2822 non-null   float64
 7   dw_04              2822 non-null   float64
 8   dw_05              2822 non-null   float64
 9   dw_06              2822 non-null   float64
 10  dw_07              2822 non-null   float64
 11  dw_08              2822 non-null   float64
 12  dw_09              2822 non-null   float64
 13  dw_10              2822 non-null   float64
 14  dw_11              2822 non-null   float64
 15  psa_00             2822 non-null   float64
 16  psa_01             2822 

In [18]:
df.shape

(2822, 54)

In [19]:
# transform the data
def convert_zeros(x):
    '''
    function to convert zeros to a postive number 
    so that it can be transformed with the boxcox'''
    if x == 0.0:
        return 0.0000001
    else :
        return x


In [20]:
reduced_feature_list = ['pw_00', 'psa_00', 'car_00', 'pg_03', 'NL', 'lan_00', 'pg_00', 'target']
df_red = df[reduced_feature_list]

In [21]:


for col in reduced_feature_list:
    if col != 'target' and  col != 'psa_00' and col != 'car_00' and col != 'pg_00':
        df_red[col] = df_red[col].apply(convert_zeros)
        df_red[col] = boxcox(df_red[col])[0].reshape(-1,1);

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_red[col] = df_red[col].apply(convert_zeros)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_red[col] = boxcox(df_red[col])[0].reshape(-1,1);


In [22]:
df_red.head()

Unnamed: 0,pw_00,psa_00,car_00,pg_03,NL,lan_00,pg_00,target
0,-0.284113,0.260191,0.273727,-2.468668,-1.053425,-0.183226,0.357455,16.773757
1,-0.982978,0.290228,0.144638,-3.780784,1.361363,-0.128422,0.698428,21.496661
2,-0.951828,0.186435,0.27222,-1.716657,-3.789711,-0.573863,0.672452,10.931425
3,-0.366754,0.281229,0.127875,-5.967212,0.782549,-0.297017,0.728014,23.119257
4,-0.249689,0.196687,0.404507,-2.1263,-3.789711,-0.8702,0.753491,13.652252


In [23]:
# Train test split with same random seed
# Defining X and y
features = df_red.columns.tolist()
features.remove('target')
X = df[features]
y = df.target

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=150, shuffle=True)

# Check the shape of the data sets
print("X_train:", X_train.shape)
print("y_train:", y_train.shape)
print("X_test:", X_test.shape)
print("y_test:", y_test.shape)

X_train: (2116, 7)
y_train: (2116,)
X_test: (706, 7)
y_test: (706,)


## Model Ridge and Lasso

Ridge Regularization 

In [96]:
ridge = Ridge(max_iter=10000)
ridge.fit(X_train, y_train)
y_pred_ridge  = ridge.predict(X_test)

mse_ridge = mean_squared_error(y_test, y_pred_ridge)
rmse_ridge = np.sqrt(mse_ridge)
print('rmse using ridge :', rmse_ridge)

rmse using ridge : 4.414364106329006


In [42]:
ridge.coef_

array([-6.79966475e+00,  6.44222705e+01, -2.29101213e+01,  6.83704725e+00,
       -4.91712518e-02, -9.68590111e+00, -7.34113102e+00])

Randomized Grid Search with Ridge

In [98]:
param_rand_grid = {'alpha' : [int(x) for x in np.linspace(start = .0001, stop = 10, num = 1000)]}

g_search = RandomizedSearchCV(estimator = ridge, param_distributions = param_rand_grid, cv = 5, n_jobs = 1, verbose = 5)

g_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END ...........................alpha=4;, score=0.796 total time=   0.0s
[CV 2/5] END ...........................alpha=4;, score=0.781 total time=   0.0s
[CV 3/5] END ...........................alpha=4;, score=0.780 total time=   0.0s
[CV 4/5] END ...........................alpha=4;, score=0.796 total time=   0.0s
[CV 5/5] END ...........................alpha=4;, score=0.787 total time=   0.0s
[CV 1/5] END ...........................alpha=4;, score=0.796 total time=   0.0s
[CV 2/5] END ...........................alpha=4;, score=0.781 total time=   0.0s
[CV 3/5] END ...........................alpha=4;, score=0.780 total time=   0.0s
[CV 4/5] END ...........................alpha=4;, score=0.796 total time=   0.0s
[CV 5/5] END ...........................alpha=4;, score=0.787 total time=   0.0s
[CV 1/5] END ...........................alpha=6;, score=0.778 total time=   0.0s
[CV 2/5] END ...........................alpha=6;

RandomizedSearchCV(cv=5, estimator=Ridge(max_iter=10000), n_jobs=1,
                   param_distributions={'alpha': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]},
                   verbose=5)

In [99]:
g_search.best_params_

{'alpha': 0}

Grid Search 

In [104]:
param_grid = {'alpha' : [0.01, 0.1,0,1, 10]}

g_search = GridSearchCV(estimator = ridge, param_grid = param_grid, cv = 5, n_jobs = 1, verbose = 5)

g_search.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END ........................alpha=0.01;, score=0.838 total time=   0.0s
[CV 2/5] END ........................alpha=0.01;, score=0.822 total time=   0.0s
[CV 3/5] END ........................alpha=0.01;, score=0.814 total time=   0.0s
[CV 4/5] END ........................alpha=0.01;, score=0.832 total time=   0.0s
[CV 5/5] END ........................alpha=0.01;, score=0.800 total time=   0.0s
[CV 1/5] END .........................alpha=0.1;, score=0.837 total time=   0.0s
[CV 2/5] END .........................alpha=0.1;, score=0.822 total time=   0.0s
[CV 3/5] END .........................alpha=0.1;, score=0.813 total time=   0.0s
[CV 4/5] END .........................alpha=0.1;, score=0.831 total time=   0.0s
[CV 5/5] END .........................alpha=0.1;, score=0.801 total time=   0.0s
[CV 1/5] END ...........................alpha=0;, score=0.838 total time=   0.0s
[CV 2/5] END ...........................alpha=0;,

GridSearchCV(cv=5, estimator=Ridge(max_iter=10000), n_jobs=1,
             param_grid={'alpha': [0.01, 0.1, 0, 1, 10]}, verbose=5)

In [106]:
print(g_search.best_params_)
print(g_search.best_score_)

{'alpha': 0.01}
0.821117095050587


In [115]:
ridge = Ridge(max_iter=10000, alpha=0.0)
ridge.fit(X_train, y_train)
y_pred_ridge  = ridge.predict(X_test)

mse_ridge = mean_squared_error(y_test, y_pred_ridge)
rmse_ridge = np.sqrt(mse_ridge)
print('rmse using ridge :', rmse_ridge)

rmse using ridge : 4.395993583393601


Lasso Regularization

In [32]:
lasso = Lasso()
lasso.fit(X_train, y_train)
y_pred_lasso  = lasso.predict(X_test)

In [33]:
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
rmse_lasso= np.sqrt(mse_lasso)
print('rmse using lasso :', rmse_lasso)

rmse using lasso : 7.5201689472058355


In [35]:
lasso.coef_

array([-7.89722019,  0.        , -0.        , -0.        , -0.2077088 ,
       -0.        ,  0.        ])

## Ridge to large number of features

In [132]:
# Train test split with same random seed
# Defining X and y
features_large = df.columns.tolist()
features_large.remove('target')
X2 = df[features_large]
y2 = df.target

# Splitting the dataset
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.25, random_state=10, shuffle=True)

# Check the shape of the data sets
print("X_train:", X2_train.shape)
print("y_train:", y2_train.shape)
print("X_test:", X2_test.shape)
print("y_test:", y2_test.shape)

X_train: (2116, 53)
y_train: (2116,)
X_test: (706, 53)
y_test: (706,)


In [133]:
ridge2 = Ridge()
ridge2.fit(X2_train, y2_train)
y2_pred_ridge2  = ridge2.predict(X2_test)

mse2_ridge = mean_squared_error(y2_test, y2_pred_ridge2)
rmse2_ridge= np.sqrt(mse2_ridge)
print('rmse large features using ridge :', rmse2_ridge)

rmse large features using ridge : 4.056197818260214


In [148]:
ridge2.coef_

array([ 7.30905811e-05, -6.13550883e-05, -4.64871362e+07, -4.64871317e+07,
       -4.64871419e+07, -4.64871389e+07, -4.64871279e+07, -4.64871354e+07,
       -4.64871475e+07, -4.64871443e+07, -4.64871392e+07, -4.64871568e+07,
       -4.64871233e+07, -4.64871657e+07, -2.00089793e+07, -2.00090368e+07,
       -2.00090303e+07, -2.00090277e+07, -2.00089961e+07, -2.42058236e+10,
       -2.42058236e+10, -1.01772898e+11, -1.01772898e+11, -1.90773601e+10,
       -1.90773601e+10,  1.09267657e+07,  1.09267701e+07,  1.09267709e+07,
        1.09267690e+07,  1.09267688e+07,  1.09267737e+07,  1.09267726e+07,
        1.09267693e+07,  1.09267775e+07,  1.09267697e+07,  1.09267717e+07,
        1.09267745e+07,  1.09267710e+07,  1.09267500e+07, -2.61721515e+07,
       -2.61721489e+07, -2.61721447e+07, -2.61721383e+07, -2.61721538e+07,
        5.25908149e+00, -3.01651464e+07, -3.01651453e+07, -3.01651425e+07,
       -3.01651432e+07, -3.01651395e+07, -3.01651336e+07, -3.01651428e+07,
       -1.20648492e-02])

## Random Grid Search

In [134]:
param_rand_grid = {'alpha' : [int(x) for x in np.linspace(start = .0001, stop = 10, num = 1000)]}

g_search = RandomizedSearchCV(estimator = ridge2, param_distributions = param_rand_grid, cv = 5, n_jobs = 1, verbose = 0)

g_search.fit(X2_train, y2_train)

RandomizedSearchCV(cv=5, estimator=Ridge(), n_jobs=1,
                   param_distributions={'alpha': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]})

In [135]:
print(g_search.best_params_)
print(g_search.best_score_)

{'alpha': 0}
0.8608035446063461


## Grid Search

In [136]:
param_grid = {'alpha' : [0.001, 0.01, 0.1, 1, 10, 100]}

g_search = RandomizedSearchCV(estimator = ridge2, param_distributions = param_grid, cv = 5, n_jobs = 1, verbose = 0)

g_search.fit(X2_train, y2_train)



RandomizedSearchCV(cv=5, estimator=Ridge(), n_jobs=1,
                   param_distributions={'alpha': [0.001, 0.01, 0.1, 1, 10,
                                                  100]})

In [137]:
print(g_search.best_params_)
print(g_search.best_score_)

{'alpha': 0.1}
0.8626451015894687


## Check RMSE with the parameter obtained using Grid Search

In [141]:
ridge2 = Ridge(alpha=0.0)
ridge2.fit(X2_train, y2_train)
y2_pred_ridge2  = ridge2.predict(X2_test)

mse2_ridge = mean_squared_error(y2_test, y2_pred_ridge2)
rmse2_ridge= np.sqrt(mse2_ridge)
print('rmse large features using ridge :', rmse2_ridge)

rmse large features using ridge : 4.026699671973028


## Lasso to large number of features

In [139]:
lasso = Lasso()
lasso.fit(X2_train, y2_train)
y2_pred_lasso  = lasso.predict(X2_test)

mse2_lasso = mean_squared_error(y2_test, y2_pred_lasso)
rmse2_lasso= np.sqrt(mse2_lasso)
print('rmse using lasso :', rmse2_lasso)

rmse using lasso : 7.828585313298793


In [140]:
lasso.coef_

array([-2.24902524e-03,  7.51993981e-04, -0.00000000e+00,  0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
        0.00000000e+00, -0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
        0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
       -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00, -5.76172405e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
       -2.12948366e-01])

In [142]:
param_grid = {'alpha' : [0.001, 0.01, 0.1, 1, 10, 100]}

g_search = RandomizedSearchCV(estimator = lasso, param_distributions = param_grid, cv = 5, n_jobs = 1, verbose = 0)

g_search.fit(X2_train, y2_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


RandomizedSearchCV(cv=5, estimator=Lasso(), n_jobs=1,
                   param_distributions={'alpha': [0.001, 0.01, 0.1, 1, 10,
                                                  100]})

In [144]:
print(g_search.best_params_)
print(g_search.best_score_)

{'alpha': 0.001}
0.8625020526157137


In [146]:
lasso2 = Lasso(alpha=0.001)
lasso2.fit(X2_train, y2_train)
y2_pred_lasso2  = lasso2.predict(X2_test)

mse2_lasso2 = mean_squared_error(y2_test, y2_pred_lasso2)
rmse2_lasso2= np.sqrt(mse2_lasso2)
print('rmse using lasso :', rmse2_lasso2)

rmse using lasso : 4.018284256504949


  model = cd_fast.enet_coordinate_descent(


In [147]:
lasso2.coef_

array([ 1.07311915e-05, -3.57498750e-05,  3.16319511e+00,  7.70692701e+00,
       -2.43488956e+00,  0.00000000e+00,  1.04800951e+01,  2.37111418e+00,
       -7.71329237e+00, -4.00507799e+00,  0.00000000e+00, -1.48054300e+01,
        0.00000000e+00, -2.51667165e+01,  4.74262170e+01, -1.12810836e+01,
       -0.00000000e+00, -0.00000000e+00,  1.05609684e+01, -4.32610721e+00,
        5.04497801e-14, -3.52374679e+01,  1.11451660e-12,  1.44786226e+01,
       -4.94505587e-14, -2.95878911e+00,  1.90218900e+00,  4.11599559e-01,
       -7.66183432e-01, -1.12230621e+00,  3.77337449e+00,  2.89124466e+00,
       -6.85411855e-01,  0.00000000e+00, -1.51057736e-01,  1.82150210e+00,
        4.53690867e+00, -0.00000000e+00, -3.37559398e+00, -1.29538165e+00,
       -0.00000000e+00,  3.47391994e+00,  9.98738603e+00, -0.00000000e+00,
        5.08562636e+00, -4.41603394e+00, -2.92998927e+00,  0.00000000e+00,
        0.00000000e+00,  1.73120058e+00,  6.78823507e+00, -1.21473815e-01,
       -1.80147235e-02])