## Importing Data

Data is being loaded from Notepad

In [74]:
import pandas as pd
df = pd.read_csv('codetest_train.txt', header = 0, sep ='\t')

In [75]:
df.head()

Unnamed: 0,target,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,...,f_244,f_245,f_246,f_247,f_248,f_249,f_250,f_251,f_252,f_253
0,3.066056,-0.653,0.255,-0.615,-1.833,-0.736,,1.115,-0.171,-0.351,...,-1.607,-1.4,-0.92,-0.198,-0.945,-0.573,0.17,-0.418,-1.244,-0.503
1,-1.910473,1.179,-0.093,-0.556,0.811,-0.468,-0.005,-0.116,-1.243,1.985,...,1.282,0.032,-0.061,,-0.061,-0.302,1.281,-0.85,0.821,-0.26
2,7.830711,0.181,-0.778,-0.919,0.113,0.887,-0.762,1.872,-1.709,0.135,...,-0.237,-0.66,1.073,-0.193,0.57,-0.267,1.435,1.332,-1.147,2.58
3,-2.180862,0.745,-0.245,-1.343,1.163,-0.169,-0.151,-1.1,0.225,1.223,...,0.709,-0.203,-0.136,-0.571,1.682,0.243,-0.381,0.613,1.033,0.4
4,5.462784,1.217,-1.324,-0.958,0.448,-2.873,-0.856,0.603,0.763,0.02,...,0.892,-0.433,-0.877,0.289,0.654,1.23,0.457,-0.754,-0.025,-0.931


## Looking at data and deciding on pre-processing

In [76]:
df.isnull().any().sum()

254

Apart from the target variable all the columns have missing values

In [77]:
categorical_variables =[]
numerical_variables = []
for features in df.columns:
    if df[features].dtype.name == 'float64':
        numerical_variables.append(features)
    else :
        categorical_variables.append(features)
print 'No. of categorical varaibles: %s' %len(categorical_variables)
print 'No. of numerical varaibles: %s' %len(numerical_variables)

No. of categorical varaibles: 4
No. of numerical varaibles: 251


Since all the variables are missing , we start with imputation of numerical variables with mean (as the numbers are contnuous) and categorical variables with their mode.

In [78]:
for features in numerical_variables:
    df[features]= df[features].fillna(df[features].mean(axis=0))
for features in categorical_variables:
    df[features] = df[features].fillna(df[features].mode().iloc[0])

In [79]:
df.isnull().any().sum()

0

In [80]:
df[categorical_variables].mode()

Unnamed: 0,f_61,f_121,f_215,f_237
0,d,B,red,Canada


In [81]:
y= df['target']
df_new= df.drop(['target'],axis=1)
numerical_variables.remove('target')

In [82]:
def unfold_categorical(df1,column,df2):
    i=1
    for values in df1[column].unique():
        string = str(column) + '_' + str(i)
        df2[string]=df1[column].apply(lambda x:+1 if x==values else 0)
        i+=1

def normalise_numerical(df, column):
    df[column]=df[column]/float((df[column].max() - df[column].min()))

In [83]:
df_unfolded = df_new[numerical_variables]
for features in numerical_variables:
    normalise_numerical(df_unfolded,features)
for features in categorical_variables:
    unfold_categorical(df_new, features, df_unfolded)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [84]:
df_unfolded[['f_215_2', 'f_215_1']]
# What do this values mean? My guess is that it was assumed to encode categorical values, e.g [a, b, c] -> [0, 1, 2] 
# and replace categorical values with numeric ones through 'unfold_categorical' function, but it was implemented wrong. 

Unnamed: 0,f_215_2,f_215_1
0,0,1
1,1,0
2,0,0
3,1,0
4,0,0
5,1,0
6,0,0
7,0,1
8,0,0
9,1,0


 14 more features added due to unfolding of categorical_vaiables 
 ## Comment
 What does unfolding mean here? We add extra features here which may have huge effect (and probably the do) 

In [86]:
df_unfolded.head()

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_121_4,f_121_5,f_121_6,f_215_1,f_215_2,f_215_3,f_215_4,f_237_1,f_237_2,f_237_3
0,-0.08402,0.032513,-0.087644,-0.218318,-0.113126,0.000667,0.151804,-0.022242,-0.043592,-0.014739,...,0,0,0,1,0,0,0,1,0,0
1,0.151698,-0.011858,-0.079236,0.096594,-0.071934,-0.000599,-0.015793,-0.161681,0.246523,-0.116431,...,0,0,0,0,1,0,0,1,0,0
2,0.023289,-0.099197,-0.130968,0.013459,0.136336,-0.09129,0.254867,-0.222294,0.016766,0.124657,...,0,0,0,0,0,1,0,1,0,0
3,0.095857,-0.031238,-0.191392,0.138518,-0.025976,-0.01809,-0.149762,0.029266,0.151888,0.071527,...,1,0,0,0,1,0,0,0,1,0
4,0.156588,-0.168813,-0.136526,0.053359,-0.441592,-0.102552,0.082097,0.099246,0.002484,-0.034049,...,0,1,0,0,0,1,0,1,0,0


From above, note that new features are not normalized. We should call unfold_categorical before norlamize..

In [87]:
df_unfolded['f_215_1']

0       1
1       0
2       0
3       0
4       0
5       0
6       0
7       1
8       0
9       0
10      0
11      1
12      1
13      1
14      0
15      0
16      0
17      0
18      0
19      1
20      0
21      0
22      1
23      0
24      1
25      1
26      0
27      0
28      0
29      1
       ..
4970    0
4971    0
4972    1
4973    1
4974    0
4975    0
4976    0
4977    0
4978    0
4979    1
4980    0
4981    0
4982    0
4983    0
4984    0
4985    1
4986    0
4987    1
4988    0
4989    0
4990    0
4991    0
4992    1
4993    0
4994    0
4995    0
4996    1
4997    0
4998    0
4999    1
Name: f_215_1, dtype: int64

## Training the dataset

In [88]:
from sklearn.cross_validation import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(df_unfolded, y, 
                                                    random_state = 0,train_size = 0.8)

In [89]:
import math
def get_root_mean_squared_error(model, data, outcome):
    predictions = model.predict(data)
    error = outcome - predictions
    RSS = (error*error).sum()
    MSE = RSS/float(len(data))
    RMSE = math.sqrt(MSE)
    return(RMSE)

### Linear Regression 

In [90]:
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import KFold

n_folds =20
cv = KFold(n=X_train.shape[0], n_folds=n_folds)
linear_regression = LinearRegression()
X_train_array = X_train.as_matrix()
Y_train_array = Y_train.as_matrix()
cross_val_scores = 0

for train, test in cv:
    linear_regression.fit(X_train_array[train], Y_train_array[train])
    cross_val_scores += get_root_mean_squared_error(linear_regression, X_train_array[test], Y_train_array[test])
cross_val_scores /= n_folds


print 'Cross_Vaildation Error is : %s' %(cross_val_scores)
print 'Train Error is : %s' %(get_root_mean_squared_error(linear_regression, X_train, Y_train))
print 'Test Error is : %s' %(get_root_mean_squared_error(linear_regression, X_test, Y_test)) 

Cross_Vaildation Error is : 3.50609997532
Train Error is : 3.28455531852
Test Error is : 3.5292616866


### Ridge Regression

In [91]:
from sklearn.linear_model import Ridge

alphas = [1e-6 , 1e-3, 1e0, 1e3, 1e6]
tols = [1e-1, 1e-3, 1e-5]
least_error = 10000000

for alpha in alphas:
    for tol in tols:
        model = Ridge (alpha = alpha , tol=tol)
        model.fit(X_train, Y_train)
        test_error = get_root_mean_squared_error(model, X_test, Y_test)
        if test_error < least_error:
            best_model = model
            least_error = test_error

print best_model
print 'Train_error : %s | Test_Error: %s' %(get_root_mean_squared_error(best_model, X_train, Y_train), 
                                            get_root_mean_squared_error(best_model, X_test, Y_test))

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.1)
Train_error : 3.27356310527 | Test_Error: 3.49754521657


#### K-Fold Cross Validation for Ridge

In [15]:
from sklearn.linear_model import Ridge

n_folds =20
alphas = [1e-6 , 1e-3, 1e0, 1e3, 1e6]
tols = [1e-3, 1e-5]
least_error = 10000000
cv = KFold(n=X_train.shape[0], n_folds= n_folds)
X_train_array = X_train.as_matrix()
Y_train_array = Y_train.as_matrix()
cross_val_scores = 0

for train, test in cv:
    for alpha in alphas:
        for tol in tols:
            model = Ridge (alpha = alpha , tol=tol)
            model.fit(X_train_array[train], Y_train_array[train])
            test_error = get_root_mean_squared_error(model, X_train_array[test], Y_train_array[test])
            cross_val_scores += test_error
            if test_error < least_error:
                best_model = model
                least_error = test_error
cross_val_scores /= n_folds         
                
print best_model
print 'Overall Cross_Vaildation Score : %s' %(cross_val_scores)
print 'Train_error : %s | Test_Error: %s' %(get_root_mean_squared_error(best_model, X_train, Y_train), 
                                            get_root_mean_squared_error(best_model, X_test, Y_test))

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)
Overall Cross_Vaildation Score : 41.132111487
Train_error : 3.27796803156 | Test_Error: 3.49506542261


### Lasso Regression

In [16]:
from sklearn.linear_model import Lasso

alphas = [1e-6 , 1e-3, 1e0, 1e3, 1e6]
tols = [1e-3, 1e-5]
least_error = 10000000

for alpha in alphas:
    for tol in tols:
        model = Lasso (alpha = alpha , tol=tol)
        model.fit(X_train, Y_train)
        test_error = get_root_mean_squared_error(model, X_test, Y_test)
        if test_error < least_error:
            best_model = model
            least_error = test_error
            

print best_model
print 'Train_error : %s | Test_Error: %s' %(get_root_mean_squared_error(best_model, X_train, Y_train), 
                                            get_root_mean_squared_error(best_model, X_test, Y_test))

Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.001, warm_start=False)
Train_error : 3.27500627921 | Test_Error: 3.47452934824




#### KFold Classification with Lasso

In [17]:
from sklearn.linear_model import Lasso

n_folds = 20
alphas = [1e-6 , 1e-3, 1e0, 1e3, 1e6]
tols = [1e-3, 1e-5]
least_error = 10000000
cv = KFold(n=X_train.shape[0], n_folds= n_folds)
X_train_array = X_train.as_matrix()
Y_train_array = Y_train.as_matrix()
cross_val_scores = 0

for train, test in cv:
    for alpha in alphas:
        for tol in tols:
            model = Lasso (alpha = alpha , tol=tol)
            model.fit(X_train_array[train], Y_train_array[train])
            test_error = get_root_mean_squared_error(model, X_train_array[test], Y_train_array[test])
            cross_val_scores += test_error
            if test_error < least_error:
                best_model = model
                least_error = test_error
cross_val_scores /= n_folds         
                
print best_model
print 'Overall Cross_Vaildation Score : %s' %(cross_val_scores)
print 'Train_error : %s | Test_Error: %s' %(get_root_mean_squared_error(best_model, X_train, Y_train), 
                                            get_root_mean_squared_error(best_model, X_test, Y_test))

Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=1e-05, warm_start=False)
Overall Cross_Vaildation Score : 45.3804886321
Train_error : 3.27797752879 | Test_Error: 3.47177061053


In [70]:
import cPickle
cPickle.dump(best_model, open('Best_Model.pkl','wb'))

### Day 2 : PCA & Regression combined

In [18]:
from sklearn.linear_model import Ridge
from sklearn.decomposition import PCA

alphas = [1e-6 , 1e-3, 1e0, 1e3, 1e6]
n_components = [120, 180, 240, len(X_train.columns)]
least_error = 10000000

for alpha in alphas:
    for component  in n_components:
        pca = PCA(n_components=component)
        pca = pca.fit(X_train)
        X_train_new = pca.transform(X_train)
        X_test_new = pca.transform(X_test)
        model = Ridge (alpha = alpha)
        model.fit(X_train_new, Y_train)
        test_error = get_root_mean_squared_error(model, X_test_new, Y_test)
        if test_error < least_error:
            least_component = component
            best_model = model
            least_error = test_error
            
print 'No of components in the best model: %s'%(least_component)
print best_model
print 'Train_error : %s | Test_Error: %s' %(get_root_mean_squared_error(best_model, X_train_new, Y_train), 
                                            get_root_mean_squared_error(best_model, X_test_new, Y_test))

No of components in the best model: 268
Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)
Train_error : 3.27356310527 | Test_Error: 3.49754521657


#### PCA + Ridge-Regression + K-Fold Validation

In [19]:
from sklearn.linear_model import Ridge
from sklearn.decomposition import PCA

n_folds = 20
alphas = [1e-6 , 1e-3, 1e0, 1e3, 1e6]
n_components = [120, 180, 240, len(X_train.columns)]
least_error = 10000000
cv = KFold(n=X_train.shape[0], n_folds= n_folds)
X_train_array = X_train.as_matrix()
Y_train_array = Y_train.as_matrix()
cross_val_scores = 0


for train, test in cv:
    for alpha in alphas:
        for components  in n_components:
            pca = PCA(n_components=components)
            pca = pca.fit(X_train_array[train])
            X_train_new = pca.transform(X_train_array[train])
            X_test_new = pca.transform(X_train_array[test])
            model = Ridge(alpha = alpha)
            model.fit(X_train_new, Y_train_array[train])
            test_error = get_root_mean_squared_error(model, X_test_new, Y_train_array[test])
            cross_val_scores += test_error
            if test_error < least_error:
                least_component = component
                best_model = model
                least_error = test_error
                best_pca = pca
cross_val_scores /= n_folds
            
print 'No of components in the best model: %s'%(least_component)
print best_model
print 'Overall Cross_Validation Score: %s' %(cross_val_scores)
print 'Train_error : %s | Test_Error: %s' %(get_root_mean_squared_error(best_model, best_pca.transform(X_train), Y_train), 
                                            get_root_mean_squared_error(best_model, best_pca.transform(X_test), Y_test))

No of components in the best model: 268
Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)
Overall Cross_Validation Score: 84.4503292113
Train_error : 3.27796803156 | Test_Error: 3.49506542261


#### PCA + Lasso

In [21]:
from sklearn.linear_model import Lasso
from sklearn.decomposition import PCA

alphas = [1e-6 , 1e-3, 1e0, 1e3, 1e6]
n_components = [120, 180, 240, len(X_train.columns)]
least_error = 10000000

models_PCA_Lasso =[]
for alpha in alphas:
    for components  in n_components:
        pca = PCA(n_components=components)
        pca = pca.fit(X_train)
        X_train_new = pca.transform(X_train)
        X_test_new = pca.transform(X_test)
        model = Lasso(alpha = alpha)
        model.fit(X_train_new, Y_train)
        test_error = get_root_mean_squared_error(model, X_test_new, Y_test)
        if test_error < least_error:
            least_component = component
            best_model = model
            least_error = test_error
            
print 'No of components in the best model: %s'%(least_component)
print best_model
print 'Train_error : %s | Test_Error: %s' %(get_root_mean_squared_error(best_model, X_train_new, Y_train), 
                                            get_root_mean_squared_error(best_model, X_test_new, Y_test))

No of components in the best model: 268
Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
Train_error : 3.27512492286 | Test_Error: 3.48935788896


#### PCA + Lasso + K-Fold

In [22]:
from sklearn.linear_model import Lasso
from sklearn.decomposition import PCA

n_folds = 20
alphas = [1e-6 , 1e-3, 1e0, 1e3, 1e6]
n_components = [120, 180, 240, len(X_train.columns)]
least_error = 10000000
cv = KFold(n=X_train.shape[0], n_folds= n_folds)
X_train_array = X_train.as_matrix()
Y_train_array = Y_train.as_matrix()
cross_val_scores = 0


for train, test in cv:
    for alpha in alphas:
        for components  in n_components:
            pca = PCA(n_components=components)
            pca = pca.fit(X_train_array[train])
            X_train_new = pca.transform(X_train_array[train])
            X_test_new = pca.transform(X_train_array[test])
            model = Lasso(alpha = alpha)
            model.fit(X_train_new, Y_train_array[train])
            test_error = get_root_mean_squared_error(model, X_test_new, Y_train_array[test])
            cross_val_scores += test_error
            if test_error < least_error:
                least_component = component
                best_model = model
                least_error = test_error
                best_pca = pca
cross_val_scores /= n_folds
            
print 'No of components in the best model: %s'%(least_component)
print best_model
print 'Overall Cross_Validation Score: %s' %(cross_val_scores)
print 'Train_error : %s | Test_Error: %s' %(get_root_mean_squared_error(best_model, best_pca.transform(X_train), Y_train), 
                                            get_root_mean_squared_error(best_model, best_pca.transform(X_test), Y_test))

No of components in the best model: 268
Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
Overall Cross_Validation Score: 92.2807306963
Train_error : 3.27848676588 | Test_Error: 3.48700479959


### Decision Tree

In [24]:
from sklearn.tree import DecisionTreeRegressor

depths =[10, 20, 50, 100]
min_samples_splits =[50, 100, 200, 1000]
least_error =100000000

for depth in depths:
    for min_samples_split in min_samples_splits:
        model = DecisionTreeRegressor (criterion = 'mse', max_depth = depth , min_samples_split=min_samples_split)
        model.fit(X_train, Y_train)
        test_error = get_root_mean_squared_error(model, X_test, Y_test)
        if test_error < least_error:
            best_model = model
            least_error = test_error
            

print best_model
print 'Train_error : %s | Test_Error: %s' %(get_root_mean_squared_error(best_model, X_train, Y_train), 
                                            get_root_mean_squared_error(best_model, X_test, Y_test))

DecisionTreeRegressor(criterion='mse', max_depth=20, max_features=None,
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=200,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')
Train_error : 3.5077742089 | Test_Error: 3.9443202025


In [26]:
from sklearn.tree import DecisionTreeRegressor

n_folds = 20
depths =[10, 20, 50, 100]
min_samples_splits =[50, 100, 200, 1000]
least_error =100000000
cv = KFold(n=X_train.shape[0], n_folds= n_folds)
X_train_array = X_train.as_matrix()
Y_train_array = Y_train.as_matrix()
cross_val_scores = 0


for train, test in cv:
    for depth in depths:
        for min_samples_split in min_samples_splits:
            model = DecisionTreeRegressor (criterion = 'mse', max_depth = depth , min_samples_split=min_samples_split)
            model.fit(X_train_array[train], Y_train_array[train])
            test_error = get_root_mean_squared_error(model, X_train_array[test], Y_train_array[test])
            cross_val_scores += test_error
            if test_error < least_error:
                best_model = model
                least_error = test_error
cross_val_scores /= n_folds            

print best_model
print 'Overall_CV_Score:%s' %cross_val_scores
print 'Train_error : %s | Test_Error: %s' %(get_root_mean_squared_error(best_model, X_train, Y_train), 
                                            get_root_mean_squared_error(best_model, X_test, Y_test))

DecisionTreeRegressor(criterion='mse', max_depth=10, max_features=None,
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=200,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')
Overall_CV_Score:70.3052929645
Train_error : 3.46488984255 | Test_Error: 4.01759501365


In [27]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.decomposition import PCA

depths =[10, 20, 50, 100]
min_samples_splits =[50, 100, 200, 1000]
n_components = [120, 180, 240, len(X_train.columns)]
least_error = 10000000


for components  in n_components:
    for min_sample_split in min_samples_splits:
        for depth in depths:
            pca = PCA(n_components=components)
            pca = pca.fit(X_train)
            X_train_new = pca.transform(X_train)
            X_test_new = pca.transform(X_test)
            model = DecisionTreeRegressor (criterion = 'mse', max_depth = depth , min_samples_split=min_samples_split)
            model.fit(X_train_new, Y_train)
            test_error = get_root_mean_squared_error(model, X_test_new, Y_test)
            if test_error < least_error:
                least_component = component
                best_model = model
                least_error = test_error
                best_pca =pca
            
print 'No of components in the best model: %s'%(least_component)
print best_model
print 'Train_error : %s | Test_Error: %s' %(get_root_mean_squared_error(best_model, best_pca.transform(X_train), Y_train), 
                                            get_root_mean_squared_error(best_model, best_pca.transform(X_test), Y_test))

No of components in the best model: 268
DecisionTreeRegressor(criterion='mse', max_depth=20, max_features=None,
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=1000,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')
Train_error : 4.39615326061 | Test_Error: 4.51933083642
