In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

#for feature selection
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# Show every column when a DataFrame is displayed (None removes the column limit).
# Call set_option on the pandas namespace directly; `pd.pandas` is only an
# incidental self-reference to the module, not part of the public API.
pd.set_option('display.max_columns', None)

In [2]:
# Read the training CSV into a DataFrame and preview its first five rows.
train_csv = 'processed_train.csv'
dataset = pd.read_csv(train_csv)
dataset.head(n=5)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Cabin,Age_na,Fare_na,Sex_female,Sex_male,Embarked_C,Embarked_Missing,Embarked_Q,Embarked_S
0,0,1.0,0.287881,0.125,0.0,0.162639,1.921053,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,1,0.0,0.498879,0.125,0.0,0.612131,1.065789,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,1,1.0,0.34063,0.0,0.0,0.180146,1.921053,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,1,0.0,0.459317,0.125,0.0,0.554219,0.723684,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0,1.0,0.459317,0.0,0.0,0.183223,1.921053,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [3]:
# Target: the 'Survived' column, kept as a single-column DataFrame.
y_train = dataset['Survived'].to_frame(name='Survived')
# Predictors: every remaining column of the dataset.
X_train = dataset.drop(columns=['Survived'])

In [4]:
# (rows, columns) of the predictor frame and of the target frame.
(X_train.shape, y_train.shape)

((891, 14), (891, 1))

In [5]:
# Feature selection with an L1-penalised (Lasso) linear model.
# alpha sets the penalty strength: the larger the alpha, the more
# coefficients are shrunk to zero and the fewer features are selected.
#
# SelectFromModel wraps the estimator and, once fitted, keeps the
# features whose coefficients were not shrunk to zero.

lasso_estimator = Lasso(alpha=0.005, random_state=0)
feature_sel_model = SelectFromModel(lasso_estimator)
feature_sel_model.fit(X_train, y_train)

SelectFromModel(estimator=Lasso(alpha=0.005, copy_X=True, fit_intercept=True,
                                max_iter=1000, normalize=False, positive=False,
                                precompute=False, random_state=0,
                                selection='cyclic', tol=0.0001,
                                warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [6]:
# Boolean mask over the columns of X_train: True where the feature was kept.
feature_sel_model.get_support(indices=False)

array([ True,  True,  True, False, False,  True,  True, False,  True,
       False, False, False, False,  True])

In [7]:
# Report how many features the selector kept and how many it dropped.

support_mask = feature_sel_model.get_support()
selected_feat = X_train.columns[support_mask]

n_total = X_train.shape[1]
n_selected = len(selected_feat)

print('Total Features: {}'.format(n_total))
print('Selected Features: {}'.format(n_selected))
print('features with coefficients shrank to zero: {}'.format(n_total - n_selected))

Total Features: 14
Selected Features: 7
features with coefficients shrank to zero: 7


In [8]:
# Display the names of the columns retained by the selector.
selected_feat

Index(['Pclass', 'Age', 'SibSp', 'Cabin', 'Age_na', 'Sex_female',
       'Embarked_S'],
      dtype='object')

In [9]:
# Keep only the columns chosen by the selector, then preview the reduced frame.
X_train = X_train.loc[:, selected_feat]
X_train.head()

Unnamed: 0,Pclass,Age,SibSp,Cabin,Age_na,Sex_female,Embarked_S
0,1.0,0.287881,0.125,1.921053,0.0,0.0,1.0
1,0.0,0.498879,0.125,1.065789,0.0,1.0,0.0
2,1.0,0.34063,0.0,1.921053,0.0,1.0,1.0
3,0.0,0.459317,0.125,0.723684,0.0,1.0,1.0
4,1.0,0.459317,0.0,1.921053,0.0,0.0,1.0


In [10]:
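# Optional: save the selected features and the target as two separate files.
# Disabled because the combined frame is written to 'lasso_dataset_train.csv' further down.
#X_train.to_csv('lasso_xtrain.csv',index = False)
#y_train.to_csv('lasso_ytrain.csv' , index = False)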

In [10]:
# Place the selected features and the target side by side in one frame.
# Both indexes are reset first so the rows line up by position.
features_part = X_train.reset_index(drop=True)
target_part = y_train.reset_index(drop=True)
final_data = pd.concat([features_part, target_part], axis='columns')
final_data.head()

Unnamed: 0,Pclass,Age,SibSp,Cabin,Age_na,Sex_female,Embarked_S,Survived
0,1.0,0.287881,0.125,1.921053,0.0,0.0,1.0,0
1,0.0,0.498879,0.125,1.065789,0.0,1.0,0.0,1
2,1.0,0.34063,0.0,1.921053,0.0,1.0,1.0,1
3,0.0,0.459317,0.125,0.723684,0.0,1.0,1.0,1
4,1.0,0.459317,0.0,1.921053,0.0,0.0,1.0,0


In [11]:
# Write the features-plus-target frame to disk without the row index.
output_csv = 'lasso_dataset_train.csv'
final_data.to_csv(output_csv, index=False)