In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


### Understanding the Data

In [2]:
import pandas as pd
# Load the Titanic training set and list its columns.
train_data = pd.read_csv('/kaggle/input/titanic/train.csv')
print(train_data.columns)

# Split the training data into the target Y (the 'Survived' column) and the
# predictors X (every remaining column, i.e. the variables used to predict Y).
Y = train_data['Survived']
X = train_data.drop(columns=['Survived'])

# Peek at the first four rows of each part.
print("Independent variables(X): ", X.head(4))
print("To be predicted variables(Y): ", Y.head(4))


Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Independent variables(X):     PassengerId  Pclass                                               Name  \
0            1       3                            Braund, Mr. Owen Harris   
1            2       1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2            3       3                             Heikkinen, Miss. Laina   
3            4       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   

      Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked  
0    male  22.0      1      0         A/5 21171   7.2500   NaN        S  
1  female  38.0      1      0          PC 17599  71.2833   C85        C  
2  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S  
3  female  35.0      1      0            113803  53.1000  C123        S  
To be predicted variables(Y):  0    0
1    1
2    1
3    1
Name:

### Feature Selection

-- <font color = 'green'>Finding the features in the dataset X that are relevant to predicting the Y value.</font>

In [3]:

# Show the dtype of every column; the `object` columns are the candidates
# for one-hot encoding.
print(X.dtypes)

# Per the dtypes output, Name, Sex, Ticket, Cabin and Embarked are object
# columns. Name is assumed not to be a useful predictor, so it is not encoded
# (Ticket is not examined here either). For Cabin, Sex and Embarked we list
# the distinct values to judge whether one-hot encoding is appropriate.

# NOTE(review): despite its name, `cabin_count` holds the array of distinct
# Cabin values, not a count; it is not used elsewhere in this cell.
cabin_count = pd.unique(X['Cabin'])

# Print the distinct values of each candidate categorical column.
for column in ("Cabin", "Sex", "Embarked"):
    print(X[column].unique())

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
[nan 'C85' 'C123' 'E46' 'G6' 'C103' 'D56' 'A6' 'C23 C25 C27' 'B78' 'D33'
 'B30' 'C52' 'B28' 'C83' 'F33' 'F G73' 'E31' 'A5' 'D10 D12' 'D26' 'C110'
 'B58 B60' 'E101' 'F E69' 'D47' 'B86' 'F2' 'C2' 'E33' 'B19' 'A7' 'C49'
 'F4' 'A32' 'B4' 'B80' 'A31' 'D36' 'D15' 'C93' 'C78' 'D35' 'C87' 'B77'
 'E67' 'B94' 'C125' 'C99' 'C118' 'D7' 'A19' 'B49' 'D' 'C22 C26' 'C106'
 'C65' 'E36' 'C54' 'B57 B59 B63 B66' 'C7' 'E34' 'C32' 'B18' 'C124' 'C91'
 'E40' 'T' 'C128' 'D37' 'B35' 'E50' 'C82' 'B96 B98' 'E10' 'E44' 'A34'
 'C104' 'C111' 'C92' 'E38' 'D21' 'E12' 'E63' 'A14' 'B37' 'C30' 'D20' 'B79'
 'E25' 'D46' 'B73' 'C95' 'B38' 'B39' 'B22' 'C86' 'C70' 'A16' 'C101' 'C68'
 'A10' 'E68' 'B41' 'A20' 'D19' 'D50' 'D9' 'A23' 'B50' 'A26' 'D48' 'E58'
 'C126' 'B

### One hot encoding

--<font color = 'green'> Based on the output above, the features Cabin, Sex and Embarked are categorical, so let's convert them to numerical data with one-hot encoding. </font>

In [4]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Create the encoder with its default settings and echo its repr.
onehot_encode = OneHotEncoder()
print(onehot_encode)

# Fit the encoder on the three categorical columns and convert the encoded
# result to a dense array via .toarray().
categorical_columns = X[["Cabin", "Sex", "Embarked"]]
feature_array = onehot_encode.fit_transform(categorical_columns).toarray()

# `categories_` holds one array of category labels per encoded column, in the
# same order as the columns above. Flatten it into a single list so it can be
# used as column names for the encoded array.
# NOTE(review): missing values show up as their own `nan` category for both
# Cabin and Embarked (see the column listing printed further down), so this
# list contains a duplicated `nan` label and the labels carry no feature
# prefix (e.g. 'C' is an Embarked value, not a Cabin one).
feature_labels = onehot_encode.categories_
flattened_label = [label for per_column in feature_labels for label in per_column]

print(flattened_label)

OneHotEncoder()
['A10', 'A14', 'A16', 'A19', 'A20', 'A23', 'A24', 'A26', 'A31', 'A32', 'A34', 'A36', 'A5', 'A6', 'A7', 'B101', 'B102', 'B18', 'B19', 'B20', 'B22', 'B28', 'B3', 'B30', 'B35', 'B37', 'B38', 'B39', 'B4', 'B41', 'B42', 'B49', 'B5', 'B50', 'B51 B53 B55', 'B57 B59 B63 B66', 'B58 B60', 'B69', 'B71', 'B73', 'B77', 'B78', 'B79', 'B80', 'B82 B84', 'B86', 'B94', 'B96 B98', 'C101', 'C103', 'C104', 'C106', 'C110', 'C111', 'C118', 'C123', 'C124', 'C125', 'C126', 'C128', 'C148', 'C2', 'C22 C26', 'C23 C25 C27', 'C30', 'C32', 'C45', 'C46', 'C47', 'C49', 'C50', 'C52', 'C54', 'C62 C64', 'C65', 'C68', 'C7', 'C70', 'C78', 'C82', 'C83', 'C85', 'C86', 'C87', 'C90', 'C91', 'C92', 'C93', 'C95', 'C99', 'D', 'D10 D12', 'D11', 'D15', 'D17', 'D19', 'D20', 'D21', 'D26', 'D28', 'D30', 'D33', 'D35', 'D36', 'D37', 'D45', 'D46', 'D47', 'D48', 'D49', 'D50', 'D56', 'D6', 'D7', 'D9', 'E10', 'E101', 'E12', 'E121', 'E17', 'E24', 'E25', 'E31', 'E33', 'E34', 'E36', 'E38', 'E40', 'E44', 'E46', 'E49', 'E50', 'E5

In [5]:
print(X.shape)
# Wrap the dense one-hot array in a DataFrame, using the flattened category
# labels as the column names.
# The index is taken from X explicitly: the following cell joins this frame to
# X with pd.concat(axis=1), which aligns rows by index label rather than by
# position. With X's default RangeIndex the result is identical to leaving the
# index unset, but this keeps the rows aligned if X is ever filtered or
# re-indexed before this cell runs.
features = pd.DataFrame(feature_array, columns=flattened_label, index=X.index)
features.head()


(891, 11)


Unnamed: 0,A10,A14,A16,A19,A20,A23,A24,A26,A31,A32,...,F4,G6,T,NaN,female,male,C,Q,S,NaN.1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


In [6]:
# Append the one-hot encoded columns to the right of the original predictors
# (column-wise concatenation; rows are matched on their index labels).
X_new = pd.concat([X, features], axis="columns")

# Report the combined column index and the resulting shape.
print(X_new.columns, X_new.shape, sep="\n")

Index(['PassengerId',      'Pclass',        'Name',         'Sex',
               'Age',       'SibSp',       'Parch',      'Ticket',
              'Fare',       'Cabin',
       ...
                'F4',          'G6',           'T',           nan,
            'female',        'male',           'C',           'Q',
                 'S',           nan],
      dtype='object', length=165)
(891, 165)


In [7]:
# Sanity check: the concat above produced a separate frame (X_new), so X
# itself should still list only its original columns.
print(X.columns)

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
