In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


"""
Selva - notes
onehot_encode = OneHotEncoder()
print (onehot_encode)

#one hot encoding the categorical values
feature_array = onehot_encode.fit_transform(X[["Pclass","Sex","SibSp","Parch","Cabin","Embarked"]]).toarray()

#labelling the columns with the categorical features
feature_labels = onehot_encode.categories_
#now we have to flaten the list since feature_labels has nested list in it
flattened_label = []
for sublist in feature_labels:
    for items in sublist:
        flattened_label.append(items)

print (flattened_label)
#np.array(feature_labels).ravel()
"""

### Understanding the Data

In [88]:
import pandas as pd
train_data = pd.read_csv('/kaggle/input/titanic/train.csv')
print (train_data.columns)

#Embarked has only 2 NaN values so removing the rows now
train_data = train_data.dropna(subset = ["Embarked"])
#Also dropping Text features since they are not useful in this classification
#X = X.drop(columns = ["PassengerId", "Name", "Ticket"])

#splitting the training data into Y (predicted variable) and X 
#(Independent variable - predictor of X)
X = train_data.drop(['Survived'], axis = 1)
Y = train_data['Survived']

print (X.shape)

#print ("Independent variables(X): ", X[:4])
#print ("To be predicted variables(Y): ", Y[:4])

# Check for NaN values in each column
nan_columns = X.columns[X.isna().any()].tolist()
print (X['Age'].isna().sum())
print (X['Cabin'].isna().sum())
print (X['Embarked'].isna().sum())
print(nan_columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
(889, 11)
177
687
0
['Age', 'Cabin']


### Feature Selection

-- <font color = 'green'>Finding relevant features in the dataset X which will influence the Y value </font>

In [83]:
#the dtypes gives the data types of each variable
print (X.dtypes)
# from the above we get name,sex, ticket, cabin and embarked as object data types.
#It is clear that name does not need to be one hot encoded as it doesn't result in any
#useful prediction. 
#To confirm embarked, cabin and sex lets perform a group by search to confirm if they 
#can be one hot encoded.

# doing a distinct search in categorical features for one hot encoding.

cabin_count = pd.unique(X['Cabin'])
print ("Cabin, ", X["Cabin"].unique())
#print (len(cabin_count))
#print (len(X))
print ("Sex: ", X["Sex"].unique())
print ("Pclass: ", X["Pclass"].unique()) #categorical feature one hot encode pending
print("SibSp: ", X["SibSp"].unique()) #categorical feature one hot encode pending
print ("Parch: ", X["Parch"].unique())


print(X["Embarked"].isna().sum())
print (X["Cabin"].isna().sum())
print (X["Age"].isna().sum())

print(X.shape)

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
Cabin,  [nan 'C85' 'C123' 'E46' 'G6' 'C103' 'D56' 'A6' 'C23 C25 C27' 'B78' 'D33'
 'B30' 'C52' 'C83' 'F33' 'F G73' 'E31' 'A5' 'D10 D12' 'D26' 'C110'
 'B58 B60' 'E101' 'F E69' 'D47' 'B86' 'F2' 'C2' 'E33' 'B19' 'A7' 'C49'
 'F4' 'A32' 'B4' 'B80' 'A31' 'D36' 'D15' 'C93' 'C78' 'D35' 'C87' 'B77'
 'E67' 'B94' 'C125' 'C99' 'C118' 'D7' 'A19' 'B49' 'D' 'C22 C26' 'C106'
 'C65' 'E36' 'C54' 'B57 B59 B63 B66' 'C7' 'E34' 'C32' 'B18' 'C124' 'C91'
 'E40' 'T' 'C128' 'D37' 'B35' 'E50' 'C82' 'B96 B98' 'E10' 'E44' 'A34'
 'C104' 'C111' 'C92' 'E38' 'D21' 'E12' 'E63' 'A14' 'B37' 'C30' 'D20' 'B79'
 'E25' 'D46' 'B73' 'C95' 'B38' 'B39' 'B22' 'C86' 'C70' 'A16' 'C101' 'C68'
 'A10' 'E68' 'B41' 'A20' 'D19' 'D50' 'D9' 'A23' 'B50' 'A26' 'D48' 'E58'
 'C126' 

### One hot encoding

--<font color = 'green'> based on the above code the features Cabin, Sex and Embarked are categorical features, so lets convert them to numerical data with one hot encoding. </font>

In [93]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import numpy as np

def data_preprocessing(X):
    #Drop 
    X = X.drop(columns=["PassengerId","Name", "Ticket", "Cabin","Parch"])
    #nan_columns_infun = X.columns[X.isna().any()].tolist()
    #print ("Before removing NAN columns: ",nan_columns_infun)
    
    ## dealing with the Nan values
    age_mean = X["Age"].mean()
    X["Age"] = X["Age"].fillna(age_mean)
    
    ## Step 2: Scale the numerical features to lie between 0 and 1
    # I am using MinMax scaler.
    scaler = MinMaxScaler()
    scaled_features = scaler.fit_transform(X[["Age", "Fare"]])
    scaled_df = pd.DataFrame(scaled_features, columns=["Age", "Fare"])
    X = X.drop(columns=["Age", "Fare"])
    X = pd.concat([X.reset_index(drop=True), scaled_df.reset_index(drop=True)], axis=1)
    #X = pd.concat([X,scaled_df], axis=1)
    
    #nan_columns_infun = X.columns[X.isna().any()].tolist()
    #print ("NAN columns: ",nan_columns_infun)
    
    ##Step 1: One hot encoding categorical features
    onehot_encode = OneHotEncoder(sparse = False, handle_unknown='ignore') #print (onehot_encode)
    
    #feature_array = onehot_encode.fit_transform(X[["Pclass","Sex","SibSp","Parch","Embarked"]]).toarray()
    feature_array = onehot_encode.fit_transform(X[["Pclass","Sex","SibSp","Embarked"]])
    #feature_labels = onehot_encode.categories_ 
    feature_labels = onehot_encode.get_feature_names_out(["Pclass","Sex","SibSp","Embarked"])
    print(feature_labels)
    
    #dropping the encoded features
    X = X.drop(columns = ["Pclass","Sex","SibSp","Embarked"])
    
    #flattened_label = []
    #for sublist in feature_labels:
    #    for items in sublist:
    #        flattened_label.append(items)
    
    #print (flattened_label)
    features = pd.DataFrame(feature_array, columns = feature_labels)
    X = pd.concat([X,features], axis = 1)
    nan_columns_infun = X.columns[X.isna().any()].tolist()
    print (nan_columns_infun)
    print (X.shape)
    return X



In [94]:
X_new = data_preprocessing(X)

['Pclass_1' 'Pclass_2' 'Pclass_3' 'Sex_female' 'Sex_male' 'SibSp_0'
 'SibSp_1' 'SibSp_2' 'SibSp_3' 'SibSp_4' 'SibSp_5' 'SibSp_8' 'Embarked_C'
 'Embarked_Q' 'Embarked_S']
[]
(889, 17)




In [95]:
#print(type(X_new))
#print (type(Y))
print (X_new)
print (X_new.columns)

nan_columns = X.columns[X.isna().any()].tolist()
print (nan_columns)

          Age      Fare  Pclass_1  Pclass_2  Pclass_3  Sex_female  Sex_male  \
0    0.271174  0.014151       0.0       0.0       1.0         0.0       1.0   
1    0.472229  0.139136       1.0       0.0       0.0         1.0       0.0   
2    0.321438  0.015469       0.0       0.0       1.0         1.0       0.0   
3    0.434531  0.103644       1.0       0.0       0.0         1.0       0.0   
4    0.434531  0.015713       0.0       0.0       1.0         0.0       1.0   
..        ...       ...       ...       ...       ...         ...       ...   
884  0.334004  0.025374       0.0       1.0       0.0         0.0       1.0   
885  0.233476  0.058556       1.0       0.0       0.0         1.0       0.0   
886  0.367204  0.045771       0.0       0.0       1.0         1.0       0.0   
887  0.321438  0.058556       1.0       0.0       0.0         0.0       1.0   
888  0.396833  0.015127       0.0       0.0       1.0         0.0       1.0   

     SibSp_0  SibSp_1  SibSp_2  SibSp_3  SibSp_4  S

In [96]:
#print ('Training set :', X_new)
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
test_data_preprocessed = data_preprocessing(test_data)
fare_mean = test_data_preprocessed['Fare'].mean()
test_data_preprocessed['Fare'] = test_data_preprocessed['Fare'].fillna(fare_mean)
print ('Test data :', test_data_preprocessed.shape)


['Pclass_1' 'Pclass_2' 'Pclass_3' 'Sex_female' 'Sex_male' 'SibSp_0'
 'SibSp_1' 'SibSp_2' 'SibSp_3' 'SibSp_4' 'SibSp_5' 'SibSp_8' 'Embarked_C'
 'Embarked_Q' 'Embarked_S']
['Fare']
(418, 17)
Test data : (418, 17)




In [77]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

param_grid_randomForest = {
    'n_estimators' : [200, 300, 400, 500],
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [3, 5, 7, 9, 11]
}
model = RandomForestClassifier()

#initialize gridSearchCV
grid_search = GridSearchCV(estimator = model, param_grid = param_grid_randomForest, cv = 5)
X_new.columns = X_new.columns.astype(str)
#Fit the model to train data
grid_search.fit (X_new, Y)
#Get the best model 
best_model = grid_search.best_estimator_
test_data_preprocessed.columns = test_data_preprocessed.columns.astype(str)
y_pred = best_model.predict(test_data_preprocessed)

#calculate the accuracy
#accuracy = accuracy_score (Y_test, y_pred)
#print(accuracy)

In [105]:
#lets try with XGBoost
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV


X_new.columns = X_new.columns.astype(str)
param_grid_xgboost = {
    'n_estimators' : [100, 200, 300, 400],
    'max_depth' : [3, 5, 7, 9]
}
#initialize gridSearchCV
model = XGBClassifier()

grid_search = GridSearchCV(estimator = model, param_grid = param_grid_xgboost, cv = 5)

grid_search.fit(X_new, Y)
test_data_preprocessed.columns = test_data_preprocessed.columns.astype(str)
#Get the best model 
best_model_xgb = grid_search.best_estimator_
y_pred = best_model_xgb.predict(test_data_preprocessed)

In [106]:
print (y_pred)
test_data_columns = pd.read_csv("/kaggle/input/titanic/test.csv")
submission = pd.DataFrame({'PassengerId':test_data_columns['PassengerId'],'Survived':y_pred})
#Visualize the first 5 rows
submission.head()

[0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 1 0 0 0 1 1 1 0 1 0 1 0 0 0 1 0 1 0 1
 0 0 1 0 1 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 1 1 0 1 0
 1 0 0 1 0 1 1 0 0 0 0 0 1 1 0 1 0 0 1 0 1 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 0 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
 1 0 1 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 1 0 0 1 0 1 1 0 1 1 1 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 1 0
 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 0 0 0 1 0
 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 0 1 0 1 1 0 1 0 0 0]


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [107]:
#Convert DataFrame to a csv file that can be uploaded
#This is saved in the same directory as your notebook
filename = 'submission.csv'

submission.to_csv(filename,index=False)

print('Saved file: ' + filename)

Saved file: submission.csv
