<a href="https://colab.research.google.com/github/saurabhvmac/KaggleProjects/blob/main/Titanic_submission.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import the Libraries

In [None]:
import pandas as pd
import numpy as np 

import seaborn as sns 
import matplotlib.pyplot as plt
import missingno

from sklearn.preprocessing import OneHotEncoder, LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier

# Read the CSVs

In [None]:
# Directory that holds the Kaggle Titanic CSV files. It defaults to the
# location this notebook was written for; change this one value to run the
# notebook somewhere else.
DATA_DIR = "/content"

# train: labelled passengers; test: passengers to predict;
# gender_submission: example submission file shipped with the data.
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
gender_submission = pd.read_csv(f"{DATA_DIR}/gender_submission.csv")

# Preprocessing

## Missing Values

In [None]:
# Count the missing values in every column of the training frame.
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [None]:
# Drop the columns that are not used as model features.
unused_features = ["Cabin", "PassengerId", 'Ticket', 'Name', 'Fare']
train = train.drop(columns=unused_features)

In [None]:
# Impute missing ages with the mean age of the training set.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `train.Age`: the in-place call on an attribute
# access is chained assignment, which recent pandas versions warn about and
# which has no effect on `train` once Copy-on-Write is enabled.
train["Age"] = train["Age"].fillna(train["Age"].mean())

In [None]:
# Preview the first rows of the training frame after dropping columns and imputing Age.
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,male,22.0,1,0,S
1,1,1,female,38.0,1,0,C
2,1,3,female,26.0,0,0,S
3,1,1,female,35.0,1,0,S
4,0,3,male,35.0,0,0,S


## One-hot encoding for Sex, Embarked and Pclass

In [None]:
# Categorical column -> prefix used for the indicator columns generated from it.
prefix_by_column = {'Embarked': 'embarked', 'Pclass': 'pclass', 'Sex': 'sex'}

# One indicator frame per categorical column, in the order listed above.
indicator_frames = [
    pd.get_dummies(train[column], prefix=prefix)
    for column, prefix in prefix_by_column.items()
]

# Replace the categorical columns with their indicator columns.
train = pd.concat(
    [train.drop(columns=list(prefix_by_column)), *indicator_frames],
    axis=1,
)

In [None]:
# Preview the training frame after one-hot encoding.
train.head()

Unnamed: 0,Survived,Age,SibSp,Parch,embarked_C,embarked_Q,embarked_S,pclass_1,pclass_2,pclass_3,sex_female,sex_male
0,0,22.0,1,0,0,0,1,0,0,1,0,1
1,1,38.0,1,0,1,0,0,1,0,0,1,0
2,1,26.0,0,0,0,0,1,0,0,1,1,0
3,1,35.0,1,0,0,0,1,1,0,0,1,0
4,0,35.0,0,0,0,0,1,0,0,1,0,1


## Splitting the data into x and y

In [None]:
# Target: the Survived column. Features: every other remaining column.
y_train = train['Survived']
x_train = train.drop(columns=['Survived'])

In [None]:
# Standardise the training features.
# Note the naming: `x_train` is the feature DataFrame, `X_train` (capital X)
# is the scaled array returned by the scaler.
from sklearn.preprocessing import StandardScaler
scaling = StandardScaler().fit(x_train)
X_train = scaling.transform(x_train)
X_train

array([[-5.92480600e-01,  4.32793366e-01, -4.73673609e-01, ...,
         9.02587365e-01, -7.37695132e-01,  7.37695132e-01],
       [ 6.38789012e-01,  4.32793366e-01, -4.73673609e-01, ...,
        -1.10792599e+00,  1.35557354e+00, -1.35557354e+00],
       [-2.84663197e-01, -4.74545196e-01, -4.73673609e-01, ...,
         9.02587365e-01,  1.35557354e+00, -1.35557354e+00],
       ...,
       [-2.23290646e-16,  4.32793366e-01,  2.00893337e+00, ...,
         9.02587365e-01,  1.35557354e+00, -1.35557354e+00],
       [-2.84663197e-01, -4.74545196e-01, -4.73673609e-01, ...,
        -1.10792599e+00, -7.37695132e-01,  7.37695132e-01],
       [ 1.77062908e-01, -4.74545196e-01, -4.73673609e-01, ...,
         9.02587365e-01, -7.37695132e-01,  7.37695132e-01]])

# Training with a decision tree classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score

# A fixed seed makes the fitted tree, and therefore both accuracy figures and
# the final predictions, reproducible from run to run. Without it the tree
# can differ on every execution.
RANDOM_STATE = 42

dtc = DecisionTreeClassifier(random_state=RANDOM_STATE)
model = dtc.fit(X_train, y_train)

# Accuracy on the same rows the tree was fitted on (in percent).
accuracy = round(model.score(X_train, y_train) * 100, 2)

# Accuracy of 5-fold cross-validated predictions (in percent); each row is
# predicted by a model that did not see it during fitting.
y_predicted = cross_val_predict(dtc, X_train, y_train, cv=5, n_jobs=-1)
cv_accuracy = round(accuracy_score(y_train, y_predicted) * 100, 2)

print("Accuracy = ", accuracy)
print('CV_accuracy=', cv_accuracy)

Accuracy =  93.71
CV_accuracy= 79.35


# Making our final predictions on the test set

In [None]:
# Preview the first rows of the raw test frame.
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Preprocessing for test

In [None]:
# Count the missing values in every column of the test frame.
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [None]:
# Impute missing test ages with the mean age of the *training* frame, so the
# test set is filled with the same value as the training set.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `test.Age`: that in-place call is chained
# assignment, which recent pandas versions warn about and which has no effect
# on `test` once Copy-on-Write is enabled.
# Age is deliberately left as float, as it is in the training frame; casting
# it to int here would truncate values (e.g. 34.5 -> 34) and make the test
# feature differ from the one the model was trained on.
test["Age"] = test["Age"].fillna(train["Age"].mean())

In [None]:
# PassengerId is kept in `test` (it is not dropped below) because the submission file needs it.

In [None]:
from sklearn.preprocessing import StandardScaler

# Categorical column -> prefix used for the indicator columns generated from it.
prefix_by_column = {'Embarked': 'embarked', 'Pclass': 'pclass', 'Sex': 'sex'}

# One indicator frame per categorical column, in the order listed above.
indicator_frames = [
    pd.get_dummies(test[column], prefix=prefix)
    for column, prefix in prefix_by_column.items()
]

# Columns that are not used as features; PassengerId is kept.
unused_features = ['Name', 'Ticket', 'Cabin', 'Fare']

# Replace the categorical columns with their indicator columns and drop the unused ones.
test = pd.concat(
    [test.drop(columns=list(prefix_by_column) + unused_features), *indicator_frames],
    axis=1,
)


In [None]:
# Re-check the test frame for missing values after preprocessing.
test.isna().sum()

PassengerId    0
Age            0
SibSp          0
Parch          0
embarked_C     0
embarked_Q     0
embarked_S     0
pclass_1       0
pclass_2       0
pclass_3       0
sex_female     0
sex_male       0
dtype: int64

In [None]:
# Feature columns of the training frame, in training order. The test frame is
# indexed with these so that it has the same columns, in the same order, as
# the data the scaler and the tree were fitted on (this leaves out PassengerId).
required_cols = x_train.columns
required_cols

Index(['Age', 'SibSp', 'Parch', 'embarked_C', 'embarked_Q', 'embarked_S',
       'pclass_1', 'pclass_2', 'pclass_3', 'sex_female', 'sex_male'],
      dtype='object')

In [None]:
# Standardise the test features with the scaler that was fitted on the
# training features. Only `transform` is used here: calling `fit_transform`
# would re-estimate the mean and variance from the test set, so the test
# features would be scaled differently from the data the tree was trained on.
scaled_test = scaling.transform(test[required_cols])
scaled_test

array([[ 0.317377  , -0.49947002, -0.4002477 , ...,  0.95782629,
        -0.75592895,  0.75592895],
       [ 1.345164  ,  0.61699237, -0.4002477 , ...,  0.95782629,
         1.32287566, -1.32287566],
       [ 2.53107208, -0.49947002, -0.4002477 , ..., -1.04403065,
        -0.75592895,  0.75592895],
       ...,
       [ 0.63361915, -0.49947002, -0.4002477 , ...,  0.95782629,
        -0.75592895,  0.75592895],
       [-0.0779257 , -0.49947002, -0.4002477 , ...,  0.95782629,
        -0.75592895,  0.75592895],
       [-0.0779257 ,  0.61699237,  0.61989583, ...,  0.95782629,
        -0.75592895,  0.75592895]])

## Final Prediction on test set

In [None]:
# Predict the Survived label for every row of the scaled test set with the fitted tree.
predict = dtc.predict(scaled_test)

## Adding the predicted values and PassengerId to the submission DataFrame, then writing it to a CSV

In [None]:
# Two-column submission frame: the passenger id from the test set next to the
# predicted label for that passenger.
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predict,
})
submission.head()


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,1
3,895,1
4,896,1


In [None]:
# Preview the gender_submission file that was loaded together with train and test.
gender_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [None]:
# Row count of gender_submission, to compare with the row count of `submission`.
len(gender_submission)

418

In [None]:
# Row count of the generated submission frame.
len(submission)

418

In [None]:
# Row-by-row agreement between the tree's predictions and gender_submission.
# NOTE(review): gender_submission looks like an example submission rather than
# the true labels, so the True count is an agreement rate with that file,
# not an accuracy — confirm.
survived_con = (submission['Survived']==gender_submission['Survived'])
survived_con.value_counts()

True     334
False     84
Name: Survived, dtype: int64

In [None]:
# Write the submission file. `index=False` keeps the DataFrame's row index out
# of the CSV; without it the file gets an extra unnamed first column and no
# longer has the two-column PassengerId/Survived layout of gender_submission.
submission.to_csv("../content/submission.csv", index=False)

In [None]:
# Toy series holding the integers 1 through 10, named 'Index'.
index_id = pd.Series(list(range(1, 11)), name='Index')
index_id.head()

0    1
1    2
2    3
3    4
4    5
Name: Index, dtype: int64

In [None]:
# 1-based position of every element of index_id, as a series named 'MAIN'.
main_id = pd.Series(list(range(1, len(index_id) + 1)), name='MAIN')
main_id.head()

0    1
1    2
2    3
3    4
4    5
Name: MAIN, dtype: int64

In [None]:
import pandas as pd

# Put the two toy series side by side; the column names come from the series names.
# NOTE(review): this rebinds `submission`, replacing the Titanic predictions
# frame built earlier in the notebook. Re-running the comparison or export
# cells after this one would use this toy frame instead.
sample_columns = [index_id, main_id]
submission = pd.concat(sample_columns, axis=1)

# Written without the row index, so the file holds only the two columns.
submission.to_csv('samplesub.csv', index=False)
submission.head()

Unnamed: 0,Index,MAIN
0,1,1
1,2,2
2,3,3
3,4,4
4,5,5
