In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, name) for name in files]
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing the dataset

In [1]:
df_train = pd.read_csv("/kaggle/input/titanic/train.csv")
df_test= pd.read_csv("/kaggle/input/titanic/test.csv")

In [1]:
df_train.head()

In [1]:
print("Train shape :",df_train.shape)
print("Test shape:", df_test.shape)

## Data Preprocessing

In [1]:
df_train.describe(include="all")

Columns like "PassengerId", "Name" and "Ticket" may not be needed for training, so we drop them from both df_train and df_test.

In [1]:
df_train.columns

In [1]:
# Columns removed from both frames before any further preprocessing.
col = ["PassengerId", "Name", "Ticket"]
df_train, df_test = (frame.drop(columns=col) for frame in (df_train, df_test))

In [1]:
df_train.head()

In [1]:
df_test.head()

## Checking for missing values in train and test data

In [1]:
df_train.isnull().sum().sort_values(ascending=False)

In [1]:
# Missing-value counts per column for the *test* frame (the previous cell
# already covers df_train), sorted so the worst columns come first.
df_test.isnull().sum().sort_values(ascending=False)

## checking the unique values in Cabin

In [1]:
df_train["Cabin"].unique()

We can see that each cabin value starts with a letter, so we can use that letter as a feature for training.

Out of 891 rows in the training data, the "Cabin" column has 687 missing values (the test data has missing "Cabin" values as well). 
Since more than 50% of the values are missing, we replace NaN with a placeholder value and treat it as "data not available".

We replace it as "No cabin"


In [1]:
# Treat a missing cabin as its own category rather than dropping the rows.
# The filled column is assigned back to the frame: calling
# fillna(..., inplace=True) on a selected column is chained assignment, which
# is deprecated and does not update the frame under pandas copy-on-write.
df_train["Cabin"] = df_train["Cabin"].fillna("No cabin")
df_test["Cabin"] = df_test["Cabin"].fillna("No cabin")

In [1]:
df_train['Cabin'].unique()

The "Cabin" column contains alphanumeric values, so we extract the leading letter from each value and replace the value with it.

In [1]:
# Keep only the first character of every cabin value ("No cabin" becomes "N").
# Each frame is derived from its *own* column: the previous loop filled
# df_test["Cabin"] from df_train["Cabin"], overwriting the test cabins with
# training values. The vectorised .str accessor also avoids the row-by-row
# chained assignment (df[col][i] = ...).
df_train["Cabin"] = df_train["Cabin"].str[:1]
df_test["Cabin"] = df_test["Cabin"].str[:1]

In [1]:
df_train['Cabin'].unique()

In [1]:
df_test["Cabin"].unique()

These are the updated values in "Cabin" and 'N' represents "No cabin"

In [1]:
df_train.isnull().sum().sort_values(ascending=False)

In [1]:
df_test.isnull().sum().sort_values(ascending=False)

We have missing values in the "Age" column, so we replace them with the mean age of the respective dataset.

In [1]:
df_train.head()

Filling the missing values in "Age" columns using the mean value

In [1]:
# Impute missing ages with the mean age of the respective frame.
# The result is assigned back instead of using fillna(..., inplace=True) on a
# selected column, which is chained assignment and is not guaranteed to
# modify the frame (it is a no-op under pandas copy-on-write).
df_train["Age"] = df_train["Age"].fillna(df_train["Age"].mean())
df_test["Age"] = df_test["Age"].fillna(df_test["Age"].mean())

In [1]:
df_train.isnull().sum().sort_values(ascending=False)

In [1]:
df_test.isnull().sum().sort_values(ascending=False)

Now we have 2 missing values in the "Embarked" column of the train data and 1 missing value in the "Fare" column of the test data. We fill "Embarked" with its mode (the most frequent value) and "Fare" with the mean fare of the training data.

In [1]:
# "Embarked" (train) gets its most frequent value; "Fare" (test) gets the
# mean fare computed on the training data.
# Both columns are assigned back rather than filled with inplace=True on a
# selected column, which is chained assignment and is not guaranteed to
# modify the frame (it is a no-op under pandas copy-on-write).
df_train["Embarked"] = df_train["Embarked"].fillna(df_train["Embarked"].mode()[0])
df_test["Fare"] = df_test["Fare"].fillna(df_train["Fare"].mean())

In [1]:
df_train.isnull().sum().sort_values(ascending=False)

In [1]:
df_test.isnull().sum().sort_values(ascending=False)

The data does not have any missing values now 

## Converting Categorical Features into Numerical Features

The categorical features are Sex, Cabin, Embarked

### Sex

We create a dictionary for the corresponding Categorical features with its Numerical values

#### 0 for Male


#### 1 for Female

In [1]:
gender = {"male":0,"female":1}
gender

In [1]:
# Replace the "Sex" labels with their numeric codes from `gender` in both frames.
for frame in (df_train, df_test):
    frame["Sex"] = frame["Sex"].map(gender)

In [1]:
df_train.head()

## Cabin

We create a dictionary for the corresponding Categorical features with its Numerical values

In [1]:
df_train.Cabin.unique()

We have eight cabin letters plus 'N' for "No cabin", so we need to map nine values

In [1]:
cab = {'N':0, 'C':1, 'E':2, 'G':3, 'D':4, 'A':5, 'B':6, 'F':7, 'T':8}
cab

In [1]:
# Replace each cabin letter with its numeric code from `cab` in both frames.
for frame in (df_train, df_test):
    frame["Cabin"] = frame["Cabin"].map(cab)

In [1]:
df_train.head()

## Embarked

We create a dictionary for the corresponding Categorical features with its Numerical values

In [1]:
df_train["Embarked"].unique()

We have three values in Embarked, so we want to map with 3 values

In [1]:
emb = {'S':0, 'C':1, 'Q':2}
emb

In [1]:
# Replace the "Embarked" labels with their numeric codes from `emb`.
df_train = df_train.assign(Embarked=df_train["Embarked"].map(emb))
df_test = df_test.assign(Embarked=df_test["Embarked"].map(emb))

In [1]:
df_train.head()

In [1]:
df_test.head()

### Now, we have changed all the Categorical values into Numerical values

In [1]:
df_train.info()

In [1]:
df_test.info()

# Visualization

In [1]:
import matplotlib.pyplot as plt

In [1]:
# Draw one 10-bin histogram per column of df_train, each in its own figure.
for column_name, values in df_train.items():
    fig, ax = plt.subplots()
    ax.hist(values, bins=10, color='green')
    ax.set_xlabel(column_name)
    ax.set_ylabel("count")
    ax.set_title(column_name)
    plt.show()

## Splitting the target variable

In [1]:
# Separate the feature matrix from the "Survived" target.
# `columns=` is used because pandas 2.x no longer accepts the axis as a
# positional argument (drop(["Survived"], 1) raises a TypeError there).
x = df_train.drop(columns=["Survived"])
y = df_train["Survived"]

In [1]:
x

In [1]:
y

## Applying Standard Scaling

In [1]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
standard_df = scaler.fit_transform(x)
X = pd.DataFrame(standard_df,columns=x.columns)
X

In [1]:
X.describe()

### For test data

In [1]:
# Scale the test features with the scaler that was already fitted on the
# training features. The previous version called scaler.fit_transform(x),
# which re-scaled the *training* data and merely relabelled it with the test
# columns, so the final predictions were made on training rows.
standard_df = scaler.transform(df_test)
test = pd.DataFrame(standard_df, columns=df_test.columns)
test

In [1]:
test

## Splitting Train and Test data

In [1]:
from sklearn.model_selection import train_test_split

In [1]:
x_train,x_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [1]:
print("Training data:",x_train.shape)
print("Testing data:",x_test.shape)

## Model Training

### Logistic regression

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [1]:
lr = LogisticRegression()
lr

In [1]:
# Model Training
lr.fit(x_train, y_train)

In [1]:
# Hyper parameter tuning
from sklearn.model_selection import GridSearchCV

# Grid over the regularisation type and inverse regularisation strength C.
param = {
         'penalty':['l1','l2'],
         'C':[0.001, 0.01, 0.1, 1, 10, 20,100, 1000]
}
# The search needs a solver that supports every penalty in the grid. The
# default 'lbfgs' solver rejects 'l1', so those candidates failed to fit;
# 'liblinear' handles both 'l1' and 'l2'. This estimator is also the one
# handed to GridSearchCV (previously `Lr` was created but never used).
Lr = LogisticRegression(solver='liblinear')
cv = GridSearchCV(Lr, param, cv=5, n_jobs=-1)
cv.fit(x_train, y_train)
# Predictions of the best estimator (refit on all of x_train) on the hold-out split.
y_pred = cv.predict(x_test)

In [1]:
cv.best_score_

In [1]:
print("Accuracy score:",accuracy_score(y_test,y_pred))

### XGBOOST Classifier

In [1]:
from xgboost import XGBClassifier

xgb = XGBClassifier(booster = 'gbtree', gamma=5,learning_rate = 0.1, max_depth = 5, n_estimators = 100,colsample_bytree=1)
xgb.fit(x_train, y_train)

In [1]:
y_pred = xgb.predict(x_test)
print("Accuracy on test data:",accuracy_score(y_test,y_pred))
print("Accuracy on train data:",accuracy_score(y_train,xgb.predict(x_train)))

## Gradient Boosting

In [1]:
from sklearn.ensemble import GradientBoostingClassifier

# Fix the seed so the model used for the final submission is reproducible
# across "Restart & Run All" (0 matches the seed used for train_test_split).
gbc = GradientBoostingClassifier(max_depth=3, random_state=0)
gbc.fit(x_train, y_train)

In [1]:
print("Accuracy on test data:",accuracy_score(y_test,gbc.predict(x_test)))
print("Accuracy on train data:", accuracy_score(y_train,gbc.predict(x_train)))

## Final Predictions

In [1]:
fin = gbc.predict(test)

In [1]:
fin

In [1]:
predictions = pd.DataFrame(fin)

In [1]:
# Load the sample submission as a template, overwrite its "Survived" column
# with the model predictions, and write the result to the working directory.
# NOTE(review): assumes the template rows are in the same order as test.csv — confirm.
submission_template_path = "/kaggle/input/titanic/gender_submission.csv"
predcsv = pd.read_csv(submission_template_path)
predcsv["Survived"] = predictions
predcsv.to_csv("Final prediction.csv", index=False)