In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Walk the read-only Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Dataset

In [None]:
# Read the train and test data.
# Both CSVs live in the same Kaggle input folder, so both paths are built from one
# absolute root instead of mixing a cwd-relative path with an absolute one.
TITANIC_DIR = '/kaggle/input/titanic'
train = pd.read_csv(os.path.join(TITANIC_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(TITANIC_DIR, 'test.csv'))

# Data Analysis

In [None]:
# print top 5 rows
print('Top 5 Rows')
train.head()


In [None]:
#print top 5 row of test data
print('Top 5 Rows')
test.head()


In [None]:
# Report the (rows, columns) shape of the training frame and its row count.
train_shape = train.shape
print("Train Data Set Size:", train_shape)
print("Total no of data points in Train Data:", train_shape[0])

In [None]:
# Report the (rows, columns) shape of the test frame and its row count.
test_shape = test.shape
print("Test Data Set Size:", test_shape)
print("Total no of data points in Test Data:", test_shape[0])

In [None]:
# Count how many training rows fall in each value of the Survived target.
Train_Survived_Distribution=train["Survived"].value_counts()


In [None]:
# Bar chart of the number of training rows per Survived class.
Train_Survived_Distribution.plot(kind='bar')
plt.xlabel('Class')
plt.ylabel('Data points per Class')
plt.title('Distribution of yi in train data')
plt.grid()
plt.show()

# Positions within the value_counts result, ordered from most to least frequent.
sorted_yi = np.argsort(-Train_Survived_Distribution.values)
print(sorted_yi )
print(Train_Survived_Distribution.values )
for i in sorted_yi:
    # `i` is a position in the counts array, not a class label, so the label is
    # looked up in the index rather than printing the position itself.
    class_label = Train_Survived_Distribution.index[i]
    class_count = Train_Survived_Distribution.values[i]
    class_share = np.round((class_count/train.shape[0]*100), 3)
    print('Number of data points in class', class_label, ':', class_count, '(', class_share, '%)')
print('Observation:','Dataset is imbalanced Data')

# Data preprocessing

In [None]:
# Per-column null counts for both frames, separated by ruler lines.
print('--'*10,"Verify null values in Train Data set",'--'*10)
print(train.isnull().sum())
print('--'*40)
print(' '*40)
print('--'*40)
print('--'*10,"Verify null values in Test Data set",'--'*10)
print(test.isnull().sum())
print('--'*40)
print('Observation:','Age, Cabin and Embarked features are  having Null values in Train Data')
# The third test-set feature is Fare; the original message misspelled it as "Fari".
print('Observation:','Age, Cabin and Fare features are  having Null values in Test Data')

In [None]:
def data_clean(data):
    """Summarise and plot the columns of ``data`` that contain missing values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column whose rounded missing percentage is above zero, with
        a ``Total`` column (number of nulls) and a ``Percent`` column (share of
        rows that are null, in percent, rounded to 2 decimals).
    """
    null_counts = data.isnull().sum()
    Total = null_counts.sort_values()
    # Every column has len(data) rows, so this is the per-column null percentage.
    Percent = (null_counts / len(data) * 100).sort_values().round(2)
    d1 = pd.concat([Total, Percent], axis=1, keys=['Total', 'Percent'])
    d1 = d1[d1['Percent'] > 0]

    # Nothing is missing: return the empty summary without drawing an empty chart.
    if d1.empty:
        return d1

    fig, ax = plt.subplots(figsize=(8, 5))
    # seaborn >= 0.12 only accepts x / y as keyword arguments.
    sns.barplot(x=d1.index, y=d1['Percent'], ax=ax)
    ax.set_ylabel("Percentage of  Missing values ", fontsize=15)
    ax.set_xlabel("Features", fontsize=15)
    ax.set_title("Percentage Of Missing values in  Data", fontsize=15)
    return d1
    

In [None]:
data_clean(train)

In [None]:
data_clean(test)

The Cabin feature has more than 77% missing values in both the train and test datasets, so I remove it.

In [None]:
# Remove the Cabin column from both frames by rebinding each name to the reduced frame.
train = train.drop(columns=['Cabin'])
test = test.drop(columns=["Cabin"])

I will remove PassengerId, Name and Ticket, since they will not be useful for our model.

In [None]:
# Identifier-like columns dropped from both frames.
id_like_columns = ["PassengerId", "Name", "Ticket"]
train = train.drop(columns=id_like_columns)
test = test.drop(columns=id_like_columns)

The Age feature has more than 17% missing values in both the train and test data, so I fill the missing ages with the mean age of the training data.

In [None]:
# Fill the remaining gaps: Age with the training-set mean (for both frames),
# test Fare with the test-set mean, and train Embarked with its most frequent value.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on a column selection: that chained in-place form is
# deprecated in pandas 2.x and leaves the frame unchanged under copy-on-write.
age_mean = train['Age'].mean()
train['Age'] = train['Age'].fillna(age_mean)
test['Age'] = test['Age'].fillna(age_mean)
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])

In [None]:
# Re-check the null counts after imputation.
train_nulls = train.isnull().sum()
test_nulls = test.isnull().sum()

print('--'*10,"Verify null values in Train Data set",'--'*10)
print(train_nulls)
print('--'*40)
print('--'*40)
print('--'*10,"Verify null values in Test Data set",'--'*10)
print(test_nulls)
print('--'*40)

# Fail loudly if anything is still missing, so the observations below are never
# printed for data that contradicts them.
assert train_nulls.sum() == 0, "train data still contains null values"
assert test_nulls.sum() == 0, "test data still contains null values"
print('Observation:','No Null values in Train Data')
print('Observation:','No Null values in Test Data')

# Feature engineering and EDA

1. Sex Feature

Sex: what type of feature is it?

Ans: Sex is Categorical variable

In [None]:
# Frequency of each Sex category in the training data, followed by a count plot.
unique_sex = train['Sex'].value_counts()
n_categories = len(unique_sex)
print("There are", n_categories, "different categories of sex feature in the train data")
print(unique_sex)
sns.countplot(data=train, x='Sex')
plt.show()

In [None]:
# Frequency of each Sex category in the test data, followed by a count plot.
unique_sex = test['Sex'].value_counts()
n_categories = len(unique_sex)
print("There are", n_categories, "different categories of sex feature in the test data")
print(unique_sex)
sns.countplot(data=test, x='Sex')
plt.show()

1.1 How to featurize the Sex feature?

Ans:Using Get_dummies(), we can Featurize this variable

In [None]:
# One-hot encode Sex in the training frame; the new columns are prefixed with "Sex".
train = pd.get_dummies(train, columns=['Sex'], prefix='Sex')

In [None]:
(train.head())

In [None]:
# One-hot encode Sex in the test frame; the new columns are prefixed with "Sex".
test = pd.get_dummies(test, columns=['Sex'], prefix='Sex')

In [None]:
# Frequency of each Embarked category in the test data, followed by a count plot.
unique_embarked = test['Embarked'].value_counts()
n_categories = len(unique_embarked)
print("There are", n_categories, "different categories of Embarked feature in the test data")
print(unique_embarked)
sns.countplot(data=test, x='Embarked')
plt.show()

2. Embarked Feature

Embarked: what type of feature is it?

Ans: Embarked is Categorical variable

In [None]:
# Frequency of each Embarked category in the training data, followed by a count plot.
unique_embarked = train['Embarked'].value_counts()
n_categories = len(unique_embarked)
print("There are", n_categories, "different categories of Embarked feature in the train data")
print(unique_embarked)
sns.countplot(data=train, x='Embarked')
plt.show()

2.1 How to featurize this Embarked feature?

Ans:Using Get_dummies(), we can Featurize this variable

In [None]:
# One-hot encode Embarked in both frames; the new columns are prefixed with "Embarked".
train = pd.get_dummies(train, columns=['Embarked'], prefix='Embarked')
test = pd.get_dummies(test, columns=['Embarked'], prefix='Embarked')

In [None]:
train.head()

In [None]:
test.head()

3. Age Feature 

   Age,What type of feature it is?
   
   Ans: Age is a continuous feature.
   
To improve accuracy, the Age values are divided into bins using the `pd.cut` method.



In [None]:
# Largest and smallest Age in the training data (max first, then min).
train_age = train['Age']
print(train_age.max())
print(train_age.min())

In [None]:
# Largest and smallest Age in the test data (max first, then min).
test_age = test['Age']
print(test_age.max())
print(test_age.min())

3.1 How to featurize the Age feature?

Ans:Using Get_dummies(), we can Featurize this variable

In [None]:
# Bucket Age into four groups. Both frames must use exactly the same labels:
# the labels become the one-hot column names, and the original train labels
# ('Childen', 'Adilt') were misspelled, so train and test ended up with different
# feature columns and the fitted models could not score the test frame.
AGE_BINS = [0, 12, 20, 40, 100]
AGE_LABELS = ['Children', 'Teenage', 'Adult', 'Elder']
train['Age'] = pd.cut(train['Age'], bins=AGE_BINS, labels=AGE_LABELS)
test['Age'] = pd.cut(test['Age'], bins=AGE_BINS, labels=AGE_LABELS)

In [None]:
test.head()

In [None]:
train.head()

In [None]:
# Frequency of each Age bin in the training data, followed by a count plot.
unique_age = train['Age'].value_counts()
n_categories = len(unique_age)
print("There are", n_categories, "different categories of age feature in the train data")
print(unique_age)
sns.countplot(data=train, x='Age')
plt.title('Bins of Age feature in Train data')
plt.show()

In [None]:
# Frequency of each Age bin in the test data, followed by a count plot.
unique_age = test['Age'].value_counts()
n_categories = len(unique_age)
print("There are", n_categories, "different categories of age feature in the test data")
print(unique_age)
sns.countplot(data=test, x='Age')
plt.title('Bin of Age feature in Test data')
plt.show()

In [None]:
# One-hot encode the binned Age column in both frames; new columns are prefixed with "Age".
train = pd.get_dummies(train, columns=['Age'], prefix='Age')
test = pd.get_dummies(test, columns=['Age'], prefix='Age')

In [None]:
train.head()

In [None]:
test.head()

4. Fare Feature 

   Fare,What type of feature it is?
   
   Ans: Fare is a continuous feature.
   
To improve accuracy, the Fare values are divided into bins using the `pd.cut` method.

In [None]:
# Smallest and largest Fare in the training data (min first, then max).
train_fare = train['Fare']
print(train_fare.min())
print(train_fare.max())

In [None]:
# Smallest and largest Fare in the test data (min first, then max).
test_fare = test['Fare']
print(test_fare.min())
print(test_fare.max())

4.1 How to featurize the Fare feature?

Ans:Using Get_dummies(), we can Featurize this variable

In [None]:
# Bucket Fare into four bands shared by both frames.
# include_lowest=True closes the first interval on the left, so a fare of exactly 0
# lands in 'Low Fare'. With the default half-open (0, 14.45] interval those rows
# became NaN and ended up with no Fare indicator set after one-hot encoding.
FARE_BINS = [0, 14.45, 31, 60, 513]
FARE_LABELS = ['Low Fare', 'median Fare', 'Average Fare', 'high Fare']
train['Fare'] = pd.cut(train['Fare'], bins=FARE_BINS, labels=FARE_LABELS, include_lowest=True)
test['Fare'] = pd.cut(test['Fare'], bins=FARE_BINS, labels=FARE_LABELS, include_lowest=True)

In [None]:
(train.head())

In [None]:
# Frequency of each Fare band in the training data, followed by a count plot.
unique_fare = train['Fare'].value_counts()
n_categories = len(unique_fare)
print("There are", n_categories, "different categories of Fare feature in the train data")
print(unique_fare)
sns.countplot(data=train, x='Fare')
plt.title('Bin of Fare feature in Train data')
plt.show()

In [None]:
# Frequency of each Fare band in the test data, followed by a count plot.
unique_Fare=test.Fare.value_counts()
print("There are", unique_Fare.shape[0] ,"different categories of Fare feature in the test data")
# Print the Fare counts computed above; the original printed the stale Age counts
# (`unique_age`) left over from an earlier cell.
print(unique_Fare)
sns.countplot(x='Fare',data=test)
plt.title('Bin of Fare feature in Test data')
plt.show()

In [None]:
# One-hot encode the binned Fare column in both frames; new columns are prefixed with "Fare".
train = pd.get_dummies(train, columns=['Fare'], prefix='Fare')
test = pd.get_dummies(test, columns=['Fare'], prefix='Fare')

In [None]:
train.head()

In [None]:
train.head()

In [None]:
test.head(10)

In [None]:
# `data` is another name for the fully encoded training frame (same object, not a copy).
data = train
data_shape = data.shape
print("Train Data Set Size:", data_shape)
print("Total no of data points in Train Data:", data_shape[0])

In [None]:
print("features:",data.columns.values)

In [None]:
# Feature matrix X is every column except the target; Y is the Survived target.
X = data.drop(columns="Survived")
Y = data["Survived"]

In [None]:
print(Y)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from mlxtend.classifier import StackingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
#from sklearn.grid_search import GridSearchCV

# Splitting the Dataset

In [None]:
# Hold out 30% of the labelled data for evaluation.
# random_state pins the shuffle so every accuracy below is reproducible across runs;
# stratify=Y keeps the (imbalanced) Survived ratio the same in both splits.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42, stratify=Y
)

In [None]:
# Shapes of the four split parts, in the order X_train, Y_train, X_test, Y_test.
for split_part in (X_train, Y_train, X_test, Y_test):
    print(split_part.shape)

# Models

Logistic Regression

In [None]:
# Baseline logistic regression with the library's default hyper-parameters,
# fitted on the training split only.
lr=LogisticRegression()
lr.fit(X_train,Y_train)

In [None]:
turned_parameters=[{'C': [10**-4, 10**-2, 10**0, 10**2, 10**4]}]

In [None]:
lrgscv=GridSearchCV(lr,turned_parameters,cv=10,scoring = 'accuracy')


In [None]:
# Hold-out predictions from the baseline model fitted earlier.
predict_val = lr.predict(X_test)

# Run the grid search on the training split, then predict the hold-out split
# with the searched model.
lrgscv.fit(X_train, Y_train)
predict_val1 = lrgscv.predict(X_test)

In [None]:
# 10-fold cross-validation of the logistic regression on the full labelled data:
# lr_score holds the per-fold accuracies, predictlr the cross-validated predictions.
lr_score=cross_val_score(lr,X,Y,cv=10,scoring='accuracy')
predictlr=cross_val_predict(lr,X,Y,cv=10)

In [None]:

# Percentages (2 decimals): hold-out accuracy of the baseline model, mean
# cross-validation accuracy, and hold-out accuracy of the grid-searched model.
lr_holdout_pct = round(accuracy_score(Y_test, predict_val) * 100, 2)
lr_cv_pct = round(lr_score.mean() * 100, 2)
lr_grid_pct = round(accuracy_score(Y_test, predict_val1) * 100, 2)

print('--------------The Accuracy of the model----------------------------')
print("Accuracy of Logistic Regression is :", lr_holdout_pct, '%')
print("Cross validation score for Logistic Regression Accuracy is:", lr_cv_pct, '%')
print("Grid Search CV score for Logistic Regression Accuracy is:", lr_grid_pct, '%')

In [None]:
# Confusion matrix of the cross-validated logistic-regression predictions.
lr_confusion = confusion_matrix(Y, predictlr)
sns.heatmap(lr_confusion, annot=True, fmt='3.0F')
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Show the estimator selected by the grid search and its best cross-validated score.
print(lrgscv.best_estimator_)
#print(lrgscv.score(X,Y))
print(lrgscv.best_score_)

KNN

In [None]:
# k-nearest-neighbours with default settings: fit on the training split, then
# predict the hold-out split.
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
knnpredict = knn.predict(X_test)

In [None]:
# 10-fold cross-validation of KNN on the full labelled data:
# knn_score holds the per-fold accuracies, predictknn the cross-validated predictions.
knn_score=cross_val_score(knn,X,Y,cv=10,scoring='accuracy')
predictknn=cross_val_predict(knn,X,Y,cv=10)

In [None]:
# Grid over odd neighbourhood sizes 3..51, both distance exponents and both weightings.
param_grid = {
    'n_neighbors': list(range(3, 52, 2)),
    'p': [1, 2],
    'weights': ['uniform', 'distance'],
}
model = GridSearchCV(knn, param_grid, scoring='accuracy', cv=10)
model.fit(X_train, Y_train)
knngrcv = model.predict(X_test)
print(model.best_estimator_)
print(model.score(X_test, Y_test))

In [None]:
# Percentages (2 decimals): hold-out accuracy of the default KNN, mean
# cross-validation accuracy, and hold-out accuracy of the grid-searched KNN.
knn_holdout_pct = round(accuracy_score(Y_test, knnpredict) * 100, 2)
knn_cv_pct = round(knn_score.mean() * 100, 2)
knn_grid_pct = round(accuracy_score(Y_test, knngrcv) * 100, 2)

print('--------------The Accuracy of the model----------------------------')
print("Accuracy of KNN is :", knn_holdout_pct, '%')
print("Cross validation score for KNN Accuracy is:", knn_cv_pct, '%')
print("Grid Search CV score for KNN Accuracy is:", knn_grid_pct, '%')


In [None]:
# Confusion matrix of the cross-validated KNN predictions.
knn_confusion = confusion_matrix(Y, predictknn)
sns.heatmap(knn_confusion, annot=True, fmt='3.0F')
plt.title("Confusion Matrix")
plt.show()

 SVM 

In [None]:
# Support-vector classifier with default settings, fitted on the training split.
# svcclf is whatever fit() returns; it is used for both predictions below.
svc = SVC()
svcclf = svc.fit(X_train, Y_train)
# Predictions for the hold-out split and for the competition test frame.
svcpredict = svcclf.predict(X_test)
svcpredict1 = svcclf.predict(test)

In [None]:
# 10-fold cross-validation of the SVM on the full labelled data:
# svc_score holds the per-fold accuracies, predictsvc the cross-validated predictions.
svc_score=cross_val_score(svc,X,Y,cv=10,scoring='accuracy')
predictsvc=cross_val_predict(svc,X,Y,cv=10)

In [None]:
"""turn_perameters={'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid','linear']}
svmclf=GridSearchCV(SVC(),turn_perameters,cv=10)
svmclf.fit(X_train,Y_train)
svmpredict=svmclf.predict(X_test)
print(svmclf.best_estimator_)
print(svmclf.best_score_)
print(svmclf.score(X_test, Y_test))"""

In [None]:
# Confusion matrix of the cross-validated SVM predictions.
svc_confusion = confusion_matrix(Y, predictsvc)
sns.heatmap(svc_confusion, annot=True, fmt='3.0F')
plt.title("Confusion Matrix")
plt.show()

 Naive Bayes

In [None]:
# Gaussian naive Bayes: fit on the training split, then predict the hold-out split.
# Note that nbclf holds the predictions, not the classifier.
nb = GaussianNB()
nb.fit(X_train, Y_train)
nbclf = nb.predict(X_test)


In [None]:
# 10-fold cross-validation of naive Bayes on the full labelled data:
# nb_score holds the per-fold accuracies, predictnb the cross-validated predictions.
nb_score=cross_val_score(nb,X,Y,scoring='accuracy',cv=10)
predictnb=cross_val_predict(nb,X,Y,cv=10)
# Disabled: `nb` is a plain estimator, not a grid search, so there is no best_* to print.
#print(nb.best_score_)
#print(nb.best_estimator_)

In [None]:
# Percentages (2 decimals): hold-out accuracy and mean cross-validation accuracy.
nb_holdout_pct = round(accuracy_score(Y_test, nbclf) * 100, 2)
nb_cv_pct = round(nb_score.mean() * 100, 2)

print('--------------------The Accuracy of the model--------------------------')
print("Accuracy of Naive Bayes is :", nb_holdout_pct, '%')
print("Cross validation score for Naive Bayes is:", nb_cv_pct, '%')
#print("Best Cross validation score for SVM Accuracy is:",round((nb.best_score_)*100,2),'%')

In [None]:
# Confusion matrix of the cross-validated naive-Bayes predictions.
nb_confusion = confusion_matrix(Y, predictnb)
sns.heatmap(nb_confusion, annot=True, fmt='3.0F')
plt.title("Confusion Matrix")
plt.show()

  Decision Tree

In [None]:
# Decision tree: fit on the training split, then predict the hold-out split.
# random_state is pinned so repeated runs grow the same tree and report the same
# scores (the estimator is stochastic when left unseeded).
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, Y_train)
dtpredict = dt.predict(X_test)

In [None]:
# 10-fold cross-validation of the decision tree on the full labelled data:
# dt_score holds the per-fold accuracies, predictdt the cross-validated predictions.
dt_score=cross_val_score(dt,X,Y,cv=10,scoring='accuracy')
predictdt=cross_val_predict(dt,X,Y,cv=10)

In [None]:
# Percentages (2 decimals): hold-out accuracy and mean cross-validation accuracy.
dt_holdout_pct = round(accuracy_score(Y_test, dtpredict) * 100, 2)
dt_cv_pct = round(dt_score.mean() * 100, 2)

print('--------------------The Accuracy of the model--------------------------')
print("Accuracy of Decision Tree is :", dt_holdout_pct, '%')
print("Cross validation score for Decision Tree is:", dt_cv_pct, '%')


In [None]:
# Confusion matrix of the cross-validated decision-tree predictions.
dt_confusion = confusion_matrix(Y, predictdt)
sns.heatmap(dt_confusion, annot=True, fmt='3.0F')
plt.title("Confusion Matrix")
plt.show()

Random Forest

In [None]:
# Random forest: fit on the training split, then predict the hold-out split.
# random_state is pinned so the bootstrap samples and feature draws, and therefore
# the reported scores, are the same on every run.
rfclf = RandomForestClassifier(random_state=42)
rfclf.fit(X_train, Y_train)
rfpredict = rfclf.predict(X_test)

In [None]:
# 10-fold cross-validation of the random forest on the full labelled data.
rf_score = cross_val_score(rfclf, X, Y, cv=10, scoring='accuracy')
# Stored under its own name: the original reused `predictdt`, silently replacing
# the decision tree's cross-validated predictions with the forest's.
predictrf = cross_val_predict(rfclf, X, Y, cv=10)
sns.heatmap(confusion_matrix(Y, predictrf), annot=True, fmt='3.0F')
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Percentages (2 decimals): hold-out accuracy and mean cross-validation accuracy.
rf_holdout_pct = round(accuracy_score(Y_test, rfpredict) * 100, 2)
rf_cv_pct = round(rf_score.mean() * 100, 2)

print('--------------------The Accuracy of the model--------------------------')
print("Accuracy of Random Forest  is :", rf_holdout_pct, '%')
print("Cross validation score for Random Forest is:", rf_cv_pct, '%')


XGBoosting

In [None]:
# XGBoost classifier with default settings: fit on the training split, then
# predict the hold-out split.
xgb = XGBClassifier()
xgb.fit(X_train, Y_train)
xgbpridect = xgb.predict(X_test)

In [None]:
# 10-fold cross-validation of XGBoost on the full labelled data, then the
# confusion matrix of its cross-validated predictions.
xgb_score = cross_val_score(xgb, X, Y, cv=10, scoring='accuracy')
predictxgb = cross_val_predict(xgb, X, Y, cv=10)
xgb_confusion = confusion_matrix(Y, predictxgb)
sns.heatmap(xgb_confusion, annot=True, fmt='3.0F')
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Percentages (2 decimals): hold-out accuracy and mean cross-validation accuracy.
xgb_holdout_pct = round(accuracy_score(Y_test, xgbpridect) * 100, 2)
xgb_cv_pct = round(xgb_score.mean() * 100, 2)

print('--------------------The Accuracy of the model--------------------------')
print("Accuracy of XGBoosting is :", xgb_holdout_pct, '%')
print("Cross validation score for XGBoosting is:", xgb_cv_pct, '%')


 Adaboosting

In [None]:
# AdaBoost: fit on the training split, then predict the hold-out split.
# random_state is pinned so repeated runs produce the same ensemble and scores.
adbclf = AdaBoostClassifier(random_state=42)
adbclf.fit(X_train, Y_train)
adbpredict = adbclf.predict(X_test)

In [None]:
# 10-fold cross-validation of AdaBoost on the full labelled data, then the
# confusion matrix of its cross-validated predictions.
adb_score = cross_val_score(adbclf, X, Y, cv=10, scoring='accuracy')
predictadbclf = cross_val_predict(adbclf, X, Y, cv=10)
adb_confusion = confusion_matrix(Y, predictadbclf)
sns.heatmap(adb_confusion, annot=True, fmt='3.0F')
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Percentages (2 decimals): hold-out accuracy and mean cross-validation accuracy.
adb_holdout_pct = round(accuracy_score(Y_test, adbpredict) * 100, 2)
adb_cv_pct = round(adb_score.mean() * 100, 2)

print('--------------------The Accuracy of the model--------------------------')
print("Accuracy of Adaboosting is :", adb_holdout_pct, '%')
print("Cross validation score for Adaboosting  is:", adb_cv_pct, '%')


In [None]:
# Gradient boosting: fit on the training split only.
# random_state is pinned so the scores and the submission built from
# gbclfpredict1 are reproducible across runs.
gbclf = GradientBoostingClassifier(random_state=42)
gbclf.fit(X_train, Y_train)
# Predictions for the hold-out split (evaluation) and for the competition test
# frame (used later to build the submission file).
gbclfpredict = gbclf.predict(X_test)
gbclfpredict1 = gbclf.predict(test)

In [None]:
# 10-fold cross-validation of gradient boosting on the full labelled data.
gb_score = cross_val_score(gbclf, X, Y, cv=10, scoring='accuracy')
# Stored under its own name: the original reused `predictadbclf`, silently
# replacing AdaBoost's cross-validated predictions.
predictgb = cross_val_predict(gbclf, X, Y, cv=10)
sns.heatmap(confusion_matrix(Y, predictgb), annot=True, fmt='3.0F')
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Percentages (2 decimals): hold-out accuracy and mean cross-validation accuracy.
gb_holdout_pct = round(accuracy_score(Y_test, gbclfpredict) * 100, 2)
gb_cv_pct = round(gb_score.mean() * 100, 2)

print('--------------------The Accuracy of the model--------------------------')
print("Accuracy of Gradient Boosting is :", gb_holdout_pct, '%')
print("Cross validation score for Gradient Boosting is:", gb_cv_pct, '%')


In [None]:
"""sclf=StackingClassifier(classifiers=[knn,rfclf,nb,svc,dt,xgb,adbclf,gbclf],meta_classifier=lr)
for clf, label in zip([knn,rfclf,nb,svc,dt,xgb,adbclf,gbclf,sclf],
                      ['KNN',
                       'Random Forest',
                       'Naive Bayes',
                       'SVM',
                       'Decision Tree',
                       'XGBoost',
                       'Adaboosting',
                       'GradientBoosting',
                       'StackingClassifier']):
    scores=cross_val_score(clf,X,Y,cv=10,scoring='accuracy')
    print("Accuracy:%0.2f  [%s]" %(scores.mean(),label))
    """

StackingClassifier

In [None]:
# Stack every base model defined above, with the logistic regression as meta-learner.
sclf = StackingClassifier(
    classifiers=[knn, rfclf, nb, svc, dt, xgb, adbclf, gbclf],
    meta_classifier=lr,
)
sclf.fit(X_train, Y_train)
sclfpredict = sclf.predict(X_test)

# 10-fold cross-validation on the full labelled data.
stck_score = cross_val_score(sclf, X, Y, cv=10, scoring='accuracy')
# Stored under its own name: the original reused `predictadbclf`, overwriting
# another model's cross-validated predictions.
predictstack = cross_val_predict(sclf, X, Y, cv=10)

print('--------------------The Accuracy of the model--------------------------')
print("Accuracy of Stacking is :",round(accuracy_score(Y_test,sclfpredict)*100,2),'%')
print("Cross validation score for Stacking is:",round(stck_score.mean()*100,2),'%')

sns.heatmap(confusion_matrix(Y, predictstack), annot=True, fmt='3.0F')
plt.title("Confusion Matrix")
# Render the figure explicitly, like every other confusion-matrix cell, instead of
# leaving the Text object returned by plt.title as the cell output.
plt.show()

In [None]:
# Mean 10-fold cross-validation accuracy of every model, best first.
# Labels and scores are listed in the same order; the gradient-boosting row was
# previously mislabelled 'Gradient Decent'.
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression',
              'Random Forest', 'Naive Bayes', 'AdaBoostClassifier',
              'Gradient Boosting', 'XGBoosting',
              'Decision Tree','Stacking Classifier'],
    'Score': [svc_score.mean(), knn_score.mean(), lr_score.mean(),
              rf_score.mean(), nb_score.mean(), adb_score.mean(),
              gb_score.mean(), xgb_score.mean(), dt_score.mean(),stck_score.mean()]})
models.sort_values(by='Score',ascending=False)

In [None]:
# Hold-out accuracy of every model on the 30% evaluation split, best first.
# Labels and scores are listed in the same order; the gradient-boosting row was
# previously mislabelled 'Gradient Decent'.
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression',
              'Random Forest', 'Naive Bayes', 'AdaBoostClassifier',
              'Gradient Boosting', 'XGBoosting',
              'Decision Tree','Stacking Classifier'],
    'Accuracy': [accuracy_score(Y_test,svcpredict), accuracy_score(Y_test,knnpredict),accuracy_score(Y_test,predict_val),
              accuracy_score(Y_test,rfpredict), accuracy_score(Y_test,nbclf), accuracy_score(Y_test,adbpredict),
              accuracy_score(Y_test,gbclfpredict), accuracy_score(Y_test,xgbpridect),accuracy_score(Y_test,dtpredict),accuracy_score(Y_test,sclfpredict)]})
models.sort_values(by='Accuracy',ascending=False)

In [None]:
# Wrap the gradient-boosting predictions for the competition test frame in a DataFrame.
pred=pd.DataFrame(gbclfpredict1)
# The sample submission is read for its PassengerId column, which was dropped
# from `test` during preprocessing.
submsdf=pd.read_csv('../input/titanic/gender_submission.csv')


In [None]:
submsdf.head()

In [None]:
# Shapes of the prediction frame and the sample submission, in that order.
for frame in (pred, submsdf):
    print(frame.shape)

In [None]:
# Put the passenger ids and the predictions side by side (joined on row index),
# name the two columns, and write the result without the index column.
passenger_ids = submsdf['PassengerId']
datasets = pd.concat([passenger_ids, pred], axis=1)
datasets.columns = ['PassengerId', 'Survived']
datasets.to_csv('resultdf3.csv', index=False)


In [None]:
#d=pd.read_csv('C://Users//user//Desktop//resultdf3.csv')
#print(d.head())