<div class="alert alert-block alert-success">
    <h1 align="center">Machine Learning in Python</h1>
    <h3 align="center">Titanic Multi Model</h3>
</div>

<img src = "https://storage.googleapis.com/kaggle-competitions/kaggle/3136/logos/front_page.png" width=50%>

## Importing the libraries

In [None]:
#Let's Import the Packages...
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [None]:
import warnings
# Suppress every warning for the rest of the notebook so cell outputs stay compact.
# NOTE(review): this also hides deprecation warnings raised by pandas/seaborn/sklearn.
warnings.filterwarnings('ignore')

<div class="alert alert-block alert-info" dir="ltr" style="text-align: center;"><strong><span style="font-family: courier new, courier; font-size: 18pt;">Load Data</span></strong>
</div>

## Load 

In [None]:
# Read the training CSV into a DataFrame.
train_csv_path = "../input/titanic/train.csv"
titanic = pd.read_csv(train_csv_path)
# Display five randomly sampled rows.
titanic.sample(n=5)

<div class="alert alert-block alert-info" dir="ltr" style="text-align: center;"><strong><span style="font-family: courier new, courier; font-size: 18pt;">Prepare Data and EDA </span></strong>
<p style="text-align: center;"><strong><span style="font-family: courier new, courier; font-size: 18pt;"> 
</div>

In [None]:
titanic.index

In [None]:
# Print column dtypes and non-null counts for the raw data.
titanic.info()
# Some features have dtype object (strings). Author's working assumption: apart from
# Sex, these have little effect on survival (Embarked is nevertheless kept in the selection below).

In [None]:
# Keep the passenger id, the target and the features assumed to affect survival.
selected_columns = [
    'PassengerId', 'Survived', 'Pclass', 'Sex', 'Age',
    'SibSp', 'Parch', 'Fare', 'Embarked',
]
titanic_select = titanic.loc[:, selected_columns]
titanic_select

In [None]:
# Print column dtypes and non-null counts for the reduced dataset.
titanic_select.info()
# Two problems remain: (1) Sex and Embarked are object (string) columns;
# (2) Age and Embarked have missing values.

In [None]:
# Number of missing values in each column.
titanic_select.isna().sum()
# Age has 177 missing values.

In [None]:
# Plot where values are missing, to help choose an imputation strategy.
missing_mask = titanic_select.isnull()
missing_ax = sns.heatmap(missing_mask, cbar=False, cmap="YlGnBu")
missing_ax.set_title("Missing values ")
# The missing entries look evenly spread across the rows, so the mean is used to fill them.

In [None]:
# Fill missing numeric values with the mean of their column.
# numeric_only=True is required because Sex and Embarked are still strings here:
# on pandas >= 2.0, DataFrame.mean() raises TypeError when non-numeric columns are included.
# The result is assigned back instead of using inplace=True, so the cell can be re-run safely.
titanic_select = titanic_select.fillna(titanic_select.mean(numeric_only=True))
# Count the NaN values left in each column (string columns are not filled by this step).
titanic_select.isnull().sum()

In [None]:
# Remove the rows that still contain a missing value (the 2 rows with NaN in Embarked).
titanic_select.dropna(axis=0, how="any", inplace=True)

In [None]:
titanic_select.isnull().sum()

In [None]:
titanic_select.info()

In [None]:
titanic_select

In [None]:
# Encoding: replace the string categories of Sex and Embarked with integer codes.
category_codes = {
    'Sex': {'male': 0, 'female': 1},
    'Embarked': {'C': 0, 'S': 1, 'Q': 2},
}
titanic_select.replace(category_codes, inplace=True)

In [None]:
# Convert Age from float to int (astype(int) discards the fractional part).
titanic_select['Age'] = titanic_select['Age'].astype(int)
titanic_select

In [None]:
# Round Fare to the nearest whole number, then store it as int.
titanic_select["Fare"] = np.round(titanic_select["Fare"]).astype(int)

In [None]:
titanic_select.info()

In [None]:
# Compute the pairwise correlation matrix of the columns and show its shape.
corr = titanic_select.corr()
corr.shape

In [None]:
# Draw the correlation matrix as an annotated heatmap.
plt.figure(figsize=(25, 25))
heatmap_options = dict(
    cbar=True, square=True, fmt='.1f', annot=True,
    annot_kws={'size': 15}, cmap='YlGnBu',
)
sns.heatmap(corr, **heatmap_options)

In [None]:
# Share of each Survived value within every passenger class (normalize="index" makes each row sum to 1).
# The following cells repeat this cross-tabulation for Sex, Embarked, SibSp and Parch.
pd.crosstab(titanic_select.Pclass,titanic_select.Survived,normalize = "index" )

In [None]:
pd.crosstab(titanic_select.Sex,titanic_select.Survived,normalize = "index" )


In [None]:
pd.crosstab(titanic_select.Embarked,titanic_select.Survived,normalize = "index" )

In [None]:
pd.crosstab(titanic_select.SibSp,titanic_select.Survived,normalize = "index" )

In [None]:
pd.crosstab(titanic_select.Parch,titanic_select.Survived,normalize = "index" )

<div class="alert alert-block alert-info" dir="ltr" style="text-align: center;"><strong><span style="font-family: courier new, courier; font-size: 18pt;">Visualization </span></strong>
<p style="text-align: center;"><strong><span style="font-family: courier new, courier; font-size: 18pt;"> 
</div>

In [None]:
print('seaborn: %s' % sns.__version__)

In [None]:
# Distribution of the Survived column, coloured by Sex
sns.displot(data=titanic, x="Survived",hue='Sex') 

In [None]:
sns.displot(data=titanic, x="Survived",hue='Embarked',kind ='kde') 

In [None]:
sns.displot(data=titanic, x="Survived",hue='Parch',kind ='kde')

In [None]:
sns.displot(data=titanic_select, x="Fare",hue='Survived',kind ='kde')

In [None]:
sns.displot(data=titanic_select, x="Parch",hue='Survived',kind ='kde')

In [None]:
sns.catplot(x="Survived", y="Age", data=titanic_select,kind="swarm")

In [None]:
sns.catplot(x="Survived", y="Fare",hue = 'Sex', data=titanic_select,kind="violin")

In [None]:
sns.catplot(x="Survived", y="SibSp",hue = 'Sex', data=titanic_select,kind="swarm")

In [None]:
# Bar plot of Survived against Pclass, split by Sex.
# sns.factorplot was renamed to sns.catplot and later removed from seaborn, and its
# `size` argument became `height`. x/y/hue are passed by keyword because recent
# seaborn versions no longer accept them positionally.
sns.catplot(x="Pclass", y="Survived", hue="Sex", data=titanic_select,
            kind="bar", height=6, palette="muted", legend_out=False)

<div class="alert alert-block alert-info" dir="ltr" style="text-align: center;"><strong><span style="font-family: courier new, courier; font-size: 18pt;">Train  Models </span></strong>
<p style="text-align: center;"><strong><span style="font-family: courier new, courier; font-size: 18pt;">
</div>

<div class="alert alert-block alert-info" dir="ltr" style="text-align: center;"><strong><span style="font-family: courier new, courier; font-size: 18pt;">Logistic Regression </span></strong>

In [None]:
from sklearn.linear_model import LogisticRegression # for Logistic Regression Algorithm
from sklearn.model_selection import train_test_split # to split the dataset for training and testing 
from sklearn import metrics # for checking the model accuracy


In [None]:
# Feature matrix: every selected column except the identifier and the target.
X = titanic_select.drop(columns=['PassengerId', 'Survived'])
# Target vector.
y = titanic_select['Survived']
for frame_part in (X, y):
    print(frame_part.shape)
X

In [None]:
# Hold out 25% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=4
)
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

In [None]:
# Fit a logistic regression (liblinear solver) on the training split.
# fit() returns the estimator itself, so the cell still displays the fitted model.
logreg = LogisticRegression(solver='liblinear').fit(X_train, y_train)
logreg

In [None]:
y_pred = logreg.predict(X_test)

In [None]:
# Accuracy of the logistic regression on the held-out split.
logreg_accuracy = metrics.accuracy_score(y_test, y_pred)
print('The accuracy of Logistic Regression is: ', logreg_accuracy)

In [None]:
 #Recall
from sklearn.metrics import recall_score
# Recall of the logistic regression on the held-out split.
logreg_recall = recall_score(y_test, y_pred)
print('The Recall of Logistic Regression is: ', logreg_recall)

In [None]:
# Precision of the logistic regression on the held-out split.
from sklearn.metrics import precision_score
logreg_precision = precision_score(y_test, y_pred)
print('The Precision of Logistic Regression is: ', logreg_precision)

In [None]:
# Confusion matrix for the held-out test split (rows: predicted, columns: actual).
# Scoring the full X would include the rows the model was trained on, and the table
# would not match the accuracy/recall/precision printed above for the test split.
confusion_matrix = pd.crosstab(logreg.predict(X_test), y_test,
                               rownames=['Predicted'], colnames=['Actual'])

In [None]:
confusion_matrix

In [None]:
sns.heatmap(confusion_matrix, annot=True)
plt.show()

<div class="alert alert-block alert-info" dir="ltr" style="text-align: center;"><strong><span style="font-family: courier new, courier; font-size: 18pt;">Evaluate logistic regression (logreg) on test.csv and write LogesticReg.csv</span></strong>
</div>

In [None]:
data_test = pd.read_csv('../input/titanic/test.csv')

In [None]:
X_test_d = data_test.loc[:,['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]

In [None]:
X_test_d

In [None]:
X_test_d.isnull().sum()

In [None]:
# Fill missing values in the Kaggle test features with the column means of the
# prepared training frame (titanic_select); fillna matches the means by column name.
X_test_d.fillna(titanic_select.mean(), inplace=True)

In [None]:
X_test_d.isnull().sum()

In [None]:
# Apply the same encoding and integer casts to the Kaggle test features
# that were applied to the training frame.
test_category_codes = {
    'Sex': {'male': 0, 'female': 1},
    'Embarked': {'C': 0, 'S': 1, 'Q': 2},
}
X_test_d.replace(test_category_codes, inplace=True)
X_test_d['Age'] = X_test_d['Age'].astype(int)
X_test_d["Fare"] = np.round(X_test_d["Fare"]).astype(int)
X_test_d

In [None]:
y_PredLogTest=logreg.predict(X_test_d)
y_PredLogTest

In [None]:
Survived = pd.DataFrame(y_PredLogTest, columns = ['Survived'])

In [None]:
# Build the submission file: load gender_submission.csv, overwrite its Survived
# column with the logistic-regression predictions, and save it as LogesticReg.csv.
# NOTE(review): assumes gender_submission.csv lists passengers in the same order as test.csv — confirm.
csv_input = pd.read_csv('../input/titanic/gender_submission.csv')
csv_input['Survived'] = Survived
csv_input.to_csv('LogesticReg.csv', index=False)

<div class="alert alert-block alert-info" dir="ltr" style="text-align: center;"><strong> <span style="font-family: courier new, courier; font-size: 18pt;"> KNN Classifier</span></strong>
</div>

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Same 75/25 split (random_state=4) as used for the logistic regression.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=4
)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

In [None]:
knn = KNeighborsClassifier()

In [None]:
knn.fit(X_train, y_train)

In [None]:
y_pred_knn = knn.predict(X_test)

In [None]:
knn.score(X_train, y_train)

In [None]:
knn.score(X_test, y_test)

## Evaluation

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_test, y_pred_knn))

In [None]:
cm =confusion_matrix(y_test, y_pred_knn)
sns.heatmap(cm, square=True , annot=True)

 ## Optimal value of K

In [None]:
# Test-split error rate for every K from 1 to 39.
# Might take some time
error_rate = []
for n_neighbors in range(1, 40):
    # `knn` is rebound on every pass, so after the loop it holds the K=39 model.
    knn = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)
    pred_k = knn.predict(X_test)
    # Fraction of test rows whose prediction differs from the true label.
    error_rate.append(np.mean(pred_k != y_test))

In [None]:
# Plot the error rate against K.
k_values = range(1, 40)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, error_rate, color='blue', linestyle='dashed', marker='o',
        markerfacecolor='red', markersize=15)
ax.set_title('Error Rate vs. K Value')
ax.set_xlabel('K')
ax.set_ylabel('Error Rate')

In [None]:
knn7 = KNeighborsClassifier(n_neighbors=7)

In [None]:
knn7.fit(X_train, y_train)

In [None]:
# Predict the held-out split with the K=7 model fitted above.
# The original called `knn`, which at this point is the last model left over from
# the K-search loop (n_neighbors=39), not knn7.
y_pred_knn7 = knn7.predict(X_test)

In [None]:
knn7.score(X_train, y_train)

In [None]:
knn7.score(X_test, y_test)

# **Evaluate KNN (knn7) on test.csv and write y_Predknn7Test.csv**

In [None]:
# Predict the Kaggle test features with knn7 and write y_Predknn7Test.csv.
y_Predknn7Test = knn7.predict(X_test_d)
Survived = pd.DataFrame({'Survived': y_Predknn7Test})
# gender_submission.csv is loaded as the output frame; its Survived column is overwritten.
csv_input = pd.read_csv('../input/titanic/gender_submission.csv')
csv_input['Survived'] = Survived
csv_input.to_csv('y_Predknn7Test.csv', index=False)

<div class="alert alert-block alert-info" dir="ltr" style="text-align: center;"><strong> <span style="font-family: courier new, courier; font-size: 18pt;"> Decision Tree</span></strong>

In [None]:
from sklearn.model_selection import train_test_split
# 75/25 train/test split for the decision tree.
# NOTE(review): random_state=0 here, while the logistic-regression and KNN sections use
# random_state=4, so the models are scored on different held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Entropy-based decision tree with a fixed seed, fitted on the training split.
# fit() returns the estimator, so the cell still displays the fitted model.
classifier_Decicsion = DecisionTreeClassifier(
    criterion='entropy', random_state=0
).fit(X_train, y_train)
classifier_Decicsion

In [None]:
y_pred_Decision = classifier_Decicsion.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
# Print the confusion matrix; the accuracy is shown as the cell result.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_Decision)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred_Decision)

# **Evaluate the Decision Tree on test.csv and write y_Pred_Dici_data.csv**

In [None]:
# Predict the Kaggle test features with the decision tree and write y_Pred_Dici_data.csv.
csv_input = pd.read_csv('../input/titanic/gender_submission.csv')
y_Pred_Dici_data = classifier_Decicsion.predict(X_test_d)
Survived = pd.DataFrame({'Survived': y_Pred_Dici_data})
# Overwrite the Survived column of the loaded frame with the predictions.
csv_input['Survived'] = Survived
csv_input.to_csv('y_Pred_Dici_data.csv', index=False)

<div class="alert alert-block alert-info" dir="ltr" style="text-align: center;"><strong> <span style="font-family: courier new, courier; font-size: 18pt;"> Random Forest </span></strong>

In [None]:
# 75/25 train/test split for the random forest (random_state=4).
split_parts = train_test_split(X, y, test_size=0.25, random_state=4)
X_train, X_test, y_train, y_test = split_parts
for shape in (X_train.shape, y_train.shape, X_test.shape, y_test.shape):
    print(shape)

In [None]:
# Build and fit the random forest classifier (entropy criterion, fixed seed).
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(criterion='entropy', random_state=42).fit(X_train, y_train)
# fit() returns the estimator; show it as the cell result.
rfc

In [None]:
from sklearn.metrics import f1_score
# F1 score of the random forest on the held-out split.
rfc_pred_test = rfc.predict(X_test)
rfc_f1 = f1_score(y_test, rfc_pred_test)
print('Testing Set Evaluation F1-Score=>', rfc_f1)

# **Evaluate the Random Forest on test.csv and write y_Pred_rfc_data.csv**

In [None]:
# Predict the Kaggle test features with the random forest and write y_Pred_rfc_data.csv.
y_Pred_rfc_data = rfc.predict(X_test_d)
Survived = pd.DataFrame({'Survived': y_Pred_rfc_data})
# gender_submission.csv is loaded as the output frame; its Survived column is overwritten.
csv_input = pd.read_csv('../input/titanic/gender_submission.csv')
csv_input['Survived'] = Survived
csv_input.to_csv('y_Pred_rfc_data.csv', index=False)

<div class="alert alert-block alert-info" dir="ltr" style="text-align: center;"><strong> <span style="font-family: courier new, courier; font-size: 18pt;"> SVM </span></strong>

In [None]:
# 75/25 train/test split for the SVM (random_state=5).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=5
)
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

In [None]:
from sklearn.svm import SVC
# Linear-kernel support vector classifier fitted on the training split.
# fit() returns the estimator, so the cell still displays the fitted model.
svclassifier = SVC(kernel='linear').fit(X_train, y_train)
svclassifier

In [None]:
y_pred_svm = svclassifier.predict(X_test)

In [None]:
y_pred_svm

In [None]:
# Accuracy of the SVM on the held-out split, as a percentage.
accuracy = accuracy_score(y_test, y_pred_svm) * 100
# The message previously named the Logistic Regression model, mislabelling this score.
print("Accuracy of the SVM Model: ",accuracy)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test,y_pred_svm))
print(classification_report(y_test,y_pred_svm))

# **Evaluate the SVM on test.csv and write y_Pred_svm_data.csv**

In [None]:
# Predict the Kaggle test features with the SVM and write y_Pred_svm_data.csv.
csv_input = pd.read_csv('../input/titanic/gender_submission.csv')
y_Pred_svm_data = svclassifier.predict(X_test_d)
Survived = pd.DataFrame({'Survived': y_Pred_svm_data})
# Overwrite the Survived column of the loaded frame with the predictions.
csv_input['Survived'] = Survived
csv_input.to_csv('y_Pred_svm_data.csv', index=False)

<div class="alert alert-block alert-info" dir="ltr" style="text-align: center;"><strong> <span style="font-family: courier new, courier; font-size: 18pt;"> XGBoost </span></strong>

In [None]:
!pip install xgboost

In [None]:
import xgboost as xgb

In [None]:
# 75/25 train/test split for XGBoost (random_state=4).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=4
)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

In [None]:
from xgboost import XGBClassifier

In [None]:
# Fit an XGBoost classifier with default hyper-parameters on the training split.
# fit() returns the estimator, so the cell still displays the fitted model.
model = XGBClassifier().fit(X_train, y_train)
model

In [None]:
y_pred_XGB = model.predict(X_test)

In [None]:
# Accuracy of the XGBoost model on the held-out split, as a percentage.
accuracy = accuracy_score(y_test,y_pred_XGB) * 100
# The message previously named the Logistic Regression model, mislabelling this score.
print("Accuracy of the XGBoost Model: ",accuracy)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test,y_pred_XGB))
print(classification_report(y_test,y_pred_XGB))

# **Evaluate XGBoost on test.csv and write y_XGB_svm_data.csv**

In [None]:
# Predict the Kaggle test features with the XGBoost model and write y_XGB_svm_data.csv.
# The original called svclassifier.predict here, so the file contained the SVM
# predictions instead of the XGBoost ones.
y_Pred_XGB_data = model.predict(X_test_d)
Survived = pd.DataFrame(y_Pred_XGB_data, columns=['Survived'])
# gender_submission.csv is loaded as the output frame; its Survived column is overwritten.
csv_input = pd.read_csv('../input/titanic/gender_submission.csv')
csv_input['Survived'] = Survived
csv_input.to_csv('y_XGB_svm_data.csv', index=False)

# **Good luck**