In [None]:
import numpy as np   # linear algebra
import pandas as pd  # data processing
import seaborn as sns   # data visualization
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style

# Algorithms
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
# make_blobs is exported from the public sklearn.datasets namespace; the private
# `samples_generator` submodule path is not importable on current scikit-learn.
from sklearn.datasets import make_blobs
import argparse

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Recursively walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the two Titanic competition files: `dTrain` is the training file
# and `test` is the file predictions are later generated for.
dTrain = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')

# Show which columns the training file provides.
dTrain.columns

In [None]:
# Remove the Name, Ticket and Cabin columns from the training frame.
# The frame is re-bound (no inplace=True) and errors="ignore" is passed so that
# re-running this cell after the columns are already gone does not raise KeyError.
dTrain = dTrain.drop(columns=["Name", "Ticket", "Cabin"], errors="ignore")

# Preview the first 15 rows (displayed directly; no throwaway variable needed).
dTrain.head(15)

In [None]:
# Summary statistics of the numeric training columns.
# The result is the cell's last expression, so it is rendered directly instead of
# being parked in an ambiguous single-letter global first.
dTrain.describe()

In [None]:
# Count of rows per Survived value, with one bar per passenger class.
sns.countplot(data=dTrain, x='Survived', hue='Pclass')

In [None]:
# Count of rows per Survived value, with one bar per sex.
sns.countplot(data=dTrain, x='Survived', hue='Sex')

In [None]:
# Age distribution per passenger class, drawn on a 10x7 inch figure.
plt.figure(figsize=(10, 7))
sns.boxplot(data=dTrain, x='Pclass', y='Age')

In [None]:
# Side-by-side age histograms (female on the left, male on the right), each
# overlaying survivors (18 bins) and non-survivors (40 bins). Missing ages are dropped.
survived = 'survived'
not_survived = 'not survived'

women = dTrain[dTrain['Sex'] == 'female']
men = dTrain[dTrain['Sex'] == 'male']
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

# (Survived flag, number of bins, legend label) for the two overlaid histograms.
outcome_specs = ((1, 18, survived), (0, 40, not_survived))

for group, panel in ((women, axes[0]), (men, axes[1])):
    for flag, n_bins, tag in outcome_specs:
        ages = group.loc[group['Survived'] == flag, 'Age'].dropna()
        sns.distplot(ages, bins=n_bins, label=tag, ax=panel, kde=False)
    panel.legend()

axes[0].set_title('Female')
ax = axes[1]
ax.set_title('Male')

In [None]:
# Heat map of the null mask: one column per feature, row labels hidden.
sns.heatmap(data=dTrain.isnull(), cmap="YlGnBu", yticklabels=False)

## Treatment of missing values

In [None]:
# Number of non-missing values in each column.
dTrain.notna().sum()

In [None]:
# Per-column missing-value report: absolute count and percentage of rows,
# both ordered from most to least missing.
null_mask = dTrain.isnull()
null_counts = null_mask.sum()

total = null_counts.sort_values(ascending=False)
percent_1 = null_counts / null_mask.count() * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)

missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data

In [None]:
# Heat map of null values before imputation.
sns.heatmap(data=dTrain.isnull(), cmap="YlGnBu", yticklabels=False)

In [None]:
# Fill missing ages with the mean age of the training frame.
# fillna + whole-column assignment replaces the chained indexing form
# (`df["Age"][mask] = ...`), which triggers SettingWithCopyWarning and leaves the
# frame unchanged when pandas copy-on-write is active.
dTrain["Age"] = dTrain["Age"].fillna(dTrain["Age"].mean())

# Heat map of null values after imputation.
sns.heatmap(dTrain.isnull(), yticklabels = False, cmap="YlGnBu")

## Replace categorical columns with dummy variables

In [None]:
# One-hot encode Sex; drop_first=True removes the first category's indicator column.
male = pd.get_dummies(dTrain.Sex, drop_first=True)

In [None]:
# One-hot encode passenger class and port of embarkation, dropping the first
# category of each.
pclass = pd.get_dummies(dTrain.Pclass, drop_first=True)
embarked = pd.get_dummies(dTrain.Embarked, drop_first=True)

In [None]:
# Append the indicator columns to the right of the training frame
# (order: class, sex, embarkation) and preview the result.
dummy_frames = [pclass, male, embarked]
dTrain = pd.concat([dTrain, *dummy_frames], axis=1)
dTrain.head()

In [None]:
# Feature matrix: everything except the target and the raw categorical columns
# that were already replaced by dummy columns.
# NOTE(review): PassengerId is still part of X — confirm that an identifier is
# intended as a model feature.
X = dTrain.drop(["Survived", "Pclass", "Sex", "Embarked"], axis=1)

# The Pclass dummy columns are named after the class values themselves (no prefix
# was given to get_dummies), so the frame can carry non-string labels next to
# string ones. Recent scikit-learn versions reject such mixed label types when
# fitting, so every label is normalised to str.
X.columns = X.columns.astype(str)

# Target vector.
y = dTrain["Survived"]
X.head(10)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled rows for evaluation; the split is seeded with 0.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

In [None]:
# Random Forest
# random_state is fixed so that the fitted forest and its score are reproducible
# from run to run.
random_forest = RandomForestClassifier(n_estimators=100, random_state=0)
random_forest.fit(X_train, Y_train)
Y_prediction = random_forest.predict(X_test)

# Accuracy in percent, measured on the training split the model was fitted on
# (not on the held-out split). The earlier duplicate score() call whose result
# was discarded has been removed.
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)

In [None]:
# Logistic Regression (liblinear solver), fitted on the training split.
logreg = LogisticRegression(solver='liblinear', tol=0.0001, max_iter=100).fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)

# Accuracy in percent on the training split.
acc_log = round(100 * logreg.score(X_train, Y_train), 2)

In [None]:
# K Nearest Neighbor with k = 3, fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)
Y_pred = knn.predict(X_test)

# Accuracy in percent on the training split.
acc_knn = round(100 * knn.score(X_train, Y_train), 2)

In [None]:
# Gaussian Naive Bayes, fitted on the training split.
gaussian = GaussianNB().fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)

# Accuracy in percent on the training split.
acc_gaussian = round(100 * gaussian.score(X_train, Y_train), 2)

In [None]:
# Perceptron
# The perceptron shuffles the training data between epochs by default, so
# random_state is fixed to make the reported score reproducible.
perceptron = Perceptron(max_iter=5, random_state=0)
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)

# Accuracy in percent on the training split.
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)

In [None]:
# Linear Support Vector Machine
# random_state is fixed so repeated runs of the solver give the same model.
linear_svc = LinearSVC(random_state=0)
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)

# Accuracy in percent on the training split.
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)

In [None]:
# Decision Tree
# random_state is fixed so that ties between equally good splits are broken the
# same way on every run.
decision_tree = DecisionTreeClassifier(random_state=0)
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)

# Accuracy in percent on the training split.
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)

In [None]:
# Which is the best Model ?
# Collect the accuracy computed in each model cell into one table, best first,
# indexed by score.
model_scores = [
    ('Support Vector Machines', acc_linear_svc),
    ('KNN', acc_knn),
    ('Logistic Regression', acc_log),
    ('Random Forest', acc_random_forest),
    ('Naive Bayes', acc_gaussian),
    ('Perceptron', acc_perceptron),
    ('Decision Tree', acc_decision_tree),
]
results = pd.DataFrame(model_scores, columns=['Model', 'Score'])

result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df.head(9)

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of a random forest on the training split.
# random_state is fixed so the fold scores are reproducible between runs.
rf = RandomForestClassifier(n_estimators=100, random_state=0)
scores = cross_val_score(rf, X_train, Y_train, cv=10, scoring = "accuracy")
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard Deviation:", scores.std())

In [None]:
# Random Forest
# Refit with oob_score=True so an out-of-bag accuracy estimate is available as
# `random_forest.oob_score_`. random_state is fixed for reproducibility.
random_forest = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=0)
random_forest.fit(X_train, Y_train)
Y_prediction = random_forest.predict(X_test)

# Accuracy in percent on the training split. The value is already rounded to two
# decimals, so it is printed as-is (the duplicate score() call and the second
# round() were redundant).
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
print(acc_random_forest, "%")

In [None]:
# Out-of-bag accuracy as a percentage. Scaling to percent BEFORE rounding avoids
# binary floating-point artefacts (e.g. 81.39999999999999) that appear when an
# already rounded fraction is multiplied by 100.
print("oob score:", round(random_forest.oob_score_ * 100, 2), "%")

## Submission

In [None]:
# First five rows of the training feature matrix, for comparison with the test frame.
X.head()

In [None]:
# First five rows of the raw test frame.
test.head()

In [None]:
# Remove the same Name, Ticket and Cabin columns that were removed from the training frame.
# Re-binding with errors="ignore" (instead of inplace=True) keeps the cell
# re-runnable: a second execution no longer raises KeyError.
test = test.drop(columns=["Name", "Ticket", "Cabin"], errors="ignore")
test.head(5)

## Apply the same preprocessing to the test data set

In [None]:
# One-hot encode the categorical test columns the same way as for training
# (first category dropped) and append them in the same order: class, sex, embarkation.
pclass = pd.get_dummies(test.Pclass, drop_first=True)
male = pd.get_dummies(test.Sex, drop_first=True)
embarked = pd.get_dummies(test.Embarked, drop_first=True)

xtest = pd.concat([test, pclass, male, embarked], axis=1)
xtest.head()

In [None]:
# Drop the raw categorical columns now represented by dummy columns.
xtest = xtest.drop(["Pclass", "Sex", "Embarked"], axis=1)

# The Pclass dummy columns are named after the class values themselves (no prefix
# was given to get_dummies), so the frame can carry non-string labels next to
# string ones. Recent scikit-learn versions reject such mixed label types, so
# every label is normalised to str.
xtest.columns = xtest.columns.astype(str)

In [None]:
# Fill missing Age and Fare values with the respective column mean of the test frame.
# fillna + whole-column assignment replaces the chained indexing form
# (`df[col][mask] = ...`), which triggers SettingWithCopyWarning and leaves the
# frame unchanged when pandas copy-on-write is active.
for column in ("Age", "Fare"):
    xtest[column] = xtest[column].fillna(xtest[column].mean())

# Heat map of the null mask after imputation.
sns.heatmap(xtest.isnull(), yticklabels = False, cmap="YlGnBu")

In [None]:
# Fit the model used for the submission and predict the test frame.
# random_state is fixed so that re-running the notebook regenerates the same
# submission file.
random_forest = RandomForestClassifier(n_estimators=100, random_state=0)
random_forest.fit(X_train, Y_train)
prediction = random_forest.predict(xtest)

In [None]:
# Show the first ten predicted labels for the test frame.
prediction[:10]

In [None]:
# Load the sample submission shipped with the competition to see the expected layout.
g_submission = pd.read_csv('/kaggle/input/titanic/gender_submission.csv')
g_submission.head(n=10)

In [None]:
# Build the submission frame in the same two-column layout as gender_submission.csv:
# the passenger id from the test frame and the predicted label.
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': prediction,
})
submission.head(10)

In [None]:
# Mean accuracy of the fitted forest on the held-out split.
model_score = random_forest.score(X=X_test, y=Y_test)
print(model_score)

In [None]:
# Sanity check: the submission must have exactly one row per test row.
n_rows = len(submission)
if n_rows != len(xtest):
    print("not match")
else:
    print("Same length", n_rows)

In [None]:
# Write the submission file to the working directory, without the index column.
submission.to_csv(path_or_buf="randomForest_submission.csv", index=False)

In [None]:
#submission_check=pd.read_csv("randomForest_submission.csv")
#submission_check.head(5)