In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from warnings import filterwarnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from scipy.stats import norm
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score,KFold
import plotly.express as px
from sklearn.svm import SVC
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=True)

In [None]:
# Load the Titanic training and test CSV files from the Kaggle input directory.
train_df=pd.read_csv("/kaggle/input/titanic/train.csv")
test_df=pd.read_csv("/kaggle/input/titanic/test.csv")

In [None]:
train_df.head()

In [None]:
train_df.isnull().sum()

In [None]:
# Cabin is null for a large share of the rows, so remove the column entirely.
train_df = train_df.drop("Cabin", axis=1)
train_df

In [None]:
# Also remove a few more columns that are not used as features for the prediction.
train_df = train_df.drop(["PassengerId", "Name", "Ticket"], axis=1)
train_df

In [None]:
#missing values
train_df.isnull().sum()

In [None]:
train_df.groupby("Pclass")["Age"].mean()

In [None]:
#Now, Let's fill all the missing values to proceed for our analysis
def Age(cols):
    """Return a passenger's age, substituting a per-class mean when it is missing.

    Parameters
    ----------
    cols : sequence
        Two values in the order ``(age, pclass)``, e.g. one row of
        ``df[["Age", "Pclass"]]`` as handed over by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    float
        The age itself when it is present; otherwise 38.23, 29.87 or 25.14 for
        passenger class 1, 2 or 3. NaN when the age is missing and the class is
        none of those three (instead of silently returning ``None``).
    """
    # Unpack by iteration instead of ``cols[0]`` / ``cols[1]``: integer keys on a
    # label-indexed Series rely on a deprecated positional fallback in pandas.
    age, pclass = tuple(cols)[:2]
    if not pd.isnull(age):
        return age
    # Mean age per passenger class, used as the fill value.
    class_mean_age = {1: 38.23, 2: 29.87, 3: 25.14}
    return class_mean_age.get(pclass, float("nan"))
    

In [None]:
train_df["Age"]=train_df[["Age","Pclass"]].apply(Age,axis=1)

In [None]:
train_df.isnull().sum()

In [None]:
#let's see the rows with missing values
train_df[train_df.isnull().any(axis=1)]

In [None]:
train_df.groupby("Pclass")["Embarked"].value_counts()

In [None]:
# Fill the missing Embarked entries with the most frequent value of that column.
# Double brackets keep the selection 2-D, which is the shape the imputer works on.
impute=SimpleImputer(strategy="most_frequent")
train_df[["Embarked"]]=impute.fit_transform(train_df[["Embarked"]])

In [None]:
# Bar chart of one age value per passenger class.
# The heights are hard-coded; they are the same three constants the Age() helper
# returns for classes 1, 2 and 3.
x=["Pclass-1","Pclass-2","Pclass-3"]
y=[38.23,29.87,25.14]
plt.bar(x,y)
plt.xlabel("PCLASS")
plt.ylabel("AGE")
plt.title("Age in different classes")
plt.show()

In [None]:
# Bar chart of how many passengers fall into each Survived value.
# Matplotlib renamed its bundled "seaborn" style to "seaborn-v0_8" and later removed
# the old name, so pick whichever one this installation actually provides.
seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(seaborn_style)
color=plt.cm.ocean(np.linspace(0,2,5))
train_df["Survived"].value_counts().plot.bar(color=color,figsize=(12,8))
plt.title("number of passengers by Survived")
plt.xlabel("Survived")
plt.ylabel("total count")
plt.show()

In [None]:
train_df.groupby("Sex")["Survived"].value_counts()

In [None]:
# Bar chart of the number of male and female passengers.
# Matplotlib renamed its bundled "seaborn" style to "seaborn-v0_8" and later removed
# the old name, so pick whichever one this installation actually provides.
seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(seaborn_style)
color=plt.cm.ocean(np.linspace(0,2,5))
train_df["Sex"].value_counts().plot.bar(color=color,figsize=(12,8))
plt.title("number of Male and Female")
plt.xlabel('SEX')
plt.ylabel("total count")
plt.show()

In [None]:
train_df["Embarked"].value_counts().plot.pie(figsize=(12,8),explode=(0.1,0.1,0.1),autopct="%1.2f%%")
plt.title("Embarked",fontsize=15)
plt.show()

In [None]:
train_df["Pclass"].value_counts().plot.pie(figsize=(12,8),explode=(0.1,0.1,0.1),autopct="%1.2f%%")
plt.title("Percentage of People in different class",fontsize=15)
plt.show()

In [None]:
# Number of survivors (Survived == 1) for each sex.
s=train_df.groupby("Sex")["Survived"].value_counts()
color=plt.cm.ocean(np.linspace(0,2,5))
x=["Female","male"]
# Select the counts by their (sex, survived) labels. Positional look-ups such as
# s[0] / s[3] depend on how value_counts happens to order each group and on a
# deprecated integer-key fallback in pandas.
y=[s.loc[("female",1)],s.loc[("male",1)]]
plt.bar(x,y,color=color)
plt.xlabel('SEX')
plt.ylabel("Survived")
plt.show()

In [None]:
# Correlation heatmap of the numeric columns.
# Sex and Embarked are still string columns at this point (they are only encoded
# further down), and DataFrame.corr() raises on non-numeric data in pandas >= 2.0,
# so restrict the frame to numeric dtypes first.
sns.heatmap(train_df.select_dtypes(include="number").corr(),annot=True)
plt.show()

In [None]:
#To see how age is distributed
train_df["Age"].hist(figsize=(12,8),bins=15)
plt.show()

In [None]:
#To see how Fare is distributed
train_df["Fare"].hist(figsize=(12,8),bins=25)
plt.show()

In [None]:
# Mean fare for each Embarked value (C, Q, S).
e=train_df.groupby("Embarked")["Fare"].mean()
color=plt.cm.ocean(np.linspace(0,2,7))
x=["C_Embark","Q_Embark","S_Embark"]
# Look the means up by label so each bar is guaranteed to match its tick text;
# integer keys on this string-indexed Series are a deprecated positional fallback.
y=[e.loc["C"],e.loc["Q"],e.loc["S"]]
plt.bar(x,y,color=color)
plt.xlabel("Embarked",fontsize=(15))
plt.ylabel("Fare",fontsize=(15))
plt.show()

In [None]:
fig=px.sunburst(train_df.groupby(["Pclass","Sex","Survived"]).size().reset_index(name="count"),path=["Pclass","Sex","Survived"],values="count",title="Count of Survived Based on Pclass and Sex")
fig.show()

In [None]:
# One-hot encode the two categorical columns.
# drop_first=True omits the first category of each column, so a column with k
# categories produces k-1 indicator columns.
sex=pd.get_dummies(train_df["Sex"],drop_first=True)
Embark=pd.get_dummies(train_df["Embarked"],drop_first=True)

In [None]:
train_df=train_df.drop(columns=["Sex","Embarked"],axis=1)

In [None]:
train_df=pd.concat([train_df,sex,Embark],axis=1)

In [None]:
X=train_df.iloc[:,1:].values
y=train_df.iloc[:,0].values

In [None]:
# Standardise every feature column; `sc` is reused later for the Kaggle test set.
# NOTE(review): the scaler is fitted on all of X before the train/test split that
# follows, so hold-out rows contribute to the scaling statistics -- consider
# splitting first and fitting on the training part only.
sc=StandardScaler()
X=sc.fit_transform(X)

In [None]:
# Hold out 20% of the rows for evaluation; random_state is fixed so the split is repeatable.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=100)

In [None]:
rfc=RandomForestClassifier(max_depth=10,random_state=1)

In [None]:
# Train the random forest on the training split and predict the hold-out split.
rfc.fit(X_train,y_train)
pred=rfc.predict(X_test)
# Side-by-side table of true labels and predictions.
pred_df=pd.DataFrame({"Actual":y_test,"pred":pred})
print(pred_df)

In [None]:
cm=confusion_matrix(y_test,pred)
cm

In [None]:
report=classification_report(y_test,pred)
print("RandomForestClassifier report \n",report)

In [None]:
# Train AdaBoost on the training split and predict the hold-out split.
# random_state is fixed (same seed as the random forest above) so that repeated
# runs of the notebook produce the same model and the same report.
ada=AdaBoostClassifier(random_state=1)
ada.fit(X_train,y_train)
pred=ada.predict(X_test)
# Side-by-side table of true labels and predictions.
pred_df=pd.DataFrame({"Actual":y_test,"pred":pred})
print(pred_df)

In [None]:
cm=confusion_matrix(y_test,pred)
cm

In [None]:
report=classification_report(y_test,pred)
print("AdaBoostClassifier report \n",report)

In [None]:
# Train a k-nearest-neighbours classifier and predict the hold-out split.
classifier = KNeighborsClassifier()
classifier.fit(X_train, y_train)
# The predictions must be stored in `pred`: the table below and the following
# confusion-matrix / report cells all read `pred`, so assigning to any other name
# makes them score the previous model's predictions instead of this one's.
pred = classifier.predict(X_test)
pred_df = pd.DataFrame({"Actual": y_test, "pred": pred})
print(pred_df)

In [None]:
cm=confusion_matrix(y_test,pred)
cm

In [None]:
report=classification_report(y_test,pred)
print("KNeighborsClassifier report \n",report)

In [None]:
# Train a support-vector classifier and predict the hold-out split.
classifier = SVC()
classifier.fit(X_train, y_train)
# The predictions must be stored in `pred`: the table below and the following
# confusion-matrix / report cells all read `pred`, so assigning to any other name
# makes them score the previous model's predictions instead of this one's.
pred = classifier.predict(X_test)
pred_df = pd.DataFrame({"Actual": y_test, "pred": pred})
print(pred_df)

In [None]:
cm=confusion_matrix(y_test,pred)
cm

In [None]:
report=classification_report(y_test,pred)
print("SVC report \n",report)

In [None]:
# Train a logistic-regression classifier and predict the hold-out split.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
# The predictions must be stored in `pred`: the table below and the following
# confusion-matrix / report cells all read `pred`, so assigning to any other name
# makes them score the previous model's predictions instead of this one's.
pred = classifier.predict(X_test)
pred_df = pd.DataFrame({"Actual": y_test, "pred": pred})
print(pred_df)

In [None]:
cm=confusion_matrix(y_test,pred)
cm

In [None]:
report=classification_report(y_test,pred)
print("Logistic Regression \n",report)

In [None]:
# Train a Gaussian naive-Bayes classifier and predict the hold-out split.
model = GaussianNB()
model.fit(X_train, y_train)
# The predictions must be stored in `pred`: the table below and the following
# confusion-matrix / report cells all read `pred`, so assigning to any other name
# makes them score the previous model's predictions instead of this one's.
pred = model.predict(X_test)
pred_df = pd.DataFrame({"Actual": y_test, "pred": pred})
print(pred_df)

In [None]:
cm=confusion_matrix(y_test,pred)
cm

In [None]:
report=classification_report(y_test,pred)
print("GaussianNB \n",report)

In [None]:
# Remove the same columns from the test set that were removed from the training set.
test = test_df.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1)

In [None]:
test

In [None]:
def Age(cols):
    """Return a passenger's age, substituting a per-class mean when it is missing.

    This re-defines the helper of the same name that appears earlier in the
    notebook, so the test-set cells can be run from here.

    Parameters
    ----------
    cols : sequence
        Two values in the order ``(age, pclass)``, e.g. one row of
        ``df[["Age", "Pclass"]]`` as handed over by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    float
        The age itself when it is present; otherwise 38.23, 29.87 or 25.14 for
        passenger class 1, 2 or 3. NaN when the age is missing and the class is
        none of those three (instead of silently returning ``None``).
    """
    # Unpack by iteration instead of ``cols[0]`` / ``cols[1]``: integer keys on a
    # label-indexed Series rely on a deprecated positional fallback in pandas.
    age, pclass = tuple(cols)[:2]
    if not pd.isnull(age):
        return age
    # Mean age per passenger class, used as the fill value.
    class_mean_age = {1: 38.23, 2: 29.87, 3: 25.14}
    return class_mean_age.get(pclass, float("nan"))

In [None]:
test["Age"]=test[["Age","Pclass"]].apply(Age,axis=1)

In [None]:
sex=pd.get_dummies(test["Sex"],drop_first=True)
Embark=pd.get_dummies(test["Embarked"],drop_first=True)

In [None]:
test=test.drop(columns=["Sex","Embarked"],axis=1)

In [None]:
test=pd.concat([test,sex,Embark],axis=1)

In [None]:
test.isnull().sum()

In [None]:
# Replace missing Fare values in the test set with the mean Fare of the test set.
test["Fare"]=test["Fare"].fillna(test["Fare"].mean())

In [None]:
# Apply the scaler that was already fitted on the training features. Calling
# fit_transform here would re-estimate the mean and variance from the test set, so
# the model would be fed features on a different scale than it was trained on.
# The scaler was fitted on a plain array, so pass a plain float array here as well.
test=sc.transform(test.to_numpy(dtype=float))

In [None]:
y_pred=rfc.predict(test)

In [None]:
y_pred

In [None]:
# Pair each test PassengerId with its predicted Survived value and write the
# result to my_submission.csv in the working directory (without the index column).
output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Survived': y_pred})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved.")

In [None]:
print(output.head())

# **If you like my notebook, don't forget to upvote it.**