In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Import the Libraries 

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

# Import the Dataset and Overview

In [None]:
# Load the training and test CSVs (in that order) from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
    )
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# (number of rows, number of columns) of the training frame.
train.shape

In [None]:
# Column dtypes and non-null counts; useful for spotting columns with missing values.
train.info()

# Exploratory Analysis and Data Preparation

In [None]:
# Survival rate (in percent) per sex. Dividing the column sum by its length
# gives a rate on the assumption that `Survived` is a 0/1 indicator.
women = train.loc[train['Sex'] == 'female', 'Survived']
men = train.loc[train['Sex'] == 'male', 'Survived']
rate_women = 100 * (women.sum() / len(women))
rate_men = 100 * (men.sum() / len(men))
print('The Percentage of Women who survived :',rate_women,'%')
print('The Percentage of Men who survived :',rate_men,'%')

In [None]:
# Correlation (scaled to percent) of every numeric column with `Survived`, sorted ascending.
# Restrict to numeric columns first: in pandas >= 2.0 `DataFrame.corr()` no longer drops
# non-numeric columns silently and raises on string columns such as `Sex`.
100 * train.select_dtypes(include='number').corr()['Survived'].sort_values()

In [None]:
# Annotated heatmap of pairwise correlations between the numeric columns.
# Only numeric columns are passed to `corr()`: in pandas >= 2.0 it raises on
# string columns such as `Sex` instead of dropping them silently.
plt.figure(figsize=(10,10))
sns.heatmap(data=train.select_dtypes(include='number').corr(), annot=True)
plt.show()

In [None]:
# Survival counts per passenger class, with readable tick and legend labels.
Pclass = ['class1', 'class2', 'class3']
ax = sns.countplot(x='Pclass', hue='Survived', data=train)
ax.set_xticks([0, 1, 2])
ax.set_xticklabels(Pclass)
ax.legend(['Not Survived', 'Survived'])
plt.show()

In [None]:
def plot(df,col):
    """Draw a count plot of ``col`` from ``df`` with the bars split by ``Survived``.

    df  : DataFrame containing ``col`` and a ``Survived`` column.
    col : name of the column placed on the x-axis.

    The figure is shown immediately; nothing is returned.
    """
    _, axis = plt.subplots(figsize=(4, 5))
    sns.countplot(x=col, hue='Survived', data=df, ax=axis)
    axis.legend(['Not Survived', 'Survived'])
    plt.show()
    
# One survival count plot per column of interest.
cols = ['Sex', 'SibSp', 'Parch', 'Embarked']
for column_name in cols:
    plot(train, column_name)

In [None]:
# Survival counts split by five equal-width age bands.
plt.figure(figsize=(4, 5))
age_bands = pd.cut(train['Age'], bins=5)
sns.countplot(x=train['Survived'], hue=age_bands)

In [None]:
# Inspect the raw fare values before binning them.
train['Fare']

In [None]:
# Bucket fares into four ordered bands.
# - include_lowest=True keeps a fare of exactly 0 in 'Low': pd.cut intervals are
#   right-closed, so 0 would otherwise fall outside (0, 7] and become NaN.
# - The top edge is open-ended so any fare above 120 lands in 'High' instead of NaN.
train['Fare_category'] = pd.cut(
    train['Fare'],
    bins=[0, 7, 14, 31, np.inf],
    labels=['Low', 'Mid', 'High_Mid', 'High'],
    include_lowest=True,
)
# Show a preview rather than dumping the whole frame into the cell output.
train.head()

In [None]:
# Survival counts per fare band, drawn with the `plot` helper defined earlier in
# the notebook (same figure size, hue and legend labels).
plot(train, 'Fare_category')

In [None]:
# Do not overwrite missing ages with a sentinel here: filling NaN with -1 leaves no
# NaN for the imputation step that follows (so it would silently skip `train`) and
# puts negative "ages" into the column. Just report how many values are missing.
train["Age"].isnull().sum()

In [None]:
# Impute missing ages with random integers drawn from [mean - std, mean + std),
# using each frame's own observed ages, then store the column as integers.
rng = np.random.default_rng(42)  # fixed seed so the imputed ages are reproducible
dataset = [train,test]
for data in dataset:
    mean = data['Age'].mean()
    std = data['Age'].std()
    n_missing = int(data['Age'].isnull().sum())
    # Explicit integer bounds instead of relying on implicit float-to-int conversion
    # inside the integer sampler.
    low, high = int(mean - std), int(mean + std)
    random_age = rng.integers(low, high, size=n_missing)
    # Fill NaN values on a copy, then write the column back in one assignment.
    age_slice = data["Age"].copy()
    age_slice[age_slice.isnull()] = random_age
    data["Age"] = age_slice.astype(int)

In [None]:
dataset = [train,test]
# `Series.mode()` returns a Series (there can be ties), not a scalar. Passing that
# Series to `fillna` aligns on index labels, so only a row labelled 0 could ever be
# filled and every other missing port would stay NaN. Take the first mode as a scalar.
embarked = train['Embarked'].mode().iloc[0]
for data in dataset:
    data['Embarked'] = data['Embarked'].fillna(embarked)

In [None]:
# Remaining missing values per column after the fills above.
train.isna().sum()

In [None]:
# Design matrix from the four chosen columns (string columns are one-hot encoded
# by get_dummies) and the target vector.
features = ['Pclass', 'Sex', 'SibSp', 'Parch']
y = train.loc[:, 'Survived']
X = pd.get_dummies(train.loc[:, features])

# Modeling

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Model Selection

In [None]:
def models(X_train,y_train,random_state=2):
    """Fit five baseline classifiers on the same data and print their training accuracy.

    Parameters
    ----------
    X_train : feature matrix passed to every estimator's ``fit``.
    y_train : target labels aligned with ``X_train``.
    random_state : seed for the estimators that accept one (decision tree, random
        forest, gradient boosting). Seeding all of them makes repeated runs give
        the same fitted models; the default of 2 matches the seed the random
        forest has always used.

    Returns
    -------
    tuple
        ``(log, tree, rnd, gb, knn)``: the fitted LogisticRegression,
        DecisionTreeClassifier, RandomForestClassifier, GradientBoostingClassifier
        and KNeighborsClassifier, in that order. Callers index into this tuple, so
        the order must not change.
    """
    from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.tree import DecisionTreeClassifier

    # (label used in the printed report, unfitted estimator), in return order.
    candidates = (
        ('Logistic Regression', LogisticRegression()),
        ('Decision Tree', DecisionTreeClassifier(random_state=random_state)),
        ('Random Forest', RandomForestClassifier(n_estimators=100, max_depth=3, random_state=random_state)),
        ('Gradient Boosting Classifier', GradientBoostingClassifier(random_state=random_state)),
        ('K-Neighbors Classifier', KNeighborsClassifier(n_neighbors=25)),
    )

    # Fit everything first so that the accuracy report below prints as one group.
    for _, estimator in candidates:
        estimator.fit(X_train, y_train)

    #Print the model accuracy of training data
    for label, estimator in candidates:
        print(label + ' Training Accuracy : ', estimator.score(X_train, y_train))

    return tuple(estimator for _, estimator in candidates)

# Fit every candidate once; `model` is the tuple (log, tree, rnd, gb, knn).
model = models(X_train=X_train, y_train=y_train)

    

In [None]:
#Test model accuracy on the held-out split using confusion matrix and accuracy score
from sklearn.metrics import confusion_matrix , accuracy_score
for fitted in model:
    # Predict once and reuse the result for both metrics.
    y_pred = fitted.predict(X_test)
    print('Model :',fitted)
    cm = confusion_matrix(y_test,y_pred)
    print(cm)
    print('Accuracy ',accuracy_score(y_test,y_pred))
    print('\n\n')

In [None]:
#Per-class precision, recall and F1 for each fitted model on the held-out split
from sklearn.metrics import classification_report
for fitted_model in model:
    print('Model :',fitted_model)
    report = classification_report(y_test, fitted_model.predict(X_test))
    print(report)
    print('\n\n')
    print('\n\n')


In the end, we find that ``RandomForestClassifier`` is more accurate than the other models.

In [None]:
# Preview the first rows of the competition test frame.
test.head()

In [None]:
# Passenger identifiers, kept for the submission file.
# NOTE: this name shadows the built-in `id()`; it is kept because the submission
# cell reads it under this name.
id = test.loc[:, 'PassengerId']

In [None]:
# Missing values per column in the competition test frame.
test.isna().sum()

In [None]:
features = ['Pclass','Sex','SibSp','Parch']
# Encode the competition test set, then align it to the training design matrix `X`:
# one-hot encoding each frame separately can yield different or differently ordered
# columns, and the fitted models expect exactly the training columns.
# NOTE: this rebinds `X_test`, replacing the validation split from train_test_split.
X_test = pd.get_dummies(test[features]).reindex(columns=X.columns, fill_value=0)

In [None]:
# The fitted random forest: third entry of the tuple returned by models().
model[2]

In [None]:
# Predict survival for the competition test set with the random forest.
random_forest = model[2]
prediction = random_forest.predict(X_test)

In [None]:
# Display the predicted labels returned by predict().
prediction

# Submission

In [None]:
# Assemble the two-column submission frame and write it to CSV without the index.
submission = pd.DataFrame(data={'PassengerId': id, 'Survived': prediction})
submission.to_csv(path_or_buf='submission.csv', index=False)