# Importing Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Loading datasets

In [None]:
# Read the training and test CSVs from the Kaggle input directory.
train_csv = '/kaggle/input/titanic/train.csv'
test_csv = '/kaggle/input/titanic/test.csv'

titanic_df = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [None]:
# Preview the first five rows of the training data.
titanic_df.head(5)

In [None]:
# Print each column's name, non-null count and dtype; columns with fewer
# non-null entries than rows contain missing values.
titanic_df.info()

As observed, Age, Cabin and Embarked all have missing values. We will have to clean our data before feeding it into our prediction model.

# EDA

# Is sex a survival factor?

In [None]:
# Number of male versus female passengers.
# The column is passed as `x=`: since seaborn 0.12 the first positional
# parameter of countplot is `data` and all others are keyword-only, so a
# positional 'Sex' combined with `data=` raises a TypeError.
sns.countplot(x='Sex', data=titanic_df)

In [None]:
# As seen, the number of males is almost twice that of females

# What are the chances of a male or female surviving?

In [None]:
# Passenger counts per sex, split by survival outcome.
# The column is passed as `x=`: since seaborn 0.12 the first positional
# parameter of countplot is `data` and all others are keyword-only, so a
# positional 'Sex' combined with `data=` raises a TypeError.
sns.countplot(x='Sex', hue='Survived', data=titanic_df)

# Insight - More females survived than males. Hence, sex is a survival factor.

In [None]:
# Survival rate for each number of siblings/spouses aboard (SibSp), drawn as a bar plot.
sns.catplot(data=titanic_df, kind='bar', x='SibSp', y='Survived')

# Insight - passengers with one or two siblings have more chances of survival

# Is Class a survival factor?

In [None]:
# Number of males and females travelling in each passenger class (Pclass).
# The column is passed as `x=`: since seaborn 0.12 the first positional
# parameter of countplot is `data` and all others are keyword-only, so a
# positional 'Sex' combined with `data=` raises a TypeError.
sns.countplot(x='Sex', hue='Pclass', data=titanic_df, palette='cool')

# We have more males traveling in Pclass than females

In [None]:
# Survival rate per passenger class, drawn as a bar plot.
sns.catplot(data=titanic_df, kind='bar', x='Pclass', y='Survived')

The higher the class (i.e., the lower the `Pclass` number), the greater the chance of survival.

# Cleaning our data

In [None]:
# Count the missing values in every column.
titanic_df.isnull().sum()

Cabin and Age have missing values. We can fill the missing Age values with the mean age and drop the Cabin column.

In [None]:
# Remove the Cabin column from the training frame (modifies titanic_df in place).
titanic_df.drop(columns=['Cabin'], inplace=True)

In [None]:
# Fill missing Age values with the column mean.
# The result is assigned back to the column rather than calling
# `fillna(inplace=True)` on `titanic_df['Age']`: that form is chained
# assignment, which emits a FutureWarning in pandas 2.x and leaves the
# DataFrame unchanged once Copy-on-Write is active (the pandas 3.0 default).
titanic_df['Age'] = titanic_df['Age'].fillna(titanic_df['Age'].mean())

In [None]:
# Fill missing Embarked values with 'S'.
# The result is assigned back to the column rather than calling
# `fillna(inplace=True)` on `titanic_df['Embarked']`: that form is chained
# assignment, which emits a FutureWarning in pandas 2.x and leaves the
# DataFrame unchanged once Copy-on-Write is active (the pandas 3.0 default).
titanic_df['Embarked'] = titanic_df['Embarked'].fillna('S')

In [None]:
# Re-count missing values per column after the fills above.
titanic_df.isnull().sum()

# Dropping unnecessary columns

In [None]:
# Drop the columns that are not used as model features (modifies titanic_df in place).
unused_columns = ["PassengerId", "Ticket", "SibSp", "Parch", "Name"]
titanic_df.drop(columns=unused_columns, inplace=True)

In [None]:
# Preview the first five rows after dropping columns.
titanic_df.head(5)

# Converting categorical data to numeric using label encoding

In [None]:
# Replace the string categories in Embarked and Sex with integer codes.
# One LabelEncoder instance is refitted for each column in turn, so after this
# cell `label` holds only the classes of the last column (Sex).
label = LabelEncoder()
for column in ('Embarked', 'Sex'):
    titanic_df[column] = label.fit_transform(titanic_df[column])

In [None]:
# Preview the first five rows after encoding.
titanic_df.head(5)

# Model Building

In [None]:
# Target: the Survived column.
y = titanic_df['Survived']

# Features: every column except the target.
X = titanic_df.drop(columns='Survived')

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=5,
)

In [None]:
# Fit a 200-tree random forest on the training split (fit() returns the
# estimator itself), then report accuracy on the held-out split as a percentage.
model = RandomForestClassifier(n_estimators=200, random_state=2).fit(X_train, y_train)

y_predict = model.predict(X_test)
model_accuracy = 100 * accuracy_score(y_test, y_predict)
print("accuracy=",model_accuracy)

We got an accuracy score of 82% on the held-out validation split. Great!

# Preparing the test data for prediction

In [None]:
# Preview the first five rows of the Kaggle test data.
test.head(5)

In [None]:
# Count the missing values in every column of the test data.
test.isnull().sum()

In [None]:
# Drop the columns that are not used as model features (modifies test in place).
unused_test_columns = ["PassengerId", "Cabin", "Ticket", "SibSp", "Parch", "Name"]
test.drop(columns=unused_test_columns, inplace=True)

In [None]:
# Fill missing Age values in the test data with the test-set mean age.
# The result is assigned back to the column rather than calling
# `fillna(inplace=True)` on `test['Age']`: that form is chained assignment,
# which emits a FutureWarning in pandas 2.x and leaves the DataFrame
# unchanged once Copy-on-Write is active (the pandas 3.0 default).
test['Age'] = test['Age'].fillna(test['Age'].mean())

In [None]:
# Fill missing Embarked values with 'S' and missing Fare values with the
# test-set mean fare.
# Results are assigned back to the columns rather than calling
# `fillna(inplace=True)` on a selected column: that form is chained
# assignment, which emits a FutureWarning in pandas 2.x and leaves the
# DataFrame unchanged once Copy-on-Write is active (the pandas 3.0 default).
test['Embarked'] = test['Embarked'].fillna('S')

test['Fare'] = test['Fare'].fillna(test['Fare'].mean())

In [None]:
# Label-encode (not one-hot encode) Embarked and Sex in the test data.
# NOTE(review): the encoder is refitted on the test columns here, so the codes
# match the training codes only if both frames contain the same set of
# categories - confirm against the data.
for column in ('Embarked', 'Sex'):
    test[column] = label.fit_transform(test[column])

In [None]:
# Re-count missing values per column in the test data after the fills above.
test.isnull().sum()

In [None]:
# Predict with the fitted model on the prepared test frame.
# x_test is another name for the same DataFrame object as `test`, not a copy.
x_test = test
y_pred = model.predict(x_test)

In [None]:
# PassengerId was dropped from `test` earlier, so re-read the raw test CSV to
# recover it and pair each id with its predicted Survived value.
test_data = pd.read_csv('/kaggle/input/titanic/test.csv')
submission = test_data[["PassengerId"]].assign(Survived=y_pred)
submission.head()