In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# Any results you write to the current directory are saved as output.

**Description of Data**
The sinking of the RMS Titanic is one of the most infamous shipwrecks in history. On April 15, 1912, during her maiden voyage, the Titanic sank after colliding with an iceberg, killing 1502 out of 2224 passengers and crew. This sensational tragedy shocked the international community and led to better safety regulations for ships.

One of the reasons that the shipwreck led to such loss of life was that there were not enough lifeboats for the passengers and crew. Although there was some element of luck involved in surviving the sinking, some groups of people were more likely to survive than others, such as women, children, and the upper-class.

In this challenge, we ask you to complete the analysis of what sorts of people were likely to survive. In particular, we ask you to apply the tools of machine learning to predict which passengers survived the tragedy.

In [None]:
#importing packages
%matplotlib inline
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Read the training split into a DataFrame and preview its first rows.
train_csv = '/kaggle/input/titanic/train.csv'
dataset = pd.read_csv(train_csv)
dataset.head()

In [None]:
# NOTE(review): `Dataset` (capital D) differs from the `dataset` DataFrame only by case
# and is not referenced anywhere else in the visible notebook -- confirm whether it is needed.
Dataset = "titanic"

In [None]:
# (rows, columns) of the training frame as loaded.
dataset.shape

In [None]:
# Visualise missing values: the heatmap is drawn over the boolean null mask,
# so columns with gaps stand out.
train_null_mask = dataset.isnull()
sns.heatmap(train_null_mask)

In [None]:
# Count of passengers per value of the Survived column.
sns.countplot(data=dataset, x='Survived')

In [None]:
#number of people that survived (and did not), split by sex via the hue argument.
sns.countplot(x='Survived',data=dataset,hue="Sex")

In [None]:
# Survival counts broken down by passenger class (Pclass drives the bar colour).
sns.countplot(data=dataset, x='Survived', hue='Pclass')

In [None]:
# Age distribution per passenger class, drawn on an explicitly created 12x7 axes.
fig, ax = plt.subplots(figsize=(12, 7))
sns.boxplot(data=dataset, x='Pclass', y='Age', palette='winter', ax=ax)

In [None]:
#missing value imputation on age and Pclass
def impute_age(cols):
  """Return the age for one passenger row, filling it in when it is missing.

  Parameters
  ----------
  cols : pandas.Series or sequence
      A row whose first item is the age and whose second item is the
      passenger class (the notebook passes rows of ``['Age', 'Pclass']``).

  Returns
  -------
  The original age when it is not null; otherwise 37 for class 1,
  29 for class 2 and 24 for any other class.
  """
  # Unpack by position through iteration rather than ``cols[0]`` / ``cols[1]``:
  # integer keys on a label-indexed Series are deprecated in pandas 2.x.
  # Iteration works the same for Series, lists and tuples.
  age, pclass = tuple(cols)[:2]

  if not pd.isnull(age):
    return age

  # NOTE(review): these fill values look like typical ages per class read off
  # the Age-by-Pclass boxplot -- confirm their origin.
  if pclass == 1:
    return 37
  if pclass == 2:
    return 29
  return 24

In [None]:
# Fill missing ages row by row using the class-based lookup in impute_age.
age_and_class = dataset[['Age','Pclass']]
dataset['Age'] = age_and_class.apply(impute_age, axis=1)

In [None]:
# Drop the Cabin column from the training frame.
# Rebinding (instead of inplace=True) with errors='ignore' keeps this cell
# re-runnable: a second execution no longer raises KeyError for the missing column.
dataset = dataset.drop(columns='Cabin', errors='ignore')

In [None]:
# Number of rows per distinct Embarked value (rows with a null Embarked are not counted by groupby).
dataset.groupby('Embarked').size()

In [None]:
# Impute missing Embarked values with the most frequent value in the column.
# The value is computed rather than hard-coded so it stays correct if the data changes;
# mode() ignores nulls and .iloc[0] picks the first entry when there are ties.
common_value = dataset['Embarked'].mode().iloc[0]
dataset['Embarked'] = dataset['Embarked'].fillna(common_value)
# dataset.info() is called in the following cell, so it is not repeated here.

In [None]:
# Column dtypes and non-null counts, to confirm the state of the frame after imputation.
dataset.info()

In [None]:
# Preview the first rows of the training frame before the categorical columns are encoded.
dataset.head()

In [None]:
# One-hot encode Sex and Embarked; drop_first=True omits the first level of each
# so the remaining indicator columns are not redundant.
sex, embark = (
    pd.get_dummies(dataset[column], drop_first=True)
    for column in ('Sex', 'Embarked')
)

In [None]:
# Remove the identifier/text columns and the raw categoricals that were just encoded.
# Non-in-place drop with errors='ignore' makes the cell safe to re-run
# (the original raised KeyError on a second execution).
dataset = dataset.drop(
    columns=['PassengerId', 'Sex', 'Embarked', 'Name', 'Ticket'],
    errors='ignore',
)

In [None]:
# Append the indicator columns for Sex and Embarked alongside the remaining features.
dataset = pd.concat([dataset, sex, embark], axis='columns')

In [None]:
# Split the prepared frame into the target column and the feature matrix.
y_train = dataset['Survived']
X_train = dataset.drop(columns=['Survived'])

In [None]:
#USING RANDOM FOREST CLASSIFIER
from sklearn.ensemble import RandomForestClassifier
# A fixed random_state makes the fitted forest (and every prediction derived
# from it) reproducible across "Restart & Run All".
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train,y_train)
# Predictions on the same rows the model was fitted on (used for the training accuracy).
y_pred_rf=rf.predict(X_train)

In [None]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score

# This is accuracy on the rows the model was trained on, so it is an
# optimistic figure and not an estimate of performance on unseen passengers.
print('Accuracy',metrics.accuracy_score(y_train,y_pred_rf))

# 5-fold cross-validation refits a clone of the estimator on each split and
# scores it on the held-out fold, giving a more realistic accuracy estimate.
cv_scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='accuracy')
print('Cross-validated accuracy', cv_scores.mean())

**Test Data**

In [None]:
#importing test data (test.csv, not the training split)
data = pd.read_csv('/kaggle/input/titanic/test.csv')
data.shape

In [None]:
# Keep the passenger ids aside: the PassengerId column is dropped from `data`
# further down, but the ids are needed again to build the final result frame.
passengerID = data['PassengerId']

In [None]:
# Column dtypes and non-null counts of the test frame.
data.info()

In [None]:
# Heatmap of the null mask to see which test columns contain gaps.
test_null_mask = data.isnull()
sns.heatmap(test_null_mask)

In [None]:
#missing value imputation on age and Pclass
def impute_age(cols):
  """Return the age for one passenger row, filling it in when it is missing.

  This re-declares the function with the same name defined in the training
  section; the logic is kept identical so both data sets are imputed the same way.

  Parameters
  ----------
  cols : pandas.Series or sequence
      A row whose first item is the age and whose second item is the
      passenger class (the notebook passes rows of ``['Age', 'Pclass']``).

  Returns
  -------
  The original age when it is not null; otherwise 37 for class 1,
  29 for class 2 and 24 for any other class.
  """
  # Unpack by position through iteration rather than ``cols[0]`` / ``cols[1]``:
  # integer keys on a label-indexed Series are deprecated in pandas 2.x.
  # Iteration works the same for Series, lists and tuples.
  age, pclass = tuple(cols)[:2]

  if not pd.isnull(age):
    return age

  # NOTE(review): these fill values look like typical ages per class taken
  # from the training data -- confirm their origin.
  if pclass == 1:
    return 37
  if pclass == 2:
    return 29
  return 24

In [None]:
# Fill missing ages in the test frame with the same class-based lookup used for training.
test_age_and_class = data[['Age','Pclass']]
data['Age'] = test_age_and_class.apply(impute_age, axis=1)

In [None]:
# Re-draw the null-mask heatmap to check what is still missing after the Age imputation.
sns.heatmap(data.isnull())

In [None]:
# Drop the Cabin column from the test frame, mirroring the training preprocessing.
# Rebinding with errors='ignore' keeps the cell re-runnable
# (the in-place version raised KeyError on a second execution).
data = data.drop(columns='Cabin', errors='ignore')

In [None]:
# Age was already imputed for the test frame in an earlier cell, so re-applying
# impute_age here was a no-op and has been removed.
# Forward-fill whatever gaps remain. DataFrame.ffill() replaces the deprecated
# fillna(method='ffill') spelling and gives the same result.
# NOTE(review): forward-fill copies the previous row's value and cannot fill a
# gap in the very first row -- confirm that is acceptable for the remaining nulls.
data = data.ffill()

In [None]:
# One-hot encode Sex and Embarked for the test frame, dropping the first level of each
# exactly as was done for the training frame.
sex, embark = (
    pd.get_dummies(data[column], drop_first=True)
    for column in ('Sex', 'Embarked')
)

In [None]:
# Remove the identifier/text columns and the raw categoricals that were just encoded.
# Non-in-place drop with errors='ignore' makes the cell safe to re-run
# (the original raised KeyError on a second execution).
data = data.drop(
    columns=['PassengerId', 'Sex', 'Embarked', 'Name', 'Ticket'],
    errors='ignore',
)

In [None]:
# Append the Sex and Embarked indicator columns to the remaining test features.
data = pd.concat([data, sex, embark], axis='columns')

In [None]:
# Preview the prepared test features that will be passed to the model.
data.head()

In [None]:
#predicting the people who survived on test data
# Select the test columns by the training feature names so they reach the model
# in exactly the order it was fitted on. A column missing from the test frame
# now fails with an explicit KeyError.
y_pred_test = rf.predict(data[X_train.columns])

In [None]:
# Display the array of predictions returned by rf.predict for the test rows.
y_pred_test

In [None]:
# Pair each saved passenger id with its predicted Survived value.
# The id column is named 'PassengerId' (lower-case "d") to match the column
# name in the source data; the previous 'PassengerID' spelling did not match it.
df = pd.DataFrame({'PassengerId':passengerID, 'Survived':y_pred_test})
df