#### Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Apply seaborn's default plot theme. (The original `sns,set()` used a comma,
# which only built a throwaway tuple and never called seaborn.)
sns.set()

#### Importing Dataset

In [None]:
# Read the Titanic training and test CSV files into DataFrames.
train_data = pd.read_csv("../input/titanic/train.csv")
test_data = pd.read_csv("../input/titanic/test.csv")

### Performing Exploratory Data Analysis

In [None]:
# Preview the first rows of the training DataFrame.
train_data.head()

In [None]:
# (number of rows, number of columns) of the training DataFrame.
train_data.shape

In [None]:
# Column labels of the training DataFrame.
train_data.columns

In [None]:
# Print pandas' summary of the training DataFrame.
train_data.info()

#### Checking the number of null entries in each column of the dataset

In [None]:
# Count the missing values in each column.
train_data.isnull().sum()

#### Visualization of various relationships between variables

In [None]:
# Count of passengers for each value of 'Survived'.
sns.countplot(x='Survived', data=train_data)

In [None]:
# Same count per 'Survived' value, split into one bar per 'Sex'.
sns.countplot(x='Survived',hue='Sex',data=train_data)

In [None]:
# Count of passengers per 'Pclass', coloured by 'Survived'.
sns.catplot(x='Pclass',hue='Survived',kind='count',data=train_data)

In [None]:
# Age distribution per passenger class; this motivates the per-class median
# imputation of missing ages.
# The stray `plt.subplot(3,2,6)` was removed: it placed this single plot in the
# last cell of a 3x2 grid, leaving most of the 10x10 figure blank.
plt.figure(figsize=(10, 10))
sns.boxplot(x='Pclass', y='Age', data=train_data)

#### Replacing null values in Age column using function


In [None]:
def add_age(cols, reference=None):
    """Return a passenger's age, imputing a missing one from the class median.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two values in the order ``(Age, Pclass)``, e.g. one row of
        ``df[['Age', 'Pclass']]`` as handed over by
        ``DataFrame.apply(add_age, axis=1)``.
    reference : pandas.DataFrame, optional
        Frame with ``Pclass`` and ``Age`` columns from which the class median
        is taken. Defaults to the notebook-level ``train_data``.

    Returns
    -------
    float
        ``Age`` unchanged when it is present; otherwise the median ``Age`` of
        the rows in ``reference`` whose ``Pclass`` equals this passenger's
        class (NaN when no such rows exist).
    """
    # Use explicit positional access for Series rows: plain `cols[0]` on a
    # label-indexed Series relies on a deprecated integer fallback.
    if isinstance(cols, pd.Series):
        age, pclass = cols.iloc[0], cols.iloc[1]
    else:
        age, pclass = cols[0], cols[1]

    if not pd.isnull(age):
        return age

    if reference is None:
        reference = train_data
    # One lookup for any class value replaces the three duplicated branches.
    return reference.loc[reference['Pclass'] == pclass, 'Age'].median()
        

In [None]:
# Fill each missing age with the median age of the passenger's class.
# Vectorised equivalent of the row-wise apply(add_age, axis=1): the per-class
# median is computed once per class instead of once per missing row.
train_data['Age'] = train_data['Age'].fillna(
    train_data.groupby('Pclass')['Age'].transform('median')
)

#### Convert sex and embarked columns to numerical values

In [None]:
# Encode the categorical columns as integers.
# Missing 'Embarked' values are real NaN, not the string 'nan', so the old
# {'nan': 'NaN'} entry never matched (and would have injected a string if it
# had). Series.map already leaves missing values as NaN.
train_data['Sex'] = train_data['Sex'].map({'female': 0, 'male': 1})
train_data['Embarked'] = train_data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

#### Dropping Null Data

In [None]:
# Remove the 'Cabin' column from train_data in place.
train_data.drop('Cabin',axis=1,inplace=True)

Removing rows with null values

In [None]:
# Drop, in place, every remaining row that still contains a missing value.
train_data.dropna(inplace=True)

In [None]:
# Remove the 'Name', 'PassengerId' and 'Ticket' columns in place so they do not
# end up in the feature matrix.
train_data.drop(['Name', 'PassengerId', 'Ticket'], axis = 1, inplace = True)

#### Feature scaling (min-max normalisation of Age and Fare)

In [None]:
# Extremes of the training columns, used for min-max scaling.
# Series.min()/.max() are vectorised and skip NaN, unlike the Python builtins,
# which iterate element by element and give order-dependent results with NaN.
min_age = train_data['Age'].min()
max_age = train_data['Age'].max()
min_fare = train_data['Fare'].min()
max_fare = train_data['Fare'].max()

In [None]:
# Min-max scale 'Age' and 'Fare' using the extremes computed above.
train_data['Age'] = train_data['Age'].sub(min_age).div(max_age - min_age)
train_data['Fare'] = train_data['Fare'].sub(min_fare).div(max_fare - min_fare)

#### Print the finalised data

In [None]:
# Preview the first rows of the prepared training DataFrame.
train_data.head()

#### Split the data set into x and y data

In [None]:
# Separate the target column from the feature matrix.
y_data = train_data['Survived']
x_data = train_data.drop(columns=['Survived'])

#### Split the data set into training data and test data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for evaluation, stratified on the target, with a
# fixed random seed.
x_training_data, x_test_data, y_training_data, y_test_data = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=0,
    stratify=y_data,
)

#### Create the model

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic-regression classifier with the library's default hyperparameters.
model = LogisticRegression()

#### Train the model and create predictions

In [None]:
# Fit on the training split, then predict labels for the held-out split.
# fit() returns the estimator itself, so the two calls can be chained.
predictions = model.fit(x_training_data, y_training_data).predict(x_test_data)

#### Let’s see how accurate our model’s predictions are:

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Classification report for the held-out split.
print(classification_report(y_true=y_test_data, y_pred=predictions))

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy on the held-out split.
print("Accuracy : ", accuracy_score(y_true=y_test_data, y_pred=predictions))

#### Let’s see the confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true labels vs. predictions on the held-out split.
confusion_matrix(y_test_data, predictions)


### confusion matrix using seaborn

In [None]:
# Keep the confusion matrix in a variable so it can be plotted.
cf_matrix=confusion_matrix(y_test_data, predictions)


In [None]:
import seaborn as sns

# Annotated heatmap of the confusion matrix.
# fmt='d' prints the integer counts as-is; the default annotation format
# ('.2g') shows counts of 100 or more in scientific notation (e.g. 1e+02).
sns.heatmap(cf_matrix, annot=True, fmt='d')

#### Cleaning test dataset

In [None]:
# Preview the first rows of the test DataFrame.
test_data.head()

In [None]:
# Count the missing values in each column of the test DataFrame.
test_data.isnull().sum()

In [None]:
# Fill missing test ages with the per-class median age of the RAW training data.
# By this point train_data['Age'] has already been min-max scaled to [0, 1], so
# add_age (which reads train_data) would fill the gaps with scaled medians
# (~0.3) next to real ages in years. Re-read the unscaled training file instead.
raw_train_ages = pd.read_csv("../input/titanic/train.csv", usecols=['Pclass', 'Age'])
class_median_age = raw_train_ages.groupby('Pclass')['Age'].median()
test_data['Age'] = test_data['Age'].fillna(test_data['Pclass'].map(class_median_age))

In [None]:
# Replace missing fares in the test set with the test set's median fare.
test_data['Fare'] = test_data['Fare'].fillna(value=test_data['Fare'].median())

In [None]:
# Encode the categorical columns with the same integer codes used for training.
# Missing values are real NaN, not the string 'nan', so the old {'nan': 'NaN'}
# entry never matched (and would have injected a string if it had).
test_data['Sex'] = test_data['Sex'].map({'female': 0, 'male': 1})
test_data['Embarked'] = test_data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

In [None]:
# Extremes of the test columns.
# Series.min()/.max() are vectorised and skip NaN, unlike the Python builtins,
# which iterate element by element and give order-dependent results with NaN.
min_age1 = test_data['Age'].min()
max_age1 = test_data['Age'].max()
min_fare1 = test_data['Fare'].min()
max_fare1 = test_data['Fare'].max()

In [None]:
# Scale the test features with the TRAINING extremes (min_age, max_age,
# min_fare, max_fare) so they are on the same scale the model was fitted on.
# Scaling with the test set's own min/max would map the same raw age or fare
# to a different model input than it had during training.
test_data['Age'] = (test_data['Age'] - min_age) / (max_age - min_age)
test_data['Fare'] = (test_data['Fare'] - min_fare) / (max_fare - min_fare)

In [None]:
# Remove, in place, the same columns that were dropped from the training data.
test_data.drop(['Cabin','PassengerId','Name','Ticket'],axis=1,inplace=True)

In [None]:
# Re-count missing values per column after cleaning.
test_data.isnull().sum()

In [None]:
# Preview the first rows of the prepared test DataFrame.
test_data.head()

#### Prediction

In [None]:
# Predict labels for the prepared test features with the fitted model.
prediction=model.predict(test_data)

In [None]:
# Re-read the raw test CSV: 'PassengerId' was dropped from test_data but is
# needed for the submission file.
test=pd.read_csv("../input/titanic/test.csv")

In [None]:
# Display the predicted labels.
prediction

In [None]:
# Pair each PassengerId from the raw test file with its predicted label and
# write the two columns to 'submission.csv' without the index.
submission = test[["PassengerId"]].assign(Survived=prediction)
submission.to_csv('submission.csv', index=False)