## **Importing Libraries**

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## **Loading Dataset**

In [None]:
# Read the training split into a DataFrame.
df_train = pd.read_csv(filepath_or_buffer='../input/titanic/train.csv')

### Checking for NaN values using seaborn's heatmap

In [None]:
# Each highlighted cell marks a missing value; row labels are hidden.
sns.heatmap(data=df_train.isna(), cmap='viridis', yticklabels=False)

## Handling missing data
Note - the heatmap shows that most values in the Cabin column are NaN, so we can drop that column. The NaN values in the Age column can be handled by replacing them with average values that take the passenger's Pclass into account.

In [None]:
# Drop the Cabin column from the training frame.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run once Cabin is already gone.
df_train = df_train.drop(columns=['Cabin'], errors='ignore')
# Preview a few rows rather than dumping the whole frame.
df_train.head()

### Plotting a boxplot of the Age column against Pclass

In [None]:
# Age distribution per passenger class in the training data.
sns.boxplot(data=df_train, x='Pclass', y='Age')

## **Replacing NaN values in the Age column with typical values observed from the boxplot for each Pclass (note: a boxplot's centre line marks the median, not the mean)**

In [None]:
# Fill missing ages with one fixed value per passenger class.
# `.at` is a scalar accessor (one row label, one column label); assigning through
# a boolean mask has to go through `.loc`.
AGE_FILL_BY_PCLASS_TRAIN = {1: 37, 2: 28, 3: 23}
for pclass, fill_age in AGE_FILL_BY_PCLASS_TRAIN.items():
    missing_in_class = df_train['Age'].isnull() & (df_train['Pclass'] == pclass)
    df_train.loc[missing_in_class, 'Age'] = fill_age
# Preview a few rows rather than dumping the whole frame.
df_train.head()

## Replacing NaN value of Fare column with average of Fare field data

In [None]:
# Replace missing fares with the column mean (Series.mean skips NaN by default).
# Boolean-mask assignment must use `.loc`; `.at` only addresses a single cell.
df_train.loc[df_train['Fare'].isnull(), 'Fare'] = df_train['Fare'].mean()

## Final Checking for NaN values in dataset

In [None]:
# Re-draw the missing-value map of the training frame after the fills above.
sns.heatmap(data=df_train.isna(), cmap='viridis', yticklabels=False)

## **Handling Categorical Features - Sex, Embarked with get_dummies()**

Sex Categorical Feature Handling

In [None]:
# Preview the one-hot encoding of Sex on the training frame.
# Uses df_train: the name `df` is not assigned until a much later cell, so
# referencing it here fails on a fresh kernel.
pd.get_dummies(df_train['Sex'], drop_first=True).head()

In [None]:
# Prints True if any Sex entry equals 42.
# Uses df_train: the name `df` is not assigned until a much later cell.
# NOTE(review): looks like a leftover debugging probe — Sex presumably holds
# strings, so this comparison should never match; confirm and consider removing.
if (df_train['Sex'] == 42).any():
    print(True)

In [None]:
# Indicator columns for Sex; drop_first omits the first category's column.
sex = pd.get_dummies(data=df_train['Sex'], drop_first=True)
sex

Embarked Categorical feature handling

In [None]:
# Indicator columns for Embarked; drop_first omits the first category's column.
embark = pd.get_dummies(data=df_train['Embarked'], drop_first=True)
embark

In [None]:
# Remove the raw Sex/Embarked columns plus Name, PassengerId and Ticket
# from the training frame, modifying it in place.
df_train.drop(
    columns=['Sex', 'Embarked', 'Name', 'PassengerId', 'Ticket'],
    inplace=True,
)

In [None]:
# Display the training frame as it stands after the column drop.
df_train

In [None]:
# Append the Sex and Embarked indicator columns to the right of the training frame.
df_train = pd.concat(objs=[df_train, sex, embark], axis='columns')
df_train

# **Preparing Testing Data**

In [None]:
# Read the test split into a DataFrame.
df_test = pd.read_csv(filepath_or_buffer='../input/titanic/test.csv')

### Checking for NaN values using seaborn's heatmap

In [None]:
# Each highlighted cell marks a missing value; row labels are hidden.
sns.heatmap(data=df_test.isna(), cmap='viridis', yticklabels=False)

## Handling missing data
Note - the heatmap shows that most values in the Cabin column are NaN, so we can drop that column. The NaN values in the Age column can be handled by replacing them with average values that take the passenger's Pclass into account.

In [None]:
# Drop the Cabin column from the test frame.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run once Cabin is already gone.
df_test = df_test.drop(columns=['Cabin'], errors='ignore')
# Preview a few rows rather than dumping the whole frame.
df_test.head()

In [None]:
# Age distribution per passenger class in the test data.
sns.boxplot(data=df_test, x='Pclass', y='Age')

In [None]:
# Fill missing ages with one fixed value per passenger class.
# `.at` is a scalar accessor (one row label, one column label); assigning through
# a boolean mask has to go through `.loc`.
AGE_FILL_BY_PCLASS_TEST = {1: 42, 2: 25, 3: 22}
for pclass, fill_age in AGE_FILL_BY_PCLASS_TEST.items():
    missing_in_class = df_test['Age'].isnull() & (df_test['Pclass'] == pclass)
    df_test.loc[missing_in_class, 'Age'] = fill_age
# Preview a few rows rather than dumping the whole frame.
df_test.head()

In [None]:
# Replace missing fares with the column mean (Series.mean skips NaN by default).
# Boolean-mask assignment must use `.loc`; `.at` only addresses a single cell.
df_test.loc[df_test['Fare'].isnull(), 'Fare'] = df_test['Fare'].mean()

In [None]:
# Re-draw the missing-value map of the test frame after the fills above.
sns.heatmap(data=df_test.isna(), cmap='viridis', yticklabels=False)

In [None]:
# One-hot encode Sex and Embarked on the test frame (first category dropped),
# then swap the raw columns for the indicator columns.
# Note: `sex` and `embark` are rebound here to the test-set encodings.
sex = pd.get_dummies(df_test['Sex'], drop_first=True)
embark = pd.get_dummies(df_test['Embarked'], drop_first=True)
# Reassign rather than mutate in place so the step reads as a plain transformation.
df_test = df_test.drop(columns=['Sex', 'Embarked', 'Name', 'PassengerId', 'Ticket'])
df_test = pd.concat([df_test, sex, embark], axis=1)

In [None]:
# Display the test frame after encoding and column removal.
df_test

In [None]:
# Load tested.csv; later cells read its Survived and PassengerId columns.
df = pd.read_csv(filepath_or_buffer='../input/test-file/tested.csv')

# **Building Logistic Regression Model**

In [None]:
from sklearn.linear_model import LogisticRegression

# Train on every column except the Survived target.
# fit() returns the estimator, so the call can be chained onto the constructor.
lr = LogisticRegression(max_iter=2000).fit(
    X=df_train.drop(columns=['Survived']),
    y=df_train['Survived'],
)
lr

## Testing 

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Predict on the prepared test features and score against the labels in `df`.
# NOTE(review): assumes the rows of `df` line up one-to-one with df_test — confirm.
y_pred = lr.predict(df_test)

cm = confusion_matrix(df['Survived'], y_pred)
print(cm)

accuracy = accuracy_score(df['Survived'], y_pred)
print(accuracy)

# Two-column answer frame: ids taken from `df`, predictions from the model.
df_ans = pd.DataFrame({
    'PassengerId': df['PassengerId'],
    'Survived': y_pred,
})

In [None]:
# Write the answer frame to disk without the index column.
df_ans.to_csv(path_or_buf='./titanic_test.csv', index=False)