# Data Science Basics - Titanic Dataset - Logistic Regression

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import math

# Load the Titanic CSV (path is relative to the notebook's working directory)
# into the DataFrame that every later cell reads and mutates.
titanic_data = pd.read_csv("data/Titanic.csv")
# Display the first 10 rows as a quick check that the load worked.
titanic_data.head(10)

In [None]:
# Report how many rows the raw data set contains before any cleaning.
print(f"# of passengers in original data: {len(titanic_data)}")

## Analyzing Data

In [None]:
# Number of rows for each value of the Survived column.
sns.countplot(x="Survived", data=titanic_data)

In [None]:
# Same survival counts, with the bars split by the Sex column.
sns.countplot(x="Survived", hue="Sex", data=titanic_data)

In [None]:
# Same survival counts, with the bars split by the Pclass column.
sns.countplot(x="Survived", hue="Pclass", data=titanic_data)

In [None]:
# Histogram of the Age column using the default binning.
titanic_data["Age"].plot.hist()

In [None]:
# Histogram of the Fare column using the default binning.
titanic_data["Fare"].plot.hist()

In [None]:
# Fare again, with 20 bins and a wider figure for a more detailed view.
titanic_data["Fare"].plot.hist(bins=20, figsize=(10, 5))

In [None]:
# Per-column summary of the frame (dtypes and non-null counts).
titanic_data.info()

In [None]:
# Number of rows for each value of the SibSp column.
sns.countplot(x="SibSp", data=titanic_data)

In [None]:
# Number of rows for each value of the Parch column.
sns.countplot(x="Parch", data=titanic_data)

## Data Wrangling

In [None]:
# Boolean mask of the whole frame: True wherever a value is missing.
titanic_data.isnull()

In [None]:
# Number of missing values in each column.
titanic_data.isnull().sum()

In [None]:
# Draw the missing-value mask as a heatmap; row tick labels are hidden.
sns.heatmap(titanic_data.isnull(), yticklabels=False)

In [None]:
# Same heatmap drawn with the "viridis" colormap.
sns.heatmap(titanic_data.isnull(), yticklabels=False, cmap="viridis")

In [None]:
# Box plot of Age grouped by Pclass.
sns.boxplot(x="Pclass", y="Age", data=titanic_data)

In [None]:
# Preview the frame before any columns are removed.
titanic_data.head(5)

In [None]:
# Remove the Cabin column in place.
# NOTE(review): presumably dropped because most of its values are missing;
# confirm against the isnull().sum() output.
titanic_data.drop("Cabin", axis=1, inplace=True)

In [None]:
# Preview again to confirm that Cabin is gone.
titanic_data.head(5)

In [None]:
# Drop, in place, every remaining row that has at least one missing value.
titanic_data.dropna(inplace=True)

In [None]:
# Re-draw the mask; after dropna it should show no missing cells.
sns.heatmap(titanic_data.isnull(), yticklabels=False, cmap="viridis")

In [None]:
# Per-column missing counts; all should be zero after dropna.
titanic_data.isnull().sum()

## Convert String Values

In [None]:
# Look at the first two rows before encoding the categorical columns.
titanic_data.head(2)

In [None]:
# One-hot encode Sex: one indicator column per distinct value (display only,
# the result is not stored).
pd.get_dummies(titanic_data["Sex"])

## One column is enough to identify sex, so drop the other column.

In [None]:
# drop_first=True omits the indicator column of the first category (display
# only, the result is not stored).
pd.get_dummies(titanic_data["Sex"], drop_first=True)

In [None]:
# Keep the encoded Sex indicator so it can be concatenated onto the main
# frame in a later cell.
sex = pd.get_dummies(titanic_data["Sex"], drop_first=True)
sex.head(5)

## Doing the same with Embarked now. Dropping the column that is not required.

In [None]:
# One-hot encode Embarked with every category kept, to inspect the columns.
embark = pd.get_dummies(titanic_data["Embarked"])
embark.head(5)

In [None]:
# Re-create the encoding without the first category's column; this version
# overwrites the one above and is the one used in later cells.
embark = pd.get_dummies(titanic_data["Embarked"], drop_first=True)
embark.head(5)

## Doing the same for passenger class as well. Dropping unnecessary values and columns for better analysis.

In [None]:
# One-hot encode Pclass with every category kept, to inspect the columns.
# The indicator columns are named after the raw Pclass values.
pclass = pd.get_dummies(titanic_data["Pclass"])
pclass.head(5)

In [None]:
# One-hot encode Pclass, omitting the first category as the baseline.
# The prefix gives the indicator columns string names (e.g. "Pclass_2")
# instead of the raw class values. Without it, concatenating these columns
# onto the main frame produces a mix of string and non-string column names,
# which recent scikit-learn versions refuse to fit on.
pclass = pd.get_dummies(titanic_data["Pclass"], prefix="Pclass", drop_first=True)
pclass.head(5)

## Concatenate the new data to original table

In [None]:
# Append the encoded Sex, Embarked and Pclass indicator columns to the main
# frame, side by side (rows are aligned on the index).
titanic_data = pd.concat(
    [titanic_data, sex, embark, pclass],
    axis="columns",
)

In [None]:
# Preview the frame with the new indicator columns appended.
titanic_data.head(5)

## Dropping unnecessary columns, such as Sex, PassengerId, Embarked, etc.

In [None]:
# Remove, in place, the columns that are not used as model features and the
# categorical columns that have been replaced by indicator columns.
# Pclass is included because it was one-hot encoded like Sex and Embarked;
# keeping the raw column next to its own indicators would make the feature
# matrix exactly collinear.
titanic_data.drop(
    ["PassengerId", "Sex", "Embarked", "Name", "Ticket", "Pclass"],
    axis=1,
    inplace=True,
)

In [None]:
# Preview the final feature frame (first five rows by default).
titanic_data.head()

## Train Data

### X holds the features used to predict survival, so we drop the Survived column and store all remaining values in x.
### Y is the outcome (the Survived column).

In [None]:
# Target: the Survived column. Features: every other column in the frame.
y = titanic_data["Survived"]
x = titanic_data.drop(columns="Survived")

In [None]:
from sklearn.model_selection import train_test_split

### Using the X and Y values above to build a prediction model with logistic regression

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression using the "lbfgs" solver, capped at 400 iterations.
# NOTE(review): max_iter is presumably raised to avoid convergence warnings
# on unscaled features; confirm.
logmodel = LogisticRegression(solver="lbfgs", max_iter=400)

In [None]:
# Fit the model on the training split.
logmodel.fit(x_train, y_train)

In [None]:
# Predicted Survived labels for the held-out test rows.
prediction = logmodel.predict(x_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# classification_report returns a plain multi-line string. Print it so the
# table renders with real line breaks; as a bare cell expression it is shown
# as a quoted repr with literal "\n" sequences.
print(classification_report(y_test, prediction))

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix of true labels (rows) against predicted labels (columns).
confusion_matrix(y_test, prediction)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_test, prediction)