<a href="https://colab.research.google.com/github/rawbil/models_V1/blob/main/titanic_model/csvs/model1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predict Survival in the Titanic

In [70]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import warnings
warnings.filterwarnings('ignore') # Ignore warnings and focus on main output

In [71]:
df = pd.read_csv('Titanic-Dataset.csv')
df.info()
df
# NULL ROWS
"""
- Age
- Cabin
- Embarked
"""

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


'\n- Age\n- Cabin\n- Embarked\n'

In [72]:
# Check for empty rows
df.isnull().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0


In [73]:
# Remove NULL age and Embarked rows
df.dropna(subset=["Age", "Embarked"], inplace=True)

# Replace NULL Cabin values with "Unknown"
df.fillna({"Cabin": "Unknown"}, inplace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Survived     712 non-null    int64  
 2   Pclass       712 non-null    int64  
 3   Name         712 non-null    object 
 4   Sex          712 non-null    object 
 5   Age          712 non-null    float64
 6   SibSp        712 non-null    int64  
 7   Parch        712 non-null    int64  
 8   Ticket       712 non-null    object 
 9   Fare         712 non-null    float64
 10  Cabin        712 non-null    object 
 11  Embarked     712 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 72.3+ KB


In [74]:
# Look for duplicate values
df.duplicated().sum()
df.drop_duplicates(inplace=True)
df.shape
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,Unknown,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,Unknown,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,Unknown,S
...,...,...,...,...,...,...,...,...,...,...,...,...
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,Unknown,Q
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,Unknown,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [75]:
# Re-arrange Columns
new_cols = ['PassengerId', "Name", "Ticket", "Pclass", "Cabin", "Embarked", "SibSp", "Parch", "Age", "Fare", "Sex", "Survived"]
df = df[new_cols]
df

Unnamed: 0,PassengerId,Name,Ticket,Pclass,Cabin,Embarked,SibSp,Parch,Age,Fare,Sex,Survived
0,1,"Braund, Mr. Owen Harris",A/5 21171,3,Unknown,S,1,0,22.0,7.2500,male,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",PC 17599,1,C85,C,1,0,38.0,71.2833,female,1
2,3,"Heikkinen, Miss. Laina",STON/O2. 3101282,3,Unknown,S,0,0,26.0,7.9250,female,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",113803,1,C123,S,1,0,35.0,53.1000,female,1
4,5,"Allen, Mr. William Henry",373450,3,Unknown,S,0,0,35.0,8.0500,male,0
...,...,...,...,...,...,...,...,...,...,...,...,...
885,886,"Rice, Mrs. William (Margaret Norton)",382652,3,Unknown,Q,0,5,39.0,29.1250,female,0
886,887,"Montvila, Rev. Juozas",211536,2,Unknown,S,0,0,27.0,13.0000,male,0
887,888,"Graham, Miss. Margaret Edith",112053,1,B42,S,0,0,19.0,30.0000,female,1
889,890,"Behr, Mr. Karl Howell",111369,1,C148,C,0,0,26.0,30.0000,male,1


In [78]:
# Convert Sex Col to Integer
df["Sex"] = pd.get_dummies(df["Sex"], drop_first=True, dtype=int)
df
# 1 - Male
# 0 - Female

Unnamed: 0,PassengerId,Name,Ticket,Pclass,Cabin,Embarked,SibSp,Parch,Age,Fare,Sex,Survived
0,1,"Braund, Mr. Owen Harris",A/5 21171,3,Unknown,S,1,0,22.0,7.2500,1,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",PC 17599,1,C85,C,1,0,38.0,71.2833,0,1
2,3,"Heikkinen, Miss. Laina",STON/O2. 3101282,3,Unknown,S,0,0,26.0,7.9250,0,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",113803,1,C123,S,1,0,35.0,53.1000,0,1
4,5,"Allen, Mr. William Henry",373450,3,Unknown,S,0,0,35.0,8.0500,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
885,886,"Rice, Mrs. William (Margaret Norton)",382652,3,Unknown,Q,0,5,39.0,29.1250,0,0
886,887,"Montvila, Rev. Juozas",211536,2,Unknown,S,0,0,27.0,13.0000,1,0
887,888,"Graham, Miss. Margaret Edith",112053,1,B42,S,0,0,19.0,30.0000,0,1
889,890,"Behr, Mr. Karl Howell",111369,1,C148,C,0,0,26.0,30.0000,1,1
