In [206]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [207]:
# Load the data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train = pd.concat([train, test], ignore_index=True)
test_passenger_ids = test.PassengerId

In [208]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [209]:
#structural checks
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB


In [210]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB


In [211]:
train.isnull().sum()

Unnamed: 0,0
PassengerId,0
Survived,418
Pclass,0
Name,0
Sex,0
Age,263
SibSp,0
Parch,0
Ticket,0
Fare,1


In [212]:
train.isnull().sum()

Unnamed: 0,0
PassengerId,0
Survived,418
Pclass,0
Name,0
Sex,0
Age,263
SibSp,0
Parch,0
Ticket,0
Fare,1


In [213]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,1309.0,891.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,0.383838,2.294882,29.881138,0.498854,0.385027,33.295479
std,378.020061,0.486592,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,0.0,2.0,21.0,0.0,0.0,7.8958
50%,655.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,1.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,1.0,3.0,80.0,8.0,9.0,512.3292


In [214]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,1309.0,891.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,0.383838,2.294882,29.881138,0.498854,0.385027,33.295479
std,378.020061,0.486592,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,0.0,2.0,21.0,0.0,0.0,7.8958
50%,655.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,1.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,1.0,3.0,80.0,8.0,9.0,512.3292


In [215]:
#Handle missing values
# Fill Embarked with mode
# Fill Age using median / grouped median

In [216]:
train['Embarked'].value_counts()

Unnamed: 0_level_0,count
Embarked,Unnamed: 1_level_1
S,914
C,270
Q,123


In [217]:
#Since only 2 values are missing, we fill them with the mode.
train['Embarked'].fillna('S', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['Embarked'].fillna('S', inplace=True)


In [218]:
train['Embarked'].isnull().sum()

np.int64(0)

In [219]:
# fill Age smartly using:

# Median

# Or grouped median (by Sex & Pclass)

In [220]:
train.groupby(['Sex', 'Pclass'])['Age'].median()

Unnamed: 0_level_0,Unnamed: 1_level_0,Age
Sex,Pclass,Unnamed: 2_level_1
female,1,36.0
female,2,28.0
female,3,22.0
male,1,42.0
male,2,29.5
male,3,25.0


In [221]:
def fill_age(row):
  if pd.isnull(row.Age):
    return train.groupby(['Sex', 'Pclass']).Age.median()[row.Sex, row.Pclass]
  else:
    return row.Age

In [222]:
train['Age'] = train.apply(fill_age, axis=1)

In [223]:
train.Age.isnull().sum()

np.int64(0)

In [224]:
# Feature Engineering

# We will now:

# Extract Title from Name

# Create FamilySize

# Drop unnecessary columns

# Encode categorical features

In [225]:
train['Title'] = train['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

  train['Title'] = train['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)


In [226]:
train['Title'].value_counts()

Unnamed: 0_level_0,count
Title,Unnamed: 1_level_1
Mr,757
Miss,260
Mrs,197
Master,61
Rev,8
Dr,8
Col,4
Major,2
Mlle,2
Ms,2


OBSERVATION

Common titles → Mr, Miss, Mrs, Master

Rare titles → Dr, Rev, Col, Major, Lady, Sir, etc. (very few rows)

combine rare titles into a single category: Rare

Why this is done

Reduces noise

Improves model stability

Prevents overfitting

In [227]:
rare_list = train['Title'].unique()[4:]
print(rare_list)

['Don' 'Rev' 'Dr' 'Mme' 'Ms' 'Major' 'Lady' 'Sir' 'Mlle' 'Col' 'Capt'
 'Countess' 'Jonkheer' 'Dona']


In [228]:
train['Title'].replace(rare_list, 'Rare', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['Title'].replace(rare_list, 'Rare', inplace=True)


In [229]:
train.Title.value_counts()

Unnamed: 0_level_0,count
Title,Unnamed: 1_level_1
Mr,757
Miss,260
Mrs,197
Master,61
Rare,34


In [230]:
# Encode Title
title_map = {'Mr':1, 'Miss':2, 'Mrs':3, 'Master':4, 'Rare':5}
train.Title = train.Title.map(title_map)

In [231]:
train.drop('Name', axis=1, inplace=True)

In [232]:
train.Title.value_counts()

Unnamed: 0_level_0,count
Title,Unnamed: 1_level_1
1,757
2,260
3,197
4,61
5,34


In [233]:
train.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0.0,3,male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1.0,1,female,38.0,1,0,PC 17599,71.2833,C85,C,3


In [234]:
# Passengers with family had higher survival chances
train['FamilySize'] = train['SibSp'] + train['Parch'] + 1

In [235]:
train.drop(['PassengerId', 'Ticket', 'Cabin'], axis=1, inplace=True)

In [236]:
train.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamilySize
0,0.0,3,male,22.0,1,0,7.25,S,1,2
1,1.0,1,female,38.0,1,0,71.2833,C,3,2


In [237]:
# Encode categorical columns
sex_map = {'male':0, 'female':1}
embarked_map = {'S':0, 'C':1, 'Q':2}
train.Sex = train.Sex.map(sex_map)
train.Embarked = train.Embarked.map(embarked_map)

In [238]:
train.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamilySize
0,0.0,3,0,22.0,1,0,7.25,0,1,2
1,1.0,1,1,38.0,1,0,71.2833,1,3,2


In [239]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    float64
 1   Pclass      1309 non-null   int64  
 2   Sex         1309 non-null   int64  
 3   Age         1309 non-null   float64
 4   SibSp       1309 non-null   int64  
 5   Parch       1309 non-null   int64  
 6   Fare        1308 non-null   float64
 7   Embarked    1309 non-null   int64  
 8   Title       1309 non-null   int64  
 9   FamilySize  1309 non-null   int64  
dtypes: float64(3), int64(7)
memory usage: 102.4 KB


In [240]:
test = train[train.Survived.isnull()]
train = train[train.Survived.notnull()]

In [241]:
print(test.shape)
print(train.shape)

(418, 10)
(891, 10)


successfully done:

Data cleaning

Smart missing value handling

Feature engineering

Encoding

Next step = build the first ML model (Logistic Regression)

In [242]:
# Step 1: Split features & target
x = train.drop('Survived', axis=1)
y = train.Survived

In [243]:
# Step 11️⃣ Train–test split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [244]:
# Step 12️⃣ Train first model (Logistic Regression)
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)


In [245]:
# Step 13️⃣ Evaluate model
from sklearn.metrics import accuracy_score
y_pred = model.predict(x_test)
accuracy_score(y_test, y_pred)

0.7821229050279329

In [246]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[86, 19],
       [20, 54]])

In [247]:
test['PassengerId'] = test_passenger_ids

In [248]:
X_test = test.drop(['PassengerId', 'Survived'], axis=1, errors='ignore')

In [250]:
X_test.isnull().sum()

Unnamed: 0,0
Pclass,0
Sex,0
Age,0
SibSp,0
Parch,0
Fare,1
Embarked,0
Title,0
FamilySize,0


In [253]:
X_test['Fare'].fillna(X_test['Fare'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test['Fare'].fillna(X_test['Fare'].mean(), inplace=True)


In [254]:
test_predictions = model.predict(X_test)


In [255]:
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': test_predictions
})

submission.to_csv('submission.csv', index=False)