In [2]:
!pip install kaggle



In [9]:
# Make the Kaggle API token file readable and writable by its owner only (mode 600).
!chmod 600 ~/.kaggle/kaggle.json

In [10]:
import kaggle

In [17]:
# Download the dataset archive into the current directory (path='.') and unzip it there.
kaggle.api.dataset_download_files('ashishkumarjayswal/titanic-datasets', path='.', unzip=True)

Dataset URL: https://www.kaggle.com/datasets/ashishkumarjayswal/titanic-datasets


In [13]:
import pandas as pd

In [18]:
# The download cell extracts the archive into the current working directory (path='.'),
# so read the CSV relative to it instead of through a Colab-only absolute path.
df = pd.read_csv('titanic.csv')

In [19]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [24]:
df.shape

(418, 12)

In [20]:
df.duplicated().sum()

0

In [21]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [27]:
# Mean of the Age column; used below as the fill value for the missing ages.
mean_age = df['Age'].mean()

In [28]:
print(mean_age)

30.272590361445783


In [29]:
# Assign the filled Series back to the column. Calling fillna(..., inplace=True) on the
# df['Age'] selection is chained assignment: newer pandas versions warn about it, and
# under Copy-on-Write it no longer updates df at all.
df['Age'] = df['Age'].fillna(mean_age)

In [30]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [31]:
# Drop the columns by rebinding df rather than mutating it in place;
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
df = df.drop(columns=['Cabin', 'Ticket'], errors='ignore')

In [33]:
from sklearn.impute import SimpleImputer

In [34]:
# Imputer configured with the 'mean' strategy; it is fitted on the Fare column below.
imputer = SimpleImputer(strategy='mean')

In [35]:
# Fit the imputer on Fare and overwrite the column with the imputed values.
# The double brackets pass a one-column DataFrame (2-D input) to fit_transform.
df['Fare'] = imputer.fit_transform(df[['Fare']])

I decided to fill the missing values in the Age column with the mean, because 86 missing rows are a significant share of a 418-row DataFrame and dropping them would lose too much data. I dropped the Cabin and Ticket columns as not meaningful for this analysis. It would also have been possible to drop the single row with the missing Fare value, but I wanted to use SimpleImputer.

In [36]:
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [37]:
# Family size: the passenger themself (1) plus the SibSp and Parch counts.
df['FamSize'] = 1 + df['SibSp'] + df['Parch']

In [38]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamSize
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,Q,1
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0,S,2
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,Q,1
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625,S,1
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875,S,3


In [41]:
import re

In [39]:
def extract_title(name):
    """Return the title found in a passenger name, or "" if there is none.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g. "Kelly, Mr. James" -> "Mr".

    Args:
        name: Passenger name. Values that are not strings (such as the NaN
            pandas uses for a missing cell) are treated as having no title.

    Returns:
        The matched title without the trailing period, or "" when nothing
        matches.
    """
    if not isinstance(name, str):
        return ""
    # Raw string: a plain '\.' is an invalid escape sequence that newer
    # Python versions warn about. The pattern itself is unchanged.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    return title_search.group(1) if title_search else ""

In [42]:
# Derive a Title column by running extract_title over every passenger name.
df['Title'] = df['Name'].map(extract_title)

In [43]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamSize,Title
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,Q,1,Mr
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0,S,2,Mrs
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,Q,1,Mr
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625,S,1,Mr
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875,S,3,Mrs


In [44]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [45]:
# One-hot encode the categorical columns; drop_first=True omits the first level of each.
categorical_features = ['Sex', 'Embarked']
df = pd.get_dummies(data=df, columns=categorical_features, drop_first=True)

In [47]:
# Standardize Age and Fare with StandardScaler.
# NOTE(review): both columns are standardized again, together with FamSize, after the
# outlier capping further down, so this first pass looks redundant. Confirm before removing.
numerical_features = ['Age', 'Fare']
scaler = StandardScaler().fit(df[numerical_features])
df[numerical_features] = scaler.transform(df[numerical_features])

In [48]:
import numpy as np
from scipy import stats

In [49]:
def detect_outliers_iqr(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of the column and
    ``IQR = Q3 - Q1``.

    Args:
        df: DataFrame to inspect; it is not modified.
        column: Name of a numeric column in ``df``.
        factor: Multiplier applied to the IQR when building the fences.
            Defaults to 1.5.

    Returns:
        A DataFrame containing only the rows whose value is strictly below
        the lower fence or strictly above the upper fence. NaN values fail
        both comparisons and are therefore never reported.
    """
    values = df[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    is_outlier = (values < lower_bound) | (values > upper_bound)
    return df[is_outlier]


In [50]:
# Collect the rows flagged as IQR outliers for Fare and for Age.
# Both columns were standardized in an earlier cell, so the values shown in these
# frames are in standardized units rather than the raw ones.
outliers_fare_iqr = detect_outliers_iqr(df, 'Fare')
outliers_age_iqr = detect_outliers_iqr(df, 'Age')

In [51]:
def cap_outliers(df, column, factor=1.5):
    """Clip the values of ``column`` to its IQR fences and return ``df``.

    Values below ``Q1 - factor * IQR`` are raised to that fence and values
    above ``Q3 + factor * IQR`` are lowered to it, where Q1 and Q3 are the
    25th and 75th percentiles of the column and ``IQR = Q3 - Q1``. NaN
    values are left as they are.

    Args:
        df: DataFrame to modify. The column is overwritten on this object;
            no copy is made.
        column: Name of a numeric column in ``df``.
        factor: Non-negative multiplier applied to the IQR when building
            the fences. Defaults to 1.5.

    Returns:
        The same ``df`` object that was passed in, with ``column`` clipped.
    """
    values = df[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    # A single clip replaces the two separate lower/upper replacement passes.
    df[column] = values.clip(lower=lower_bound, upper=upper_bound)
    return df

In [52]:
# Cap Fare first, then Age, at their IQR fences.
for capped_column in ('Fare', 'Age'):
    df = cap_outliers(df, capped_column)

In [54]:
# Standardize the capped Age and Fare columns together with FamSize, using a freshly fitted scaler.
numerical_features = ['Age', 'Fare', 'FamSize']
scaler = StandardScaler().fit(df[numerical_features])
df[numerical_features] = scaler.transform(df[numerical_features])

In [55]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Fare,FamSize,Title,Sex_male,Embarked_Q,Embarked_S
0,892,0,3,"Kelly, Mr. James",0.382366,0,0,-0.79647,-0.553443,Mr,True,True,False
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",1.451281,1,0,-0.835607,0.105643,Mrs,False,False,True
2,894,0,2,"Myles, Mr. Thomas Francis",2.124697,0,0,-0.70876,-0.553443,Mr,True,True,False
3,895,0,3,"Wirz, Mr. Albert",-0.258983,0,0,-0.757139,-0.553443,Mr,True,False,True
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",-0.686549,1,1,-0.586042,0.764728,Mrs,False,False,True
