In [2]:
# Create the directory that a later cell moves kaggle.json into
# (-p: succeed silently if it already exists).
!mkdir -p /root/.kaggle

In [5]:
import shutil
from pathlib import Path

# Install the uploaded Kaggle API token into /root/.kaggle.
# Written so the cell can be re-run safely: if the token was already moved the
# cell is a no-op instead of raising, and a freshly uploaded kaggle.json
# replaces the previously installed one.
kaggle_token_src = Path('kaggle.json')
kaggle_token_dst = Path('/root/.kaggle') / kaggle_token_src.name
kaggle_token_dst.parent.mkdir(parents=True, exist_ok=True)

if kaggle_token_src.exists():
    shutil.move(str(kaggle_token_src), str(kaggle_token_dst))
elif not kaggle_token_dst.exists():
    # Neither an uploaded token nor an installed one: fail with a clear message.
    raise FileNotFoundError(
        f"{kaggle_token_src} not found in the working directory "
        f"and no token installed at {kaggle_token_dst}"
    )

# Show where the token lives (same value shutil.move returned originally).
str(kaggle_token_dst)

'/root/.kaggle/kaggle.json'

In [6]:
# Restrict the token file to owner read/write only (mode 600).
!chmod 600 /root/.kaggle/kaggle.json

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [8]:
# Download the yasserh/titanic-dataset archive with the Kaggle CLI.
# The recorded output shows the zip being written to /content.
!kaggle datasets download -d yasserh/titanic-dataset

Dataset URL: https://www.kaggle.com/datasets/yasserh/titanic-dataset
License(s): CC0-1.0
Downloading titanic-dataset.zip to /content
  0% 0.00/22.0k [00:00<?, ?B/s]
100% 22.0k/22.0k [00:00<00:00, 96.7MB/s]


In [9]:
!unzip /content/titanic-dataset.zip

Archive:  /content/titanic-dataset.zip
  inflating: Titanic-Dataset.csv     


In [10]:
# Load the extracted CSV into a DataFrame and display it as the cell output.
df = pd.read_csv('/content/Titanic-Dataset.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [11]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0


In [12]:
# Remove the columns that are not used as model features.
# Rebinding `df` with errors='ignore' (instead of inplace=True) keeps this cell
# idempotent: re-running it after the columns are already gone no longer raises
# KeyError.
df = df.drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked'],
    errors='ignore',
)

In [15]:
# Keep only the rows where both the target ('Survived') and 'Age' are present.
# When nothing is missing the mask is all True and every row is kept.
df = df[df[['Survived', 'Age']].notna().all(axis=1)]

In [16]:
# Target: an independent copy of the 'Survived' column.
y = df['Survived'].copy()
# Features: every remaining column.
X = df.drop(columns='Survived')

In [17]:
# Display the feature matrix (all columns of df except 'Survived').
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,male,22.0,1,0,7.2500
1,1,female,38.0,1,0,71.2833
2,3,female,26.0,0,0,7.9250
3,1,female,35.0,1,0,53.1000
4,3,male,35.0,0,0,8.0500
...,...,...,...,...,...,...
885,3,female,39.0,0,5,29.1250
886,2,male,27.0,0,0,13.0000
887,1,female,19.0,0,0,30.0000
889,1,male,26.0,0,0,30.0000


In [18]:
# Display the target column ('Survived').
y

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0
...,...
885,0
886,0
887,1
889,1


In [19]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing.
# A fixed random_state makes the shuffle deterministic, so the split and every
# score computed from it are reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

In [21]:
# Partition the training columns by dtype: numeric columns get the numeric
# preprocessing branch, everything else the categorical branch.
num_features = X_train.select_dtypes(include='number').columns
cat_features = X_train.select_dtypes(exclude='number').columns

In [22]:
# Numeric branch: fill gaps with the column median, then standardise.
num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories unseen during fit are ignored.
cat_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Route each column group through its branch; any column not listed in either
# group is passed through unchanged.
transformer = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_features),
        ('object', cat_pipeline, cat_features),
    ],
    remainder='passthrough',
)

In [23]:
# Fit the preprocessing steps on the training split only; the test split is
# only transformed (never fitted on) in the next cell.
transformer.fit(X_train)

In [24]:
# Wrap the transformed arrays in DataFrames with the generated feature names.
# The original row index is carried over so the features stay label-aligned
# with y_train / y_test, which keep the shuffled, non-contiguous index of `df`;
# without it the frames would get a fresh 0..n-1 index that no longer matches.
feature_names = transformer.get_feature_names_out()
X_train_transformer_df = pd.DataFrame(
    data=transformer.transform(X_train),
    columns=feature_names,
    index=X_train.index,
)
X_test_transformer_df = pd.DataFrame(
    data=transformer.transform(X_test),
    columns=feature_names,
    index=X_test.index,
)

In [25]:
# Baseline model: a forest with a single tree.
# random_state is fixed so the fitted model and its scores are reproducible; the
# seed is also inherited by the clones that the grid searches below create.
rfc = RandomForestClassifier(n_estimators=1, random_state=42)

In [26]:
# Train the baseline forest on the preprocessed training features.
rfc.fit(X_train_transformer_df, y_train)

In [27]:
# (train accuracy, test accuracy) of the baseline forest.
tuple(
    rfc.score(features, target)
    for features, target in (
        (X_train_transformer_df, y_train),
        (X_test_transformer_df, y_test),
    )
)

(0.8949211908931699, 0.8181818181818182)

# GridSearchCV & RandomizedSearchCV

In [28]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [29]:
%%time
# Candidate hyper-parameters for the forest: 3 x 3 x 3 = 27 combinations.
# NOTE(review): the scikit-learn docs describe 'entropy' and 'log_loss' as the
# same Shannon information-gain criterion, so one of them may be redundant; confirm.
param_grid = dict(
    n_estimators=[1, 100, 150],
    max_depth=[50, 100, 150],
    criterion=['gini', 'entropy', 'log_loss'],
)

# Exhaustive search with 3-fold cross-validation on the already-preprocessed
# training features.
grid_scv = GridSearchCV(rfc, param_grid, cv=3)
grid_scv.fit(X_train_transformer_df, y_train)

CPU times: user 13.7 s, sys: 23.4 ms, total: 13.7 s
Wall time: 13.8 s


In [30]:
# Parameter combination the grid search selected as best.
grid_scv.best_params_

{'criterion': 'gini', 'max_depth': 100, 'n_estimators': 100}

In [31]:
# grid_scv was built with the default refit=True, so best_estimator_ has already
# been refit on the full training data with the winning parameters. Fitting it
# again is wasted work and, for an unseeded forest, swaps the selected model for
# a different random one, so it is used as-is.
best_estimator = grid_scv.best_estimator_
best_estimator

In [32]:
# (train accuracy, test accuracy) of the tuned forest on preprocessed features.
tuple(
    best_estimator.score(features, target)
    for features, target in (
        (X_train_transformer_df, y_train),
        (X_test_transformer_df, y_test),
    )
)

(0.9877408056042032, 0.8391608391608392)

# Grid Search with Full Pipeline

In [33]:
# Chain preprocessing and the classifier into one estimator so that raw feature
# frames can be passed straight to fit / score.
full_pipeline = Pipeline(
    steps=[
        ('preprocessing', transformer),
        ('estimator', rfc),
    ]
)

In [34]:
%%time
# Same 27-combination grid as before; the 'estimator__' prefix routes each
# parameter to the pipeline step named 'estimator'.
param_grid = dict(
    estimator__n_estimators=[1, 100, 150],
    estimator__max_depth=[50, 100, 150],
    estimator__criterion=['gini', 'entropy', 'log_loss'],
)

# 3-fold cross-validated search over the whole pipeline, fed the raw
# (un-preprocessed) training features.
grid_scv = GridSearchCV(full_pipeline, param_grid, cv=3)
grid_scv.fit(X_train, y_train)

CPU times: user 15.3 s, sys: 58.3 ms, total: 15.3 s
Wall time: 15.5 s


In [35]:
# Parameter combination the pipeline grid search selected as best.
grid_scv.best_params_

{'estimator__criterion': 'gini',
 'estimator__max_depth': 50,
 'estimator__n_estimators': 150}

In [36]:
# grid_scv was built with the default refit=True, so best_estimator_ is already
# the winning pipeline refit on all of X_train / y_train. Fitting it a second
# time only repeats that work (and re-randomises an unseeded forest), so it is
# used as-is.
best_estimator = grid_scv.best_estimator_
best_estimator

In [37]:
# (train accuracy, test accuracy) of the tuned pipeline on the raw features.
tuple(
    best_estimator.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

(0.9877408056042032, 0.8461538461538461)