<a href="https://colab.research.google.com/github/qedir051/ML-Projects/blob/main/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Read the Titanic training table from the working directory and preview it.
# NOTE: test.csv is not loaded in this notebook; evaluation further down uses
# a hold-out split carved out of train.csv instead.
train = pd.read_csv('train.csv')
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Count missing values per column to decide which columns need imputation.
train.isnull().sum(axis=0)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [None]:
# Remove columns that are not used as model inputs: Name and Ticket are
# free-text identifiers, and Cabin is missing for most rows (687 nulls in the
# summary above).
#
# The frame is rebound instead of mutated with inplace=True, and
# errors='ignore' skips columns that are already gone, so this cell can be
# re-run without raising KeyError.
train = train.drop(columns=['Name', 'Ticket', 'Cabin'], errors='ignore')

In [None]:
# Separate the target from the predictors, then hold out 20% of the rows for
# evaluation. The fixed random_state keeps the split reproducible.
# NOTE(review): PassengerId is still part of X at this point; it looks like a
# row identifier rather than a predictive feature — consider dropping it.
y = train['Survived']
X = train.drop(columns='Survived')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Sanity-check the split sizes: (train X, test X, train y, test y).
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((712, 8), (179, 8), (712,), (179,))

In [None]:
# Partition the predictor columns by dtype: object-typed columns are treated
# as categorical features, everything else as numeric features.
cat_fat, num_fat = (
    X_train.select_dtypes(include=['object']).columns,
    X_train.select_dtypes(exclude=['object']).columns,
)
cat_fat, num_fat

(Index(['Sex', 'Embarked'], dtype='object'),
 Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object'))

In [None]:
# Categorical features: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' keeps transform() from failing on a
# category that was not present when the encoder was fitted.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric features: fill gaps with the column median; no scaling is applied.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

# Route each column group through its own pipeline.
# sparse_threshold=0 forces a dense ndarray result. OneHotEncoder emits a
# sparse matrix, and by default ColumnTransformer returns a sparse result
# whenever the stacked output's density falls below 0.3; the plain
# pd.DataFrame(...) constructor used below expects a dense 2-D array.
transformer = ColumnTransformer(
    [
        ('cat_pipeline', cat_pipeline, cat_fat),
        ('num_pipeline', num_pipeline, num_fat),
    ],
    sparse_threshold=0,
)

# Learn imputation values and categories from the training split only, then
# apply the fitted transformer unchanged to the hold-out split.
train_transformed = transformer.fit_transform(X_train)
test_transformed = transformer.transform(X_test)

# Output column names are prefixed with the owning pipeline's name,
# e.g. 'cat_pipeline__Sex_female'.
feature_names = transformer.get_feature_names_out()

# Wrap the arrays in DataFrames so the features stay labelled. The frames get
# a fresh 0..n-1 index rather than the original row labels.
X_train_transformed = pd.DataFrame(train_transformed, columns=feature_names)
X_test_transformed = pd.DataFrame(test_transformed, columns=feature_names)

X_train_transformed.head()

Unnamed: 0,cat_pipeline__Sex_female,cat_pipeline__Sex_male,cat_pipeline__Embarked_C,cat_pipeline__Embarked_Q,cat_pipeline__Embarked_S,num_pipeline__PassengerId,num_pipeline__Pclass,num_pipeline__Age,num_pipeline__SibSp,num_pipeline__Parch,num_pipeline__Fare
0,0.0,1.0,0.0,0.0,1.0,332.0,1.0,45.5,0.0,0.0,28.5
1,0.0,1.0,0.0,0.0,1.0,734.0,2.0,23.0,0.0,0.0,13.0
2,0.0,1.0,0.0,0.0,1.0,383.0,3.0,32.0,0.0,0.0,7.925
3,0.0,1.0,0.0,0.0,1.0,705.0,3.0,26.0,1.0,0.0,7.8542
4,1.0,0.0,0.0,0.0,1.0,814.0,3.0,6.0,4.0,2.0,31.275


In [None]:
# Baseline model: a 100-tree random forest with a fixed seed for
# reproducibility, trained on the preprocessed training split.
rfc = RandomForestClassifier(random_state=42, n_estimators=100)
rfc.fit(X=X_train_transformed, y=y_train)

In [None]:
# Mean accuracy of the baseline forest on the hold-out split.
rfc.score(X=X_test_transformed, y=y_test)

0.8324022346368715

In [None]:
# Hyperparameter search for the random forest.
# (GridSearchCV is imported in the notebook's import cell.)
# The grid has 2 * 4 * 3 * 3 * 2 = 144 combinations, each evaluated with
# 5-fold cross-validation on the training split.
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
}

# Rank candidates by classification accuracy. The previous scorer,
# 'neg_mean_squared_error', is a regression metric; on 0/1 labels it equals
# (accuracy - 1), so the selected parameters are expected to be the same, but
# best_score_ and cv_results_ are now reported on the accuracy scale.
# GridSearchCV fits clones of `rfc`, so the baseline model fitted earlier is
# left untouched.
grid_search = GridSearchCV(rfc, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_transformed, y_train)

In [None]:
# Hyperparameter combination that achieved the best mean cross-validated score.
grid_search.best_params_

{'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 10,
 'n_estimators': 100}

In [None]:
# Take the estimator that the search refit with the winning parameters and
# measure its mean accuracy on the hold-out split.
best_model = grid_search.best_estimator_
best_model.score(X=X_test_transformed, y=y_test)

0.8156424581005587