In [1]:
!kaggle datasets download -d yasserh/titanic-dataset

Dataset URL: https://www.kaggle.com/datasets/yasserh/titanic-dataset
License(s): CC0-1.0
Downloading titanic-dataset.zip to /content
  0% 0.00/22.0k [00:00<?, ?B/s]
100% 22.0k/22.0k [00:00<00:00, 26.5MB/s]


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
!unzip /content/titanic-dataset.zip

Archive:  /content/titanic-dataset.zip
  inflating: Titanic-Dataset.csv     


In [4]:
df = pd.read_csv('/content/Titanic-Dataset.csv')

In [5]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [6]:
df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True )

In [7]:
X = df.drop('Survived', axis=1)
y = df['Survived'].copy()

In [8]:
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,male,22.0,1,0,7.2500
1,1,female,38.0,1,0,71.2833
2,3,female,26.0,0,0,7.9250
3,1,female,35.0,1,0,53.1000
4,3,male,35.0,0,0,8.0500
...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000
887,1,female,19.0,0,0,30.0000
888,3,female,,1,2,23.4500
889,1,male,26.0,0,0,30.0000


In [9]:
y

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0
...,...
886,0
887,1
888,0
889,1


In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

In [13]:
num_features = X_train.select_dtypes(include=[np.number]).columns
cat_features = X_train.select_dtypes(exclude=[np.number]).columns

In [14]:
num_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])

cat_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

transformer = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('object', cat_pipeline, cat_features)
], remainder='passthrough')

In [15]:
transformer.fit(X_train)

In [16]:
X_Train_transformed_df = pd.DataFrame(data = transformer.transform(X_train), columns=transformer.get_feature_names_out())
X_test_transformed_df = pd.DataFrame(data = transformer.transform(X_test), columns=transformer.get_feature_names_out())

In [17]:
rfc = RandomForestClassifier(n_estimators=1)

In [18]:
rfc.fit(X_Train_transformed_df, y_train)

In [19]:
rfc.score(X_Train_transformed_df, y_train), rfc.score(X_test_transformed_df, y_test)

(0.9002808988764045, 0.7094972067039106)

In [20]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [21]:
%%time
param_grid = {
    'n_estimators': [1, 100, 150],
    'max_depth': [50, 100, 150],
    'criterion': ['gini', 'entropy', 'log_loss']
}

grid_scv = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=3)
grid_scv.fit(X_Train_transformed_df, y_train)

CPU times: user 14.3 s, sys: 84.2 ms, total: 14.4 s
Wall time: 14.5 s


In [22]:
grid_scv.best_params_

{'criterion': 'entropy', 'max_depth': 50, 'n_estimators': 100}

In [23]:
best_estimator = grid_scv.best_estimator_
best_estimator.fit(X_Train_transformed_df, y_train)

In [24]:
best_estimator.score(X_Train_transformed_df, y_train), best_estimator.score(X_test_transformed_df, y_test)

(0.9803370786516854, 0.8100558659217877)

# Grid Search with Full Pipeline

In [25]:
full_pipeline = Pipeline([
    ('preprocessing', transformer),
    ('estimator', rfc)
])

In [26]:
%%time
param_grid = {
    'estimator__n_estimators': [1, 100, 150],
    'estimator__max_depth': [50, 100, 150],
    'estimator__criterion': ['gini', 'entropy', 'log_loss']
}
grid_svc = GridSearchCV(estimator=full_pipeline, param_grid=param_grid, cv=3)
grid_svc.fit(X_train, y_train)

CPU times: user 15.4 s, sys: 89.5 ms, total: 15.5 s
Wall time: 15.6 s


In [27]:
grid_scv.best_params_

{'criterion': 'entropy', 'max_depth': 50, 'n_estimators': 100}

In [28]:
best_estimator = grid_scv.best_estimator_
best_estimator.fit(X_train, y_train)

ValueError: could not convert string to float: 'male'

In [None]:
best_estimator.score(X_train, y_train), best_estimator.score(X_test, y_test)

ValueError: could not convert string to float: 'male'

In [None]:
aids_df = pd.read_csv('AIDS_Classification_50000.csv')

In [None]:
aids_df

In [None]:
aids_df.describe()

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
aids_df

In [None]:
aids_df.info()

In [None]:
aids_df.isna().sum()

In [None]:
X = aids_df.drop('infected', axis=1)
y = aids_df['infected'].copy()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
num_features = X_train.select_dtypes(include=[np.number]).columns
cat_features = X_train.select_dtypes(exclude=[np.number]).columns

In [None]:
num_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])

cat_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

transformer = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('object', cat_pipeline, cat_features)
], remainder='passthrough')

In [None]:
transformer.fit(X_train)

In [None]:
X_train_transformer_df = pd.DataFrame(data=transformer.transform(X_train), columns=transformer.get_feature_names_out())
X_test_transformer_df = pd.DataFrame(data=transformer.transform(X_test), columns=transformer.get_feature_names_out())

In [None]:
rfc = RandomForestClassifier(n_estimators=1)