In [169]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as mlt
import seaborn as sns
titanic = sns.load_dataset('titanic')

In [170]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [171]:
titanic.info(
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [172]:
titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [173]:
titanic.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [174]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
titanic['age'] = imputer.fit_transform(titanic[['age']])
imputer2=SimpleImputer(strategy='most_frequent')
titanic[['embarked', 'embark_town']] = imputer2.fit_transform(titanic[['embarked', 'embark_town']])


In [175]:
titanic['deck'].value_counts(dropna=False)

deck
NaN    688
C       59
B       47
D       33
E       32
A       15
F       13
G        4
Name: count, dtype: int64

In [176]:
titanic['class'].value_counts(dropna=False)

class
Third     491
First     216
Second    184
Name: count, dtype: int64

In [177]:
def fill_deck(row):
    if pd.isnull(row['deck']):
        if row['pclass']==1:
            return 'C'
        elif row['pclass']==2:
            return 'E'
        else:
            return 'G'

    else:
        return row['deck']             

titanic['deck'] = titanic.apply(fill_deck, axis=1)

In [178]:
titanic.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64

In [179]:
titanic.select_dtypes(include=['object', 'category']).columns


Index(['sex', 'embarked', 'class', 'who', 'deck', 'embark_town', 'alive'], dtype='object')

In [180]:

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_cols = ['age', 'fare', 'sibsp', 'parch']
categorical_cols = ['sex', 'embarked']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],remainder='passthrough')
 
titanic_df = preprocessor.fit_transform(titanic)


In [181]:
titanic_df.shape

(891, 18)

In [182]:
encoded_cols = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)
final_cols = numeric_cols + list(encoded_cols) + [
    col for col in titanic.columns if col not in (numeric_cols + categorical_cols)
]

titanic_df = pd.DataFrame(preprocessor.fit_transform(titanic), columns=final_cols)

# Now you can inspect it
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   age          891 non-null    object
 1   fare         891 non-null    object
 2   sibsp        891 non-null    object
 3   parch        891 non-null    object
 4   sex_female   891 non-null    object
 5   sex_male     891 non-null    object
 6   embarked_C   891 non-null    object
 7   embarked_Q   891 non-null    object
 8   embarked_S   891 non-null    object
 9   survived     891 non-null    object
 10  pclass       891 non-null    object
 11  class        891 non-null    object
 12  who          891 non-null    object
 13  adult_male   891 non-null    object
 14  deck         891 non-null    object
 15  embark_town  891 non-null    object
 16  alive        891 non-null    object
 17  alone        891 non-null    object
dtypes: object(18)
memory usage: 125.4+ KB


In [183]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   age          891 non-null    object
 1   fare         891 non-null    object
 2   sibsp        891 non-null    object
 3   parch        891 non-null    object
 4   sex_female   891 non-null    object
 5   sex_male     891 non-null    object
 6   embarked_C   891 non-null    object
 7   embarked_Q   891 non-null    object
 8   embarked_S   891 non-null    object
 9   survived     891 non-null    object
 10  pclass       891 non-null    object
 11  class        891 non-null    object
 12  who          891 non-null    object
 13  adult_male   891 non-null    object
 14  deck         891 non-null    object
 15  embark_town  891 non-null    object
 16  alive        891 non-null    object
 17  alone        891 non-null    object
dtypes: object(18)
memory usage: 125.4+ KB


In [184]:
titanic_df.head()

Unnamed: 0,age,fare,sibsp,parch,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,survived,pclass,class,who,adult_male,deck,embark_town,alive,alone
0,-0.592481,-0.502445,0.432793,-0.473674,0.0,1.0,0.0,0.0,1.0,0,3,Third,man,True,G,Southampton,no,False
1,0.638789,0.786845,0.432793,-0.473674,1.0,0.0,1.0,0.0,0.0,1,1,First,woman,False,C,Cherbourg,yes,False
2,-0.284663,-0.488854,-0.474545,-0.473674,1.0,0.0,0.0,0.0,1.0,1,3,Third,woman,False,G,Southampton,yes,True
3,0.407926,0.42073,0.432793,-0.473674,1.0,0.0,0.0,0.0,1.0,1,1,First,woman,False,C,Southampton,yes,False
4,0.407926,-0.486337,-0.474545,-0.473674,0.0,1.0,0.0,0.0,1.0,0,3,Third,man,True,G,Southampton,no,True


In [185]:
numeric_like = ['age', 'fare', 'sibsp', 'parch'] + list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols))
titanic_df[numeric_like] = titanic_df[numeric_like].apply(pd.to_numeric)


In [186]:
titanic_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          891 non-null    float64
 1   fare         891 non-null    float64
 2   sibsp        891 non-null    float64
 3   parch        891 non-null    float64
 4   sex_female   891 non-null    float64
 5   sex_male     891 non-null    float64
 6   embarked_C   891 non-null    float64
 7   embarked_Q   891 non-null    float64
 8   embarked_S   891 non-null    float64
 9   survived     891 non-null    object 
 10  pclass       891 non-null    object 
 11  class        891 non-null    object 
 12  who          891 non-null    object 
 13  adult_male   891 non-null    object 
 14  deck         891 non-null    object 
 15  embark_town  891 non-null    object 
 16  alive        891 non-null    object 
 17  alone        891 non-null    object 
dtypes: float64(9), object(9)
memory usage: 125.4+ KB


In [187]:
titanic_df['family_size'] = titanic['sibsp'] + titanic['parch'] + 1


In [188]:
titanic_df.drop(['sibsp', 'parch', 'embark_town', 'alive',  'class', 'who', 'adult_male', 'sex_female'], axis=1, inplace=True)


In [189]:
titanic_df.head()

Unnamed: 0,age,fare,sex_male,embarked_C,embarked_Q,embarked_S,survived,pclass,deck,alone,family_size
0,-0.592481,-0.502445,1.0,0.0,0.0,1.0,0,3,G,False,2
1,0.638789,0.786845,0.0,1.0,0.0,0.0,1,1,C,False,2
2,-0.284663,-0.488854,0.0,0.0,0.0,1.0,1,3,G,True,1
3,0.407926,0.42073,0.0,0.0,0.0,1.0,1,1,C,False,2
4,0.407926,-0.486337,1.0,0.0,0.0,1.0,0,3,G,True,1


In [190]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
titanic_df['deck_encoded'] = le.fit_transform(titanic_df['deck'])
titanic_df.drop(columns=['deck'], inplace=True)

In [191]:
titanic_df['survived'] = titanic_df['survived'].astype(int)
titanic_df['pclass'] = titanic_df['pclass'].astype(int)

cols_to_int = ['sex_male', 'embarked_C', 'embarked_Q', 'embarked_S']
titanic_df[cols_to_int] = titanic_df[cols_to_int].astype(int)

In [193]:
titanic_df['alone'] = titanic_df['alone'].astype(int)


In [194]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   age           891 non-null    float64
 1   fare          891 non-null    float64
 2   sex_male      891 non-null    int32  
 3   embarked_C    891 non-null    int32  
 4   embarked_Q    891 non-null    int32  
 5   embarked_S    891 non-null    int32  
 6   survived      891 non-null    int32  
 7   pclass        891 non-null    int32  
 8   alone         891 non-null    int32  
 9   family_size   891 non-null    int64  
 10  deck_encoded  891 non-null    int32  
dtypes: float64(2), int32(8), int64(1)
memory usage: 48.9 KB


In [195]:
from sklearn.model_selection import train_test_split

X = titanic_df.drop('survived', axis=1)
y = titanic_df['survived']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [196]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.8044692737430168


In [198]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [199]:
rf_model = RandomForestClassifier(
    n_estimators=200,      
    max_depth=8,            
    random_state=42,
    class_weight='balanced' 
)

rf_model.fit(X_train, y_train)


0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,8
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [200]:
y_pred = rf_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.8156424581005587

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.87      0.85       105
           1       0.80      0.74      0.77        74

    accuracy                           0.82       179
   macro avg       0.81      0.80      0.81       179
weighted avg       0.81      0.82      0.81       179


Confusion Matrix:
 [[91 14]
 [19 55]]


In [201]:
pip install xgboost


Collecting xgboostNote: you may need to restart the kernel to use updated packages.

  Downloading xgboost-3.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.1-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.5/72.0 MB 2.4 MB/s eta 0:00:30
    --------------------------------------- 1.3/72.0 MB 3.1 MB/s eta 0:00:24
   - -------------------------------------- 1.8/72.0 MB 3.1 MB/s eta 0:00:23
   - -------------------------------------- 2.9/72.0 MB 3.4 MB/s eta 0:00:21
   -- ------------------------------------- 3.7/72.0 MB 3.5 MB/s eta 0:00:20
   -- ------------------------------------- 4.5/72.0 MB 3.5 MB/s eta 0:00:20
   -- ------------------------------------- 5.2/72.0 MB 3.5 MB/s eta 0:00:19
   --- ------------------------------------ 6.0/72.0 MB 3.6 MB/s eta 0:00:19
   --- ------------------------------------ 6.8/72.0 MB 3.6 MB/s eta 0:00:19
   ---- ----------------

In [204]:
from xgboost import XGBClassifier

In [205]:
xgb_model = XGBClassifier(
    n_estimators=300,     
    learning_rate=0.05,     
    max_depth=5,            
    subsample=0.8,         
    colsample_bytree=0.8,  
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'   
)

xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.8212290502793296

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.85      0.85       105
           1       0.78      0.78      0.78        74

    accuracy                           0.82       179
   macro avg       0.82      0.82      0.82       179
weighted avg       0.82      0.82      0.82       179


Confusion Matrix:
 [[89 16]
 [16 58]]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [206]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier

param_grid = {
    'n_estimators': [200, 300, 400, 500],
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

xgb_search = RandomizedSearchCV(
    XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    param_distributions=param_grid,
    n_iter=20,
    cv=5,
    scoring='accuracy',
    random_state=42,
    verbose=1
)

xgb_search.fit(X_train, y_train)
print("Best parameters:", xgb_search.best_params_)
print("Best CV accuracy:", xgb_search.best_score_)


Fitting 5 folds for each of 20 candidates, totalling 100 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Best parameters: {'subsample': 0.7, 'n_estimators': 400, 'max_depth': 3, 'learning_rate': 0.01, 'colsample_bytree': 0.7}
Best CV accuracy: 0.8370530877573131


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [207]:
from xgboost import XGBClassifier

best_xgb = XGBClassifier(
    n_estimators=400,
    learning_rate=0.01,
    max_depth=3,
    subsample=0.7,
    colsample_bytree=0.7,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

best_xgb.fit(X_train, y_train)
y_pred = best_xgb.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Test Accuracy: 0.8156424581005587
              precision    recall  f1-score   support

           0       0.81      0.90      0.85       105
           1       0.83      0.70      0.76        74

    accuracy                           0.82       179
   macro avg       0.82      0.80      0.80       179
weighted avg       0.82      0.82      0.81       179



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
