In [3]:
import pandas as pd
pd.__version__

'2.3.3'

In [4]:
df = pd.read_csv("../data/Titanic-Dataset.csv")
df.shape

(891, 12)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

df = df.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [8]:
from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [9]:
y_train = df_train.Survived.values
y_val = df_val.Survived.values

train_dicts = df_train.drop("Survived", axis=1).to_dict(orient="records")
val_dicts = df_val.drop("Survived", axis=1).to_dict(orient="records")

In [10]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=True)

X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr = LogisticRegression(max_iter=500)
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_val)
acc_lr = accuracy_score(y_val, y_pred_lr)

print("Logistic Regression accuracy:", acc_lr)

Logistic Regression accuracy: 0.7865168539325843


Decision Tree

In [12]:
from sklearn.tree import DecisionTreeClassifier

for d in [3, 5, 7, 10, None]:
    dt = DecisionTreeClassifier(max_depth=d, random_state=1)
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_val)
    print(d, accuracy_score(y_val, y_pred))




3 0.8258426966292135
5 0.8146067415730337
7 0.8258426966292135
10 0.797752808988764
None 0.7752808988764045


In [13]:
dt = DecisionTreeClassifier(max_depth=3, random_state=1)
dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_val)
acc_dt = accuracy_score(y_val, y_pred_dt)

print("Decision Tree accuracy:", acc_dt)

Decision Tree accuracy: 0.8258426966292135


Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=200, max_depth=8, random_state=1)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_val)
acc_rf = accuracy_score(y_val, y_pred_rf)

print("Random Forest accuracy:", acc_rf)

Random Forest accuracy: 0.8258426966292135


Gradient Boosting

In [15]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(random_state=1)
gb.fit(X_train, y_train)

y_pred_gb = gb.predict(X_val)
acc_gb = accuracy_score(y_val, y_pred_gb)

print("Gradient Boosting accuracy:", acc_gb)


Gradient Boosting accuracy: 0.797752808988764


XGBoost

In [17]:
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

params = {
    'eta': 0.1,
    'max_depth': 6,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss'
}

model_xgb = xgb.train(params, dtrain, num_boost_round=200)

y_pred_xgb = model_xgb.predict(dval)
y_pred_xgb_bin = (y_pred_xgb >= 0.5).astype(int)

from sklearn.metrics import accuracy_score
acc_xgb = accuracy_score(y_val, y_pred_xgb_bin)

print("XGBoost accuracy:", acc_xgb)


XGBoost accuracy: 0.8426966292134831


In [18]:
results = {
    "Logistic Regression": acc_lr,
    "Decision Tree": acc_dt,
    "Random Forest": acc_rf,
    "Gradient Boosting": acc_gb,
    "XGBoost": acc_xgb  # if used
}

results

{'Logistic Regression': 0.7865168539325843,
 'Decision Tree': 0.8258426966292135,
 'Random Forest': 0.8258426966292135,
 'Gradient Boosting': 0.797752808988764,
 'XGBoost': 0.8426966292134831}

In [20]:
import numpy as np
X_full_train = xgb.DMatrix(
    data = np.vstack([X_train.toarray(), X_val.toarray()]),
    label = np.concatenate([y_train, y_val])
)

xgb_params = {
    'eta': 0.1,
    'max_depth': 4,
    'min_child_weight': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss'
}

model = xgb.train(xgb_params, X_full_train, num_boost_round=200)

In [22]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

dtest = xgb.DMatrix(X_test)
y_pred = model.predict(dtest)
y_pred_binary = (y_pred >= 0.5)


accuracy = accuracy_score(y_test, y_pred_binary)
print("Test Accuracy:", accuracy)


NameError: name 'X_test' is not defined