<a href="https://colab.research.google.com/github/ruanwensheng/Machine-Learning-Algorithms-/blob/ensemble_rf/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas numpy scikit-learn xgboost



In [39]:
import pandas as pd
df_train = pd.read_csv('/content/train.csv') # only train has labels
df_test = pd.read_csv('/content/test.csv')

test_ids = df_test["PassengerId"]
df_train.head()
cols = df_train.columns.tolist()
print(cols)

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [15]:
def clean(data):
    # Drop only existing columns
    drop_cols = ["Ticket", "Cabin", "Name", "PassengerId"]
    data = data.drop(columns=[col for col in drop_cols if col in data.columns])

    # Fill numeric columns with median
    cols = ["SibSp", "Parch", "Fare", "Age"]
    for col in cols:
        if col in data.columns:
            data[col].fillna(data[col].median(), inplace=True)

    # Fill 'Embarked' with 'U'
    if 'Embarked' in data.columns:
        data['Embarked'].fillna("U", inplace=True)

    return data

# Apply
df = clean(df_train)
test = clean(df_test)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Embarked'].fillna("U", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

In [14]:
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S


In [17]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()

cols_notnum = ['Sex','Embarked']
for i in cols_notnum:
  df[i]= le.fit_transform(df[i])
  test[i]= le.transform(test[i])
  print(le.classes_)

df.head(20)



[0 1]
[0 1 2 3]


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2
5,0,3,1,28.0,0,0,8.4583,1
6,0,1,1,54.0,0,0,51.8625,2
7,0,3,1,2.0,3,1,21.075,2
8,1,3,0,27.0,0,2,11.1333,2
9,1,2,0,14.0,1,0,30.0708,0


In [18]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV



In [19]:
y = df['Survived']
X = df.drop("Survived", axis =1)

# split into train and validation

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 42 )

**LOGISTICREG**

In [21]:
from sklearn.linear_model import LogisticRegression

model_lr = LogisticRegression(random_state = 42, max_iter = 1000)

model_lr.fit(X_train, y_train)
y_pred_lr = model_lr.predict(X_val)

print(f'accuracy score: {accuracy_score(y_val, y_pred_lr): 2f}')
print(f'precision score: {precision_score(y_val, y_pred_lr): 2f}')
print(f'recall score: {recall_score(y_val, y_pred_lr): 2f}')
print(f'f1 score: { f1_score(y_val, y_pred_lr): 2f}')

accuracy score:  0.810056
precision score:  0.785714
recall score:  0.743243
f1 score:  0.763889


**BASELINE for XGBOOST**

In [20]:
model_0 = xgb.XGBClassifier(random_state = 42)
model_0.fit(X_train, y_train)

y_pred = model_0.predict(X_val)

print(f'accuracy score: {accuracy_score(y_val, y_pred): 2f}')
print(f'precision score: {precision_score(y_val, y_pred): 2f}')
print(f'recall score: {recall_score(y_val, y_pred): 2f}')
print(f'f1 score: { f1_score(y_val, y_pred): 2f}')

accuracy score:  0.798883
precision score:  0.756757
recall score:  0.756757
f1 score:  0.756757


**GRID SEARCH FOR XGBOOST**

In [22]:
import time
parameters = {
    'n_estimators': [50,100,200,500,1000],
    "learning_rate":[0.1,0.3,0.6,0.8,1.0],
    'max_depth':[1,3,6,10],
    'reg_alpha':[0, 0.1, 0.5,1],
    'reg_lambda':[0.1,0.5,1,1.5]
}

g = GridSearchCV(xgb.XGBClassifier(random_state = 42), parameters, cv = 5,n_jobs=-1 )

start_time = time.time()
g.fit(X_train, y_train)

duration = time.time()-start_time

# best para
print(g.best_params_)


{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'reg_alpha': 0.1, 'reg_lambda': 1}


In [25]:
model_1 = g.best_estimator_

y_pred = model_1.predict(X_val)

print(f'accuracy score: {accuracy_score(y_val, y_pred): 2f}')
print(f'precision score: {precision_score(y_val, y_pred): 2f}')
print(f'recall score: {recall_score(y_val, y_pred): 2f}')
print(f'f1 score: { f1_score(y_val, y_pred): 2f}')
print(f'duration: { duration: 2f}')

accuracy score:  0.821229
precision score:  0.828125
recall score:  0.716216
f1 score:  0.768116
duration:  576.136518


In [27]:
!pip install scipy



**RANDOMIZED SEARCH for XGBOOST**

In [28]:
from scipy.stats import poisson, uniform

parameters2 = {
    'n_estimators': poisson(mu = 500),
    "learning_rate":uniform(),
    'max_depth':poisson(mu=6),
    'reg_alpha':uniform(loc =0, scale = 2),
    'reg_lambda':uniform(loc=0,scale=2)
}

r = RandomizedSearchCV(xgb.XGBClassifier(random_state=42), parameters2,cv =5,
                       n_iter=100,random_state =42,n_jobs=-1)


start_time = time.time()
r.fit(X_train, y_train)

duration2 = time.time()-start_time

# best para
print(r.best_params_)

{'learning_rate': np.float64(0.12887972191064923), 'max_depth': 3, 'n_estimators': 477, 'reg_alpha': np.float64(1.7935768198120237), 'reg_lambda': np.float64(0.9479232805257447)}


In [29]:
model_2 = r.best_estimator_

y_pred = model_2.predict(X_val)

print(f'accuracy score: {accuracy_score(y_val, y_pred): 2f}')
print(f'precision score: {precision_score(y_val, y_pred): 2f}')
print(f'recall score: {recall_score(y_val, y_pred): 2f}')
print(f'f1 score: { f1_score(y_val, y_pred): 2f}')
print(f'duration: { duration2: 2f}')

accuracy score:  0.815642
precision score:  0.836066
recall score:  0.689189
f1 score:  0.755556
duration:  50.581043


**BAYESIAN OPTIMIZATION**

In [30]:
!pip install scikit-optimize


Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-25.7.0-py3-none-any.whl.metadata (12 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-25.7.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-25.7.0 scikit-optimize-0.10.2


In [36]:
from skopt import BayesSearchCV
from skopt.space import Real, Integer
parameters3 = {
    'n_estimators': Integer(50, 1000, prior='uniform'),
    'learning_rate': Real(0.0001, 1, prior='log-uniform'),
    'max_depth': Integer(1, 10, prior='uniform'),
    'reg_alpha': Real(0.0001, 2, prior='log-uniform'),
    'reg_lambda': Real(0.0001, 2, prior='log-uniform')
}

b = BayesSearchCV(
    xgb.XGBClassifier(random_state=42),
    parameters3,
    cv=5,
    n_iter=5,
    random_state=42,
    n_jobs=-1
)

start_time = time.time()
b.fit(X_train, y_train)
duration3 = time.time() - start_time

# best para
print(r.best_params_)

{'learning_rate': np.float64(0.12887972191064923), 'max_depth': 3, 'n_estimators': 477, 'reg_alpha': np.float64(1.7935768198120237), 'reg_lambda': np.float64(0.9479232805257447)}


In [37]:
model_3 = b.best_estimator_

y_pred = model_3.predict(X_val)

print(f'accuracy score: {accuracy_score(y_val, y_pred): 2f}')
print(f'precision score: {precision_score(y_val, y_pred): 2f}')
print(f'recall score: {recall_score(y_val, y_pred): 2f}')
print(f'f1 score: { f1_score(y_val, y_pred): 2f}')
print(f'duration: { duration3: 2f}')

accuracy score:  0.798883
precision score:  0.779412
recall score:  0.716216
f1 score:  0.746479
duration:  6.008068


In [41]:
submission_preds = model_2.predict(test)
df_sub = pd.DataFrame({"PassengerId": test_ids.values,
                       "Survived": submission_preds})

In [42]:
df_sub.to_csv("submission.csv", index = False)