### Simulating Weighted Sampling

In [5]:
import pandas as pd 
import numpy as np 

# Read the Titanic passenger table straight from the public CSV
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
df = pd.read_csv(url)

# Work with the first five passengers and four columns only,
# so every weight in the demo below can be followed by eye
df_small = df.loc[:, ['Pclass', 'Sex', 'Age', 'Survived']].iloc[:5]

# Start from a uniform distribution: five instances, each with weight 1/5
weights = np.full(5, 1/5)
print("Initial Weights:", weights)

# Pretend the learner got only the last instance wrong
misclassified = [False, False, False, False, True]
# Double the weight wherever the mask is True, leave the others as they are
weights = np.where(misclassified, weights * 2, weights)
print("Weights after misclassification adjustment:", weights)

# Rescale so the weights sum to 1 again and form a probability distribution
weights = weights / weights.sum()
print("Normalized Weights:", weights)

# Draw five rows with replacement; each row's chance of being picked is its
# weight, and the fixed random_state keeps the draw repeatable
weighted_sample = df_small.sample(
    n=5,
    replace=True,
    weights=weights,
    random_state=42,
)

# Show the source rows and the resampled rows one after the other
print("Original Subset:", df_small, "\nWeighted Sample:", weighted_sample, sep="\n")

Initial Weights: [0.2 0.2 0.2 0.2 0.2]
Weights after misclassification adjustment: [0.2 0.2 0.2 0.2 0.4]
Normalized Weights: [0.16666667 0.16666667 0.16666667 0.16666667 0.33333333]
Original Subset:
   Pclass     Sex   Age  Survived
0       3    male  22.0         0
1       1  female  38.0         1
2       3  female  26.0         1
3       1  female  35.0         1
4       3    male  35.0         0

Weighted Sample:
   Pclass     Sex   Age  Survived
2       3  female  26.0         1
4       3    male  35.0         0
4       3    male  35.0         0
3       1  female  35.0         1
0       3    male  22.0         0


In [None]:
# Simulate two iterations of weight updates with four instances.
# One seeded generator is created outside the loop so the random row selection
# is reproducible across runs while still differing between the two iterations.
rng = np.random.default_rng(42)

for iteration in range(2):
    # Randomly select four distinct instances from the original subset
    selected_indices = rng.choice(df_small.index.to_numpy(), size=4, replace=False)
    selected_instances = df_small.loc[selected_indices]

    # Assign uniform starting weights to the selected instances
    new_weights = np.array([1/4] * 4)

    # Simulate misclassification for the selected instances.
    # The mask is positional: it marks the 2nd and 4th selected rows.
    misclassified = [False, True, False, True]  # Example misclassification
    new_weights[misclassified] *= 2  # Increase weight of misclassified
    print(f"\nIteration {iteration + 1} Weights before normalization:", new_weights)

    new_weights /= new_weights.sum()  # Normalize so the weights sum to 1
    print(f"Iteration {iteration + 1} Normalized Weights:", new_weights)

    # Sample with new weights (ndarray weights are applied row by row, in order)
    weighted_sample_iter = selected_instances.sample(n=4, replace=True, weights=new_weights, random_state=42)
    print(f"Weighted Sample for Iteration {iteration + 1}:")
    print(weighted_sample_iter)

    # Weighting misclassified instances improves model performance by continuously focusing on harder-to-classify instances.
    # This iterative process allows the model to adapt and improve its accuracy over time.
    # Note: this demo resets the weights to uniform at the start of every iteration;
    # it illustrates one re-weighting step twice rather than carrying weights forward.


Iteration 1 Weights before normalization: [0.25 0.5  0.25 0.5 ]
Iteration 1 Normalized Weights: [0.16666667 0.33333333 0.16666667 0.33333333]
Weighted Sample for Iteration 1:
   Pclass     Sex   Age  Survived
0       3    male  22.0         0
1       1  female  38.0         1
1       1  female  38.0         1
2       3  female  26.0         1

Iteration 2 Weights before normalization: [0.25 0.5  0.25 0.5 ]
Iteration 2 Normalized Weights: [0.16666667 0.33333333 0.16666667 0.33333333]
Weighted Sample for Iteration 2:
   Pclass     Sex   Age  Survived
1       1  female  38.0         1
0       3    male  22.0         0
0       3    male  22.0         0
4       3    male  35.0         0


Imagine a dataset with three passengers: A, B, and C, each with weight 1/3. The first weak learner correctly predicts A and B but misclassifies C. After the weights are updated and renormalized so they still sum to 1, C’s weight increases to 1/2, while A’s and B’s weights each drop to 1/4. The next learner therefore focuses on C, correcting its prediction but perhaps misclassifying A. This process continues, with the final model combining all predictions, weighted by each learner’s accuracy.

### AdaBoost Classifier

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.metrics import accuracy_score

# Fetch the Titanic data, keep the modelling columns, fill missing ages with
# the median age, and encode Sex as 0 (male) / 1 (female)
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
df = (
    pd.read_csv(url)
    .loc[:, ['Pclass', 'Sex', 'Age', 'Fare', 'Survived']]
    .assign(
        Age=lambda frame: frame['Age'].fillna(frame['Age'].median()),
        Sex=lambda frame: frame['Sex'].map({'male': 0, 'female': 1}),
    )
)

# Separate the target from the features and hold out 20% of the rows for testing
y = df['Survived']
X = df.drop(columns='Survived')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Baseline: a single depth-1 tree (decision stump)
dt = DecisionTreeClassifier(max_depth=1, random_state=42).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
stump_accuracy = accuracy_score(y_test, y_pred_dt)
print(f"Single Decision Stump Accuracy: {stump_accuracy:.2f}")

Single Decision Stump Accuracy: 0.78


In [16]:
# Boost 50 rounds using the decision stump `dt` as the base estimator
ada = AdaBoostClassifier(
    estimator=dt,
    n_estimators=50,
    random_state=42,
).fit(X_train, y_train)

# Score the boosted ensemble on the held-out rows
y_pred_ada = ada.predict(X_test)
ada_accuracy = accuracy_score(y_test, y_pred_ada)
print(f"AdaBoost Accuracy: {ada_accuracy:.2f}")

AdaBoost Accuracy: 0.80


In [17]:
# AdaBoost with Logistic Regression (instead of a decision stump) as the base estimator
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(max_iter=1000, random_state=42)

# Same boosting setup as before: 50 rounds, fixed seed
ada_log_reg = AdaBoostClassifier(
    estimator=log_reg,
    n_estimators=50,
    random_state=42,
).fit(X_train, y_train)

y_pred_ada_log_reg = ada_log_reg.predict(X_test)
ada_log_reg_accuracy = accuracy_score(y_test, y_pred_ada_log_reg)
print(f"AdaBoost with Logistic Regression Accuracy: {ada_log_reg_accuracy:.2f}")

AdaBoost with Logistic Regression Accuracy: 0.79


In [19]:
# Extend the feature set with SibSp and Parch

# Re-read the CSV so the two extra columns are available, then apply the same
# cleaning as before (median age, 0/1 Sex) and fill missing SibSp/Parch with 0
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
df = (
    pd.read_csv(url)
    .loc[:, ['Pclass', 'Sex', 'Age', 'Fare', 'SibSp', 'Parch', 'Survived']]
    .assign(
        Age=lambda frame: frame['Age'].fillna(frame['Age'].median()),
        SibSp=lambda frame: frame['SibSp'].fillna(0),
        Parch=lambda frame: frame['Parch'].fillna(0),
        Sex=lambda frame: frame['Sex'].map({'male': 0, 'female': 1}),
    )
)

# Rebuild the train/test split on the wider feature matrix
y = df['Survived']
X = df.drop(columns='Survived')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Refit AdaBoost, again using the stump `dt` from the earlier cell as base estimator
ada = AdaBoostClassifier(
    estimator=dt,
    n_estimators=50,
    random_state=42,
).fit(X_train, y_train)
y_pred_ada = ada.predict(X_test)
ada_extra_accuracy = accuracy_score(y_test, y_pred_ada)
print(f"AdaBoost with Additional Features Accuracy: {ada_extra_accuracy:.2f}")

AdaBoost with Additional Features Accuracy: 0.80


In [11]:
# Comparison model: bag 50 depth-1 trees instead of boosting them
stump = DecisionTreeClassifier(max_depth=1)
bagging = BaggingClassifier(
    estimator=stump,
    n_estimators=50,
    random_state=42,
).fit(X_train, y_train)

y_pred_bagging = bagging.predict(X_test)
bagging_accuracy = accuracy_score(y_test, y_pred_bagging)
print(f"Bagging Accuracy: {bagging_accuracy:.2f}")

Bagging Accuracy: 0.78


In [20]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

# 5-fold cross-validation of the AdaBoost model on the full feature matrix;
# report the mean score and a +/- band of two standard deviations
cv_scores_ada = cross_val_score(ada, X, y, cv=5)
cv_mean = cv_scores_ada.mean()
cv_band = cv_scores_ada.std() * 2
print(f"AdaBoost CV Accuracy: {cv_mean:.2f} (+/- {cv_band:.2f})")

# Per-class precision / recall / F1 for the held-out predictions
print(
    "\nAdaBoost Classification Report:",
    classification_report(y_test, y_pred_ada),
    sep="\n",
)

AdaBoost CV Accuracy: 0.80 (+/- 0.06)

AdaBoost Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.87      0.84       105
           1       0.79      0.72      0.75        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.80       179
weighted avg       0.80      0.80      0.80       179



In [21]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit a gradient-boosted ensemble (100 estimators, learning rate 0.1) on the
# same training split and score it on the held-out rows
gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
).fit(X_train, y_train)

y_pred_gb = gb.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred_gb)
print(f"Gradient Boosting Accuracy: {gb_accuracy:.2f}")

Gradient Boosting Accuracy: 0.80


In [22]:
# List the fitted Gradient Boosting model's importance score for each feature
feature_names = X.columns
importances = gb.feature_importances_

print("\nGradient Boosting Feature Importance:")
# Pair every column name with its score, in the column order of X
for name, importance in pd.Series(importances, index=feature_names).items():
    print(f"{name}: {importance:.4f}")


Gradient Boosting Feature Importance:
Pclass: 0.1563
Sex: 0.4779
Age: 0.1326
Fare: 0.1722
SibSp: 0.0539
Parch: 0.0071
