<a href="https://colab.research.google.com/github/student-monika/Marvel_tasks_Level_2/blob/main/Ensemble_Techniques.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder


In [None]:
# Read the passenger records from the CSV file
data = pd.read_csv('titanic.csv')

# Fill missing values: median for the numeric columns, most frequent value
# for the port of embarkation.
# NOTE(review): these statistics are computed on the full dataset, i.e. before
# the train/test split below, so the test rows contribute to them.
data = data.assign(
    Age=data['Age'].fillna(data['Age'].median()),
    Embarked=data['Embarked'].fillna(data['Embarked'].mode()[0]),
    Fare=data['Fare'].fillna(data['Fare'].median()),
)

# Replace the two categorical columns with integer codes. The same encoder
# object is refitted for each column, so after the loop `le` only holds the
# classes of the last column encoded ('Embarked').
le = LabelEncoder()
for column in ('Sex', 'Embarked'):
    data[column] = le.fit_transform(data[column])

# Model inputs (X) and the target column (y)
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X, y = data[features], data['Survived']

# Hold out 20% of the rows for testing; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Quick look at the preprocessed training features
print(X_train.head())

     Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
331       1    1  45.5      0      0  28.5000         2
733       2    1  23.0      0      0  13.0000         2
382       3    1  32.0      0      0   7.9250         2
704       3    1  26.0      1      0   7.8542         2
813       3    0   6.0      4      2  31.2750         2


In [None]:
# Baseline: a single logistic regression model fitted on the training split.
# max_iter is raised above scikit-learn's default of 100 because the default
# lbfgs solver can stop before converging on unscaled features such as Age and
# Fare (scikit-learn reports this as a ConvergenceWarning).
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# Predict labels for the held-out test split
y_pred = log_reg.predict(X_test)

# Fraction of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic regression Accuracy: {accuracy:.2f}")

Logistic regression Accuracy: 0.81


In [None]:
# Bagging-style ensemble: a random forest of 100 trees with a fixed seed,
# fitted on the training split
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the forest on the held-out test split
y_pred_rf = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)


Random Forest Accuracy: 0.8212290502793296


In [None]:
# Boosting ensemble: 100 gradient-boosted stages with learning rate 0.1 and a
# fixed seed, fitted on the training split
gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
).fit(X_train, y_train)

# Score the boosted model on the held-out test split
y_pred_gb = gb.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred_gb)
print("Gradient Boosting Accuracy:", gb_accuracy)


Gradient Boosting Accuracy: 0.8100558659217877


In [None]:
# Stacking: a random forest and a gradient-boosting model serve as base
# learners, and a logistic regression combines their outputs.

# Base models, each paired with the name StackingClassifier uses to refer to it
estimators = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
]

# Meta-model: logistic regression fitted on the base models' predictions
stacking_model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
stacking_model.fit(X_train, y_train)

# Predictions and accuracy on the held-out test split
y_pred_stack = stacking_model.predict(X_test)
print("Stacking Accuracy:", accuracy_score(y_test, y_pred_stack))


Stacking Accuracy: 0.8268156424581006


In [None]:
# Blending: carve a validation set out of the training split; the base models
# learn from the remainder and the meta-model learns from their validation
# predictions.
X_train_blend, X_val, y_train_blend, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Fit both base models on the reduced training set
model1 = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_blend, y_train_blend
)
model2 = GradientBoostingClassifier(n_estimators=100, random_state=42).fit(
    X_train_blend, y_train_blend
)

# Base-model class predictions on the validation rows become the meta-model's
# two input columns
pred1, pred2 = model1.predict(X_val), model2.predict(X_val)
meta_features = np.column_stack((pred1, pred2))
meta_model = LogisticRegression().fit(meta_features, y_val)

# Build the same two-column input from the test rows and let the meta-model
# produce the final prediction
test_pred1, test_pred2 = model1.predict(X_test), model2.predict(X_test)
test_meta_features = np.column_stack((test_pred1, test_pred2))
y_pred_blend = meta_model.predict(test_meta_features)

print("Blending Accuracy:", accuracy_score(y_test, y_pred_blend))


Blending Accuracy: 0.8156424581005587
