<a href="https://colab.research.google.com/github/sapan-s2/AI_Austin_course/blob/main/Baging_Boosting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor,RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, StackingRegressor
from xgboost import XGBRegressor
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, train_test_split


In [None]:
# Read the cars CSV from the /content directory into a DataFrame
data = pd.read_csv(filepath_or_buffer="/content/Cars-dataset.csv")

In [None]:
# Preview the first rows of the dataset
data.head()

In [None]:
# Preview the last rows of the dataset
data.tail()

In [None]:
# Print a structural summary of the DataFrame (columns and their dtypes)
data.info()

In [None]:
# Count the missing values in each column
data.isna().sum()


In [None]:
# Count the distinct values in each column
data.nunique()

In [None]:
# Correlation heatmap of the numeric columns.
# numeric_only=True keeps non-numeric columns (such as Gender, which is only
# encoded in a later cell) out of the computation; pandas >= 2.0 raises on
# them instead of silently dropping them.
sns.set(rc={'figure.figsize':(16,10)})
sns.heatmap(data.corr(numeric_only=True),
            annot=True,
            linewidths=.5,
            center=0,
            cbar=False,
            cmap="Spectral")
plt.show()

In [None]:
# prompt: create dummies for gender col

# Cast Gender to a categorical dtype, then expand it into indicator columns
data = pd.get_dummies(data.astype({'Gender': 'category'}), columns=['Gender'])


In [None]:
# prompt: Split the data into a 70:30 ratio

# Pull out the target column, keep everything else as predictors,
# then hold out 30% of the rows for testing (seeded for repeatability)
y = data['Opt_service']
X = data.drop(columns=['Opt_service'])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)


In [None]:
# prompt: What is the percentage of 0 and 1 classes in the test data (y_test)?

# Per-class row counts in y_test, divided by its length and scaled to percent
print(y_test.value_counts().div(len(y_test)).mul(100))


In [None]:
# Same class breakdown of y_test, shown as fractions of its length rather than percentages
y_test.value_counts()/len(y_test)

In [None]:
# prompt: Build a bagging classifier with default parameters

# Bagging ensemble with library defaults; only the seed is pinned
bagging_classifier = BaggingClassifier(random_state=1)
# Train on the training split
bagging_classifier.fit(X=X_train, y=y_train)


In [None]:
# Compute accuracy, recall, precision and F1 for a fitted sklearn classifier
def model_performance_classification_sklearn(model, predictors, target):
    """
    Compute accuracy, recall, precision and F1 for a classifier's predictions.

    The scorers are reached through the ``metrics`` module imported in the
    notebook's first cell, so this function does not depend on a later import
    cell having been run before it is called.

    model: fitted classifier exposing ``predict``
    predictors: independent variables
    target: dependent variable (true labels)

    Returns a one-row DataFrame with the columns
    ``Accuracy``, ``Recall``, ``Precision`` and ``F1``.
    """

    # predicting using the independent variables
    pred = model.predict(predictors)

    # dict insertion order fixes the column order of the resulting frame
    scores = {
        "Accuracy": metrics.accuracy_score(target, pred),
        "Recall": metrics.recall_score(target, pred),
        "Precision": metrics.precision_score(target, pred),
        "F1": metrics.f1_score(target, pred),
    }

    return pd.DataFrame(scores, index=[0])

In [None]:
def confusion_matrix_sklearn(model, predictors, target):
    """
    Plot the confusion matrix as a heatmap annotated with counts and percentages.

    Each cell shows the raw count and, on a second line, that count as a
    percentage of all predictions. The annotation grid takes its shape from
    the confusion matrix itself, so the plot is not limited to two classes.

    model: fitted classifier exposing ``predict``
    predictors: independent variables
    target: dependent variable (true labels)

    Returns nothing; draws onto a new matplotlib figure.
    """
    y_pred = model.predict(predictors)
    # use the module-level `metrics` import so no later import cell is required
    cm = metrics.confusion_matrix(target, y_pred)

    # grand total computed once rather than once per cell
    total = cm.sum()
    labels = np.asarray(
        [
            "{0:0.0f}".format(count) + "\n{0:.2%}".format(count / total)
            for count in cm.flatten()
        ]
    ).reshape(cm.shape)

    plt.figure(figsize=(6, 4))
    # fmt="" because the annotations are already formatted strings
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")


In [None]:


# Libraries to tune the model and compute different metric scores
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score,f1_score,roc_auc_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Score the default bagging classifier on the training and test splits
bagging_classifier_model_train_perf = model_performance_classification_sklearn(
    model=bagging_classifier, predictors=X_train, target=y_train
)
bagging_classifier_model_test_perf = model_performance_classification_sklearn(
    model=bagging_classifier, predictors=X_test, target=y_test
)
print("Training performance:\n", bagging_classifier_model_train_perf)
print("Testing performance:\n", bagging_classifier_model_test_perf)

# Draw the confusion matrix for the test split
confusion_matrix_sklearn(model=bagging_classifier, predictors=X_test, target=y_test)

In [None]:
# prompt: How many employees who would take the service are correctly identified by the model from the training data

# Recall column of the training-performance table computed for the bagging classifier.
# NOTE(review): this is the recall score, not a head count - the prompt asks
# "how many", so confirm whether a count is what is actually wanted.
bagging_classifier_model_train_perf['Recall']


In [None]:
from sklearn.ensemble import RandomForestClassifier


In [None]:
# prompt: import logisticregression

from sklearn.linear_model import LogisticRegression


In [None]:
# prompt: Build a random forest classifier with default parameters and a bagging classifier with logistic regression as the base estimator

# Random forest with library defaults; only the seed is pinned
random_forest_classifier = RandomForestClassifier(random_state=1)

# The base learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4; the positional form works on both sides of the rename.
bagging_classifier_l = BaggingClassifier(LogisticRegression(), random_state=1)



In [None]:
# Train the logistic-regression bagging ensemble on the training split
bagging_classifier_l.fit(X_train,y_train)

In [None]:

# Train the random forest on the training split
random_forest_classifier.fit(X_train,y_train)

In [None]:
# Training-split metrics for the logistic-regression bagging ensemble
bagging_classifier_model_train_perf_l = model_performance_classification_sklearn(
    model=bagging_classifier_l, predictors=X_train, target=y_train
)
print("Training performance:\n", bagging_classifier_model_train_perf_l)

# Test-split evaluation is currently disabled:
# bagging_classifier_model_test_perf_l=model_performance_classification_sklearn(bagging_classifier_l,X_test,y_test)
# print("Testing performance:\n",bagging_classifier_model_test_perf_l)
# confusion_matrix_sklearn(bagging_classifier_l, X_test, y_test)

In [None]:
# Training-split metrics for the random forest
random_forest_classifier_model_train_perf_l = model_performance_classification_sklearn(
    model=random_forest_classifier, predictors=X_train, target=y_train
)
print("Training performance:\n", random_forest_classifier_model_train_perf_l)

# Test-split evaluation is currently disabled:
# random_forest_classifier_model_test_perf_l=model_performance_classification_sklearn(random_forest_classifier,X_test,y_test)
# print("Testing performance:\n",random_forest_classifier_model_test_perf_l)
# confusion_matrix_sklearn(random_forest_classifier, X_test, y_test)

In [None]:
# prompt: bagging classifier model with the base estimator as a decision tree.Vary the depth of the base estimator/Decision tree from depth 1 to 5

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Bagging ensemble over decision trees. The base learner is passed positionally
# because its keyword was renamed from `base_estimator` to `estimator` in
# scikit-learn 1.2 (old name removed in 1.4). Both the tree and the ensemble
# are seeded so the search gives the same answer on every run.
bagging_classifier_d = BaggingClassifier(DecisionTreeClassifier(random_state=1), random_state=1)

# The nested-parameter prefix follows the same rename, so use whichever name
# this scikit-learn version exposes on the ensemble.
base_param = 'estimator' if 'estimator' in bagging_classifier_d.get_params(deep=False) else 'base_estimator'

# Grid search over tree depths 1..5, selecting on F1
grid_search = GridSearchCV(
    bagging_classifier_d,
    param_grid={f'{base_param}__max_depth': range(1, 6)},
    scoring='f1',
)

# Fit the grid search object to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters found by the grid search
print(grid_search.best_params_)

# Print the accuracy score of the best model on the test data
best_model = grid_search.best_estimator_
accuracy = accuracy_score(y_test, best_model.predict(X_test))
print(accuracy)


In [None]:
# Training-set F1 of a bagged decision tree for each maximum depth from 1 to 5
scores = []
for i in range(1, 6):
    # Base learner passed positionally: the `base_estimator` keyword was renamed
    # to `estimator` in scikit-learn 1.2 and removed in 1.4.
    bag = BaggingClassifier(DecisionTreeClassifier(random_state=1, max_depth=i), random_state=1)
    bag.fit(X_train, y_train)
    pred = bag.predict(X_train)
    case = {'Depth': i, 'F1 Score': f1_score(y_train, pred)}
    scores.append(case)
print(scores)

In [None]:
# prompt: import XGBoostClassifier

import xgboost as xgb
from xgboost import XGBClassifier


In [None]:
# prompt: Which of the boosting models (AdaBoost, GradientBoost, XGBoost) with default parameters has the lowest f1-score on the training set. Set the eval_metric = 'logloss' for XGBoostClassifier

from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score
from xgboost import XGBClassifier

# Create models (defaults apart from the seed)
ada_boost = AdaBoostClassifier(random_state=1)
grad_boost = GradientBoostingClassifier(random_state=1)
# A classifier, not a regressor: the target is a class label and the
# comparison is on F1, so predict() must return labels.
xgb_boost = XGBClassifier(random_state=1, eval_metric='logloss')

# Fit models
ada_boost.fit(X_train, y_train)
grad_boost.fit(X_train, y_train)
xgb_boost.fit(X_train, y_train)

# Evaluate models on the training set
ada_boost_f1 = f1_score(y_train, ada_boost.predict(X_train))
print("ada_boost_f1", ada_boost_f1)
grad_boost_f1 = f1_score(y_train, grad_boost.predict(X_train))
print("grad_boost_f1", grad_boost_f1)
xgb_boost_f1 = f1_score(y_train, xgb_boost.predict(X_train))
print("xgb_boost_f1", xgb_boost_f1)

# Find the model with the lowest f1-score; all three models take part and the
# result is the model's name rather than just its score
training_f1 = {
    'AdaBoost': ada_boost_f1,
    'GradientBoost': grad_boost_f1,
    'XGBoost': xgb_boost_f1,
}
lowest_f1_model = min(training_f1, key=training_f1.get)

# Print model with lowest f1-score
print(f'Model with lowest f1-score on training set: {lowest_f1_model} ({training_f1[lowest_f1_model]})')


In [None]:
# prompt: f1 score for xgboost

# Round the predictions before scoring, as the other XGBoost scoring cells do:
# f1_score rejects continuous predictions, and rounding leaves predictions
# that are already class labels unchanged.
xgb_boost_f1 = f1_score(y_train, xgb_boost.predict(X_train).round())


In [None]:
# prompt: Use the gradient boosting classifier trained in Q6  and plot the feature importance of the variable

# Importances come from the fitted gradient boosting classifier (`grad_boost`),
# which is the model the task names, rather than from the XGBoost model.
feature_names = X_train.columns
importances = grad_boost.feature_importances_
# ascending order, so the most important feature ends up at the top of the chart
indices = np.argsort(importances)

plt.figure(figsize=(12,12))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='violet', align='center')
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()


In [None]:
# prompt: Train three models:  1. Model1 = Gradient Boosting classifier with n_estimator = 50 and learning rate = 0.01 2.  Model2 = Gradient Boosting classifier with n_estimator = 100 and learning rate = 0.01 3. Model3 = Gradient Boosting classifier with n_estimator = 400 and learning rate = 0.01  f1_score1, f1_score2, f1_score3 are f1_scores of the three models respectively.  order of f1_score on the training set

# Three gradient boosting models that differ only in the number of estimators
model1 = GradientBoostingClassifier(n_estimators=50, learning_rate=0.01)
model1.fit(X_train, y_train)
f1_score1 = f1_score(y_train, model1.predict(X_train))

model2 = GradientBoostingClassifier(n_estimators=100, learning_rate=0.01)
model2.fit(X_train, y_train)
f1_score2 = f1_score(y_train, model2.predict(X_train))

model3 = GradientBoostingClassifier(n_estimators=400, learning_rate=0.01)
model3.fit(X_train, y_train)
f1_score3 = f1_score(y_train, model3.predict(X_train))

print(f"f1_score1: {f1_score1}")
print(f"f1_score2: {f1_score2}")
print(f"f1_score3: {f1_score3}")

# Rank the scores from highest to lowest (sorted() is stable, so tied scores
# keep their model order)
f1_by_name = {"f1_score1": f1_score1, "f1_score2": f1_score2, "f1_score3": f1_score3}
ranked = sorted(f1_by_name, key=f1_by_name.get, reverse=True)
print("Order (highest to lowest): " + " >= ".join(ranked))

# Report the top score; ties are listed explicitly instead of falling through
# to a default branch that would name the wrong model
top_score = f1_by_name[ranked[0]]
leaders = [name for name in ranked if f1_by_name[name] == top_score]
if len(leaders) == 1:
    print(f"{leaders[0]} is the highest")
else:
    print(f"{' and '.join(leaders)} are tied for the highest")


In [None]:
# Scratch check: compares two float literals
.98 > 0.0

In [None]:
# prompt: What is the order of f1_score on the training set in above q?

# Training-set F1 for each boosting model (the XGBoost predictions are rounded
# to labels before scoring)
xgb_boost_f1 = f1_score(y_train, xgb_boost.predict(X_train).round())
grad_boost_f1 = f1_score(y_train, grad_boost.predict(X_train))
ada_boost_f1 = f1_score(y_train, ada_boost.predict(X_train))

boosting_f1 = {
    'XGBoost': xgb_boost_f1,
    'Gradient Boosting': grad_boost_f1,
    'AdaBoost': ada_boost_f1,
}

# Print the f1-scores in order from highest to lowest; the rows are sorted by
# score so the output matches its heading
print('f1-scores on training set in order from highest to lowest:')
for name, score in sorted(boosting_f1.items(), key=lambda item: item[1], reverse=True):
    print(f'{name}: {score}')


In [None]:
# prompt: Build a stacking classifier using two models - Decision Tree, Bagging Classifier as base estimators and use Random Forest as the final estimator

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Create a Decision Tree classifier
dt_classifier = DecisionTreeClassifier(random_state=1)

# Create a Bagging Classifier using Decision Tree as the base estimator.
# The base learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4).
bagging_classifier = BaggingClassifier(dt_classifier, random_state=1)

# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=1)

# Create a StackingClassifier using Decision Tree and Bagging Classifier as base estimators and Random Forest as the final estimator
stacking_classifier = StackingClassifier(
    estimators=[('dt_classifier', dt_classifier), ('bagging_classifier', bagging_classifier)],
    final_estimator=rf_classifier,
)

# Split the data into training and test sets.
# NOTE: this rebinds X_train/X_test/y_train/y_test for every later cell; no
# test_size is given, so train_test_split's default split is used.
X_train, X_test, y_train, y_test = train_test_split(X, y,  random_state=1)

# Fit the stacking classifier to the training data
stacking_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = stacking_classifier.predict(X_test)

# Training-set F1 and recall, both computed from a single prediction pass
train_pred = stacking_classifier.predict(X_train)
print(f1_score(y_train, train_pred))
print(recall_score(y_train, train_pred))


In [None]:
# prompt: Stacking classifier f1 score and recall score on training set

# Print F1 first, then recall, each from a fresh prediction on the training split
for scorer in (f1_score, recall_score):
    print(scorer(y_train, stacking_classifier.predict(X_train)))


In [None]:
# prompt: Build a stacking classifier using two models - AdaBoost classifier, and Gradient Boosting Classifier as base estimators and use XGBoost as the final estimator. random_state=1, f1 and recall score on test set

from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, recall_score
from sklearn.model_selection import GridSearchCV

# 80/20 split, seeded
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2)

# Base learners
ada_boost = AdaBoostClassifier(random_state=1)
grad_boost = GradientBoostingClassifier(random_state=1)

# Stack the two boosters under an XGBoost final estimator
stacking_classifier = StackingClassifier(
    final_estimator=XGBClassifier(random_state=1),
    estimators=[('ada_boost', ada_boost), ('grad_boost', grad_boost)],
)

# Train on the training split
stacking_classifier.fit(X_train, y_train)

# Predict the held-out split
y_pred = stacking_classifier.predict(X_test)

# F1 and recall on the test split
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print(f"f1_score: {f1}")
print(f"recall_score: {recall}")
