# Week9 - Ensemble Assignment

- create a training and test set with random_state = 3
- create a pipeline to extract new features
- try bagging & boosting algorithms


In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

names = 'https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/names/us_names.csv'

df = pd.read_csv(names)
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27999 entries, 0 to 27999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    27999 non-null  object
 1   gender  27999 non-null  object
dtypes: object(2)
memory usage: 656.2+ KB


## Divide into X (name) and y (gender) column

In [2]:
X = df[['name']]
y = df.gender

## Train and test split with random state of 3

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 3)

In [4]:
print(f'Training samples: {X_train.shape[0]:,}')
print(f'Test samples: {X_test.shape[0]:,}')

Training samples: 22,399
Test samples: 5,600


## Creating new features for the above data

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline

    
class MyFeatures(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
    
    def fit( self, X, y = None ):
        return self
    
    def transform(self, X, y=None):
    
        X['first_letter'] = [name[0] for name in X['name']]
        X['last_letter'] = [name[-1] for name in X['name']]
    
        return X[['first_letter', 'last_letter']].values

pipe = Pipeline([ 
    ("feature_engineering", MyFeatures()),
    # ("selector_new", FeatureSelector(["daily_trend"])),
    ("ohe", OneHotEncoder(handle_unknown="ignore"))
  
    # model can be created
])


Xmatrix = pipe.fit_transform(X,y)
Xmatrix

<27999x52 sparse matrix of type '<class 'numpy.float64'>'
	with 55998 stored elements in Compressed Sparse Row format>

In [6]:
pipe[1].get_feature_names_out()

array(['x0_a', 'x0_b', 'x0_c', 'x0_d', 'x0_e', 'x0_f', 'x0_g', 'x0_h',
       'x0_i', 'x0_j', 'x0_k', 'x0_l', 'x0_m', 'x0_n', 'x0_o', 'x0_p',
       'x0_q', 'x0_r', 'x0_s', 'x0_t', 'x0_u', 'x0_v', 'x0_w', 'x0_x',
       'x0_y', 'x0_z', 'x1_a', 'x1_b', 'x1_c', 'x1_d', 'x1_e', 'x1_f',
       'x1_g', 'x1_h', 'x1_i', 'x1_j', 'x1_k', 'x1_l', 'x1_m', 'x1_n',
       'x1_o', 'x1_p', 'x1_q', 'x1_r', 'x1_s', 'x1_t', 'x1_u', 'x1_v',
       'x1_w', 'x1_x', 'x1_y', 'x1_z'], dtype=object)

In [7]:
pd.DataFrame(Xmatrix.toarray(), columns=pipe[1].get_feature_names_out(), dtype=int)

Unnamed: 0,x0_a,x0_b,x0_c,x0_d,x0_e,x0_f,x0_g,x0_h,x0_i,x0_j,...,x1_q,x1_r,x1_s,x1_t,x1_u,x1_v,x1_w,x1_x,x1_y,x1_z
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27994,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27995,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
27996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27997,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# BAGGING - DECISION TREE CLASSIFIER

In [8]:
from sklearn.ensemble import BaggingClassifier
# from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
# from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer

In [9]:
pipe = Pipeline([ 
    ("feature_engineering", MyFeatures()),
    # ("selector_new", FeatureSelector(["daily_trend"])),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ("bagging", BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10, random_state=42))
])

In [10]:
pipe.fit(X_train,y_train)

## EVALUATION OF TRAINING DATA

In [11]:
y_train_pred = pipe.predict(X_train)

In [12]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_train, y_train_pred))

[[8521 2755]
 [2860 8263]]


In [13]:
from sklearn.metrics import accuracy_score

dt_train_accuracy = accuracy_score(y_train,y_train_pred)

print("Accuracy of bagging decision tree is: ",dt_train_accuracy*100)

Accuracy of bagging decision tree is:  74.9319166034198


In [14]:
from sklearn.metrics import classification_report
report = classification_report(y_train, y_train_pred)
print(report)

              precision    recall  f1-score   support

           F       0.75      0.76      0.75     11276
           M       0.75      0.74      0.75     11123

    accuracy                           0.75     22399
   macro avg       0.75      0.75      0.75     22399
weighted avg       0.75      0.75      0.75     22399



## EVALUATION OF TESTING DATA

In [15]:
y_test_pred = pipe.predict(X_test)

In [16]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_test_pred))

[[2051  672]
 [ 745 2132]]


In [17]:
from sklearn.metrics import accuracy_score

dt_test_accuracy = accuracy_score(y_test,y_test_pred)

print("Accuracy of bagging decision tree is: ",dt_test_accuracy*100)

Accuracy of bagging decision tree is:  74.69642857142857


In [18]:
from sklearn.metrics import classification_report
report = classification_report(y_test, y_test_pred)
print(report)

              precision    recall  f1-score   support

           F       0.73      0.75      0.74      2723
           M       0.76      0.74      0.75      2877

    accuracy                           0.75      5600
   macro avg       0.75      0.75      0.75      5600
weighted avg       0.75      0.75      0.75      5600



# ADABOOST - DECISION TREE CLASSIFIER

In [19]:
from sklearn.ensemble import AdaBoostClassifier

pipe = Pipeline([ 
    ("feature_engineering", MyFeatures()),
    # ("selector_new", FeatureSelector(["daily_trend"])),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ("adaboost", AdaBoostClassifier(estimator=DecisionTreeClassifier(), n_estimators=500, learning_rate=0.1, random_state=1))
])

ada = pipe.fit(X_train, y_train)
ada_train_predict = ada.predict(X_train)
ada_test_predict = ada.predict(X_test)

ada_train_score = accuracy_score(y_train, ada_train_predict)
ada_test_score = accuracy_score(y_test, ada_test_predict)

print(f'AdaBoost Training Score: {ada_train_score:.2%}')
print(f'AdaBoost Test Score: {ada_test_score:.2%}')

AdaBoost Training Score: 74.98%
AdaBoost Test Score: 74.93%


In [20]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_train, ada_train_predict))

[[8451 2825]
 [2780 8343]]


In [21]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, ada_test_predict))

[[2040  683]
 [ 721 2156]]


In [22]:
from sklearn.metrics import classification_report
report = classification_report(y_train, ada_train_predict)
print(report)

              precision    recall  f1-score   support

           F       0.75      0.75      0.75     11276
           M       0.75      0.75      0.75     11123

    accuracy                           0.75     22399
   macro avg       0.75      0.75      0.75     22399
weighted avg       0.75      0.75      0.75     22399



In [23]:
from sklearn.metrics import classification_report
report = classification_report(y_test, ada_test_predict)
print(report)

              precision    recall  f1-score   support

           F       0.74      0.75      0.74      2723
           M       0.76      0.75      0.75      2877

    accuracy                           0.75      5600
   macro avg       0.75      0.75      0.75      5600
weighted avg       0.75      0.75      0.75      5600



# BAGGING - LOGISTIC REGRESSION

In [24]:
from sklearn.linear_model import LogisticRegression

pipe_lr = Pipeline([ 
    ("feature_engineering", MyFeatures()),
    # ("selector_new", FeatureSelector(["daily_trend"])),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ("bagging_lr", BaggingClassifier(estimator=LogisticRegression(), n_estimators=10, random_state=42))
])

In [25]:
pipe_lr.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## EVALUATION OF TRAINING DATA

In [26]:
y_train_p = pipe_lr.predict(X_train)

In [27]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_train, y_train_p))

[[8434 2842]
 [2915 8208]]


In [28]:
from sklearn.metrics import accuracy_score

lr_train_accuracy = accuracy_score(y_train,y_train_p)

print("Accuracy of bagging logistic regression is: ",lr_train_accuracy*100)

Accuracy of bagging logistic regression is:  74.29795973034511


In [29]:
from sklearn.metrics import classification_report
report = classification_report(y_train, y_train_p)
print(report)

              precision    recall  f1-score   support

           F       0.74      0.75      0.75     11276
           M       0.74      0.74      0.74     11123

    accuracy                           0.74     22399
   macro avg       0.74      0.74      0.74     22399
weighted avg       0.74      0.74      0.74     22399



## EVALUATION OF TESTING DATA

In [30]:
y_test_p = pipe_lr.predict(X_test)

In [31]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_test_p))

[[2048  675]
 [ 743 2134]]


In [32]:
from sklearn.metrics import accuracy_score

lr_test_accuracy = accuracy_score(y_test,y_test_p)

print("Accuracy of bagging logistic regression is: ",lr_test_accuracy*100)

Accuracy of bagging logistic regression is:  74.67857142857143


In [33]:
from sklearn.metrics import classification_report
report = classification_report(y_test, y_test_p)
print(report)

              precision    recall  f1-score   support

           F       0.73      0.75      0.74      2723
           M       0.76      0.74      0.75      2877

    accuracy                           0.75      5600
   macro avg       0.75      0.75      0.75      5600
weighted avg       0.75      0.75      0.75      5600



# ADABOOST - LOGISTIC REGRESSION

In [34]:
from sklearn.ensemble import AdaBoostClassifier

pipe_lra = Pipeline([ 
    ("feature_engineering", MyFeatures()),
    # ("selector_new", FeatureSelector(["daily_trend"])),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ("adaboost", AdaBoostClassifier(estimator=LogisticRegression(), n_estimators=500, learning_rate=0.1, random_state=1))
])

ada = pipe_lra.fit(X_train, y_train)
ada_train_predict = ada.predict(X_train)
ada_test_predict = ada.predict(X_test)

ada_train_score_lr = accuracy_score(y_train, ada_train_predict)
ada_test_score_lr = accuracy_score(y_test, ada_test_predict)

print(f'AdaBoost Training Score: {ada_train_score_lr:.2%}')
print(f'AdaBoost Test Score: {ada_test_score_lr:.2%}')

AdaBoost Training Score: 73.45%
AdaBoost Test Score: 74.12%


# GRID SEARCH CV - LOGISTIC REGRESSION

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

p1 = Pipeline([("feature_engineering", MyFeatures()),
    # ("selector_new", FeatureSelector(["daily_trend"])),
               ("ohe", OneHotEncoder(handle_unknown="ignore")),
               ('lr', LogisticRegression(solver='liblinear'))
])

params = {'lr__C': [0.01, 0.1, 1, 10]}

lr_gscv = GridSearchCV(p1, param_grid=params, cv=10, scoring='accuracy', refit=True)
lr_gscv = lr_gscv.fit(X_train, y_train)

print(f'Validation score: {lr_gscv.best_score_:.2%}')

lr_pred = lr_gscv.predict(X_test)

print(f'Test score: {lr_gscv.score(X_test, y_test):.2%}')

Validation score: 74.24%
Test score: 74.68%


# GRID SEARCH CV - DECISION TREE

In [36]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

p2 = Pipeline([("feature_engineering", MyFeatures()),
    # ("selector_new", FeatureSelector(["daily_trend"])),
            ("ohe", OneHotEncoder(handle_unknown="ignore")),
             ('dt', DecisionTreeClassifier())])

params = {'dt__max_depth': [1, 5, 10, 15, 25],
         'dt__min_samples_split': [3, 10, 15]}

dt_gscv = GridSearchCV(p2, param_grid=params, cv=10, scoring='accuracy', refit=True)
dt_gscv = dt_gscv.fit(X_train, y_train)

print(f'Validation score: {dt_gscv.best_score_:.2%}')

dt_pred = dt_gscv.predict(X_test)

print(f'Test score: {dt_gscv.score(X_test, y_test):.2%}')

Validation score: 74.36%
Test score: 74.98%


# GRADIENT BOOSTING CLASSIFIER

In [37]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

p3 = Pipeline([("feature_engineering", MyFeatures()),
    # ("selector_new", FeatureSelector(["daily_trend"])),
            ("ohe", OneHotEncoder(handle_unknown="ignore")),
             ('gb', GradientBoostingClassifier())])

params = {'gb__max_depth': [1,2,3],
          'gb__n_estimators': [50, 100, 200]
         }

gb_gscv = GridSearchCV(p3, param_grid = params, cv=10, scoring='accuracy')
           
gb_gscv = gb_gscv.fit(X_train, y_train)

print(f'Validation score: {gb_gscv.best_score_:.2%}')

gb_pred = gb_gscv.predict(X_test)

print(f'Test score: {gb_gscv.score(X_test, y_test):.2%}')

Validation score: 74.27%
Test score: 74.80%


# COMPARISON OF MODELS OF TRAINING DATA

In [38]:
# Compare Several models according to their Accuracies
print('Accuracy of all models for the training dataset')

Model_Comparison_train = pd.DataFrame({
    'Model': ['Bagging Decision Tree','Boosting Decision Tree','Bagging Logistic Regression','Boosting Logistic Regression'],
    'Score': [dt_train_accuracy, ada_train_score, lr_train_accuracy,ada_train_score_lr]})

Model_Comparison_df = Model_Comparison_train.sort_values(by='Score', ascending=False)
Model_Comparison_df = Model_Comparison_df.set_index('Score')
Model_Comparison_df.reset_index()

Accuracy of all models for the training dataset


Unnamed: 0,Score,Model
0,0.749766,Boosting Decision Tree
1,0.749319,Bagging Decision Tree
2,0.74298,Bagging Logistic Regression
3,0.734497,Boosting Logistic Regression


# COMPARISON OF MODELS OF TESTING DATA

In [39]:
# Compare Several models according to their Accuracies
print('Accuracy of all models for the testing dataset')

Model_Comparison_test = pd.DataFrame({
    'Model': ['Bagging Decision Tree','Boosting Decision Tree','Bagging Logistic Regression','Boosting Logistic Regression'],
    'Score': [dt_test_accuracy, ada_test_score, lr_test_accuracy,ada_test_score_lr]})

Model_Comparison_df = Model_Comparison_test.sort_values(by='Score', ascending=False)
Model_Comparison_df = Model_Comparison_df.set_index('Score')
Model_Comparison_df.reset_index()

Accuracy of all models for the testing dataset


Unnamed: 0,Score,Model
0,0.749286,Boosting Decision Tree
1,0.746964,Bagging Decision Tree
2,0.746786,Bagging Logistic Regression
3,0.74125,Boosting Logistic Regression


**I have implemented Bagging and Boosting classifiers for Decision Tree and Logistic Regression. I have also implemented Grid Search CV for logistic regression and decision tree. Gradient Boosting Classifier is also implemented. The accuracies of logistic regression and decision tree 75 both respectively.**