In [24]:
import pandas as pd

In [25]:
# Load the file
file_url = 'https://raw.githubusercontent.com/sedeba19/Chapter-15/main/data_source_15/Chapter15_Dataset_crx.data.txt'

In [26]:
# Create a dataframe
df = pd.read_csv(file_url,
                 sep = ',',
                 header = None,
                 na_values = '?')
df.shape

(690, 16)

In [27]:
# Changing the Classes to 1 & 0
df.loc[df[15] == '+' , 15] = 1
df.loc[df[15] == '-' , 15] = 0
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [28]:
# Find number of null values in the dataset
df.isna().sum()

0     12
1     12
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13    13
14     0
15     0
dtype: int64

In [29]:
# Print Shape and data types
print('Shape of raw data set', df.shape)
print('Data types of data set', df.dtypes)

Shape of raw data set (690, 16)
Data types of data set 0      object
1     float64
2     float64
3      object
4      object
5      object
6      object
7     float64
8      object
9      object
10      int64
11     object
12     object
13    float64
14      int64
15     object
dtype: object


In [30]:
# Drop all the rows with na values
df_clean = df.dropna(axis = 0)

In [31]:
df_clean_cat = df_clean.select_dtypes(include = 'object')
df_clean_cat

Unnamed: 0,0,3,4,5,6,8,9,11,12,15
0,b,u,g,w,v,t,t,f,g,1
1,a,u,g,q,h,t,t,f,g,1
2,a,u,g,q,h,t,f,f,g,1
3,b,u,g,w,v,t,t,t,g,1
4,b,u,g,w,v,t,f,f,s,1
...,...,...,...,...,...,...,...,...,...,...
685,b,y,p,e,h,f,f,f,g,0
686,a,u,g,c,v,f,t,t,g,0
687,a,y,p,ff,ff,f,t,t,g,0
688,b,u,g,aa,v,f,f,f,g,0


In [32]:
df_clean_cat.shape

(653, 10)

In [33]:
df_clean_cat.drop(15, axis = 1, inplace = True)

In [34]:
df_clean_cat

Unnamed: 0,0,3,4,5,6,8,9,11,12
0,b,u,g,w,v,t,t,f,g
1,a,u,g,q,h,t,t,f,g
2,a,u,g,q,h,t,f,f,g
3,b,u,g,w,v,t,t,t,g
4,b,u,g,w,v,t,f,f,s
...,...,...,...,...,...,...,...,...,...
685,b,y,p,e,h,f,f,f,g
686,a,u,g,c,v,f,t,t,g
687,a,y,p,ff,ff,f,t,t,g
688,b,u,g,aa,v,f,f,f,g


In [35]:
df_clean_cat.columns

Int64Index([0, 3, 4, 5, 6, 8, 9, 11, 12], dtype='int64')

In [36]:
# Separate the categorical variables to make dummy variables
new_df_cat = pd.get_dummies(df_clean_cat[df_clean_cat.columns])
new_df_cat.shape

(653, 40)

In [37]:
# Separate the numerical variables
df_num = df_clean.select_dtypes(include = 'number')
df_num

Unnamed: 0,1,2,7,10,13,14
0,30.83,0.000,1.25,1,202.0,0
1,58.67,4.460,3.04,6,43.0,560
2,24.50,0.500,1.50,0,280.0,824
3,27.83,1.540,3.75,5,100.0,3
4,20.17,5.625,1.71,0,120.0,0
...,...,...,...,...,...,...
685,21.08,10.085,1.25,0,260.0,0
686,22.67,0.750,2.00,2,200.0,394
687,25.25,13.500,2.00,1,200.0,1
688,17.92,0.205,0.04,0,280.0,750


In [38]:
# Make a new X variable which is a concatenation of categorical and numerical data
X = pd.concat([new_df_cat,
               df_num],
               axis = 1)

# Make label variable
y = pd.Series(df_clean[15],
              dtype = 'int')

In [39]:
X

Unnamed: 0,0_a,0_b,3_l,3_u,3_y,4_g,4_gg,4_p,5_aa,5_c,...,11_t,12_g,12_p,12_s,1,2,7,10,13,14
0,0,1,0,1,0,1,0,0,0,0,...,0,1,0,0,30.83,0.000,1.25,1,202.0,0
1,1,0,0,1,0,1,0,0,0,0,...,0,1,0,0,58.67,4.460,3.04,6,43.0,560
2,1,0,0,1,0,1,0,0,0,0,...,0,1,0,0,24.50,0.500,1.50,0,280.0,824
3,0,1,0,1,0,1,0,0,0,0,...,1,1,0,0,27.83,1.540,3.75,5,100.0,3
4,0,1,0,1,0,1,0,0,0,0,...,0,0,0,1,20.17,5.625,1.71,0,120.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,0,1,0,0,1,0,0,1,0,0,...,0,1,0,0,21.08,10.085,1.25,0,260.0,0
686,1,0,0,1,0,1,0,0,0,1,...,1,1,0,0,22.67,0.750,2.00,2,200.0,394
687,1,0,0,0,1,0,0,1,0,0,...,1,1,0,0,25.25,13.500,2.00,1,200.0,1
688,0,1,0,1,0,1,0,0,1,0,...,0,1,0,0,17.92,0.205,0.04,0,280.0,750


In [40]:
y

0      1
1      1
2      1
3      1
4      1
      ..
685    0
686    0
687    0
688    0
689    0
Name: 15, Length: 653, dtype: int32

In [41]:
# Normalize the dataset using the MinMaxScaler() function
from sklearn.preprocessing import MinMaxScaler
minmaxScaler = MinMaxScaler()

# Convert dtypes into str
X.columns = X.columns.astype(str)

# Apply the MinMaxScaler()
X_transformed = pd.DataFrame(minmaxScaler.fit_transform(X))
X_transformed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,36,37,38,39,40,41,42,43,44,45
0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.271111,0.000000,0.043860,0.014925,0.1010,0.00000
1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.713016,0.159286,0.106667,0.089552,0.0215,0.00560
2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.170635,0.017857,0.052632,0.000000,0.1400,0.00824
3,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.223492,0.055000,0.131579,0.074627,0.0500,0.00003
4,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.101905,0.200893,0.060000,0.000000,0.0600,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
648,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.116349,0.360179,0.043860,0.000000,0.1300,0.00000
649,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.141587,0.026786,0.070175,0.029851,0.1000,0.00394
650,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.182540,0.482143,0.070175,0.014925,0.1000,0.00001
651,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.066190,0.007321,0.001404,0.000000,0.1400,0.00750


In [42]:
# Split the dataset into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_transformed,
                                                    y,
                                                    test_size= 0.3,
                                                    random_state=123)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((457, 46), (196, 46), (457,), (196,))

Bagging

In [43]:
# Defining the base learner
from sklearn.linear_model import LogisticRegression
bl1 = LogisticRegression(random_state=123)
# Creating the bagging meta learner
from sklearn.ensemble import BaggingClassifier
baggingLearner = BaggingClassifier(base_estimator=bl1,
                                   n_estimators=15,
                                   max_samples=0.7,
                                   max_features=0.8)
# Fitting the model using the meta learner
model = baggingLearner.fit(X_train, y_train)
# Predicting on the test set using the model
pred = model.predict(X_test)

# Printing the confusion matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, pred))

# Printing the classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, pred))



[[95 12]
 [ 8 81]]
              precision    recall  f1-score   support

           0       0.92      0.89      0.90       107
           1       0.87      0.91      0.89        89

    accuracy                           0.90       196
   macro avg       0.90      0.90      0.90       196
weighted avg       0.90      0.90      0.90       196



Boosting

In [44]:
# Defining the base learner
from sklearn.ensemble import RandomForestClassifier
bl1 = RandomForestClassifier(random_state=123)
# Defining the boosting meta learner
from sklearn.ensemble import AdaBoostClassifier
boosting = AdaBoostClassifier(base_estimator=bl1, 
                              n_estimators=300)
# Fitting the model on the training set
model = boosting.fit(X_train, y_train)
# Getting the predictions from the boosting model
pred = model.predict(X_test)
# Printing the confusion matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, pred))
# Printing the classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, pred))



[[97 10]
 [ 9 80]]
              precision    recall  f1-score   support

           0       0.92      0.91      0.91       107
           1       0.89      0.90      0.89        89

    accuracy                           0.90       196
   macro avg       0.90      0.90      0.90       196
weighted avg       0.90      0.90      0.90       196



Stacking

In [45]:
# Importing the meta learner and base learners
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
bl1 = KNeighborsClassifier(n_neighbors=5)
bl2 = LogisticRegression(random_state=123) 
ml = RandomForestClassifier(random_state=123)
# Creating the stacking classifier
from mlxtend.classifier import StackingClassifier
stackclf = StackingClassifier(classifiers=[bl1, bl2],
                              meta_classifier=ml)
# Fitting the model on the training set
model = stackclf.fit(X_train, y_train)
# Generating predictions on test set
pred = model.predict(X_test)
# Printing the confusion matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, pred))
# Printing the classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, pred))

[[99  8]
 [18 71]]
              precision    recall  f1-score   support

           0       0.85      0.93      0.88       107
           1       0.90      0.80      0.85        89

    accuracy                           0.87       196
   macro avg       0.87      0.86      0.86       196
weighted avg       0.87      0.87      0.87       196

