In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier,StackingClassifier,BaggingClassifier
from sklearn.model_selection import train_test_split,KFold,cross_val_score
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
!pip install xgboost
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')
#%matplotlib inline

In [None]:
df = pd.read_csv("Wine.csv")
df

In [None]:
df['Customer_Segment'].value_counts()

In [None]:
df.describe()

In [None]:
sns.heatmap(df.isna())# to visualise null values. There are no null values

In [6]:
#Define X and Y
x=df.iloc[:,:-1] # all rows, all columns except last column
y=df['Customer_Segment']

In [7]:
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2,random_state=42)

In [8]:
#Build individual model first. Check whether single model performing well or when we bag, it is performing well.
lg=LogisticRegression()
lg.fit(xtrain,ytrain)
ypred=lg.predict(xtest)
print(classification_report(ytest,ypred))#get confusion matrix
print('Train Accuracy: ',lg.score(xtrain,ytrain))
print('Test Accuracy: ',lg.score(xtest,ytest))

              precision    recall  f1-score   support

           1       1.00      0.93      0.96        14
           2       0.93      1.00      0.97        14
           3       1.00      1.00      1.00         8

    accuracy                           0.97        36
   macro avg       0.98      0.98      0.98        36
weighted avg       0.97      0.97      0.97        36

Train Accuracy:  0.9647887323943662
Test Accuracy:  0.9722222222222222


In [None]:
# model is overfitted

# **Bagging** **Classifier**

In [9]:
#bg=BaggingClassifier(DecisionTreeClassifier(),n_estimators=20)
bg=BaggingClassifier(DecisionTreeClassifier())
#base_estimator: for which algorithm you want to creat bagging classifier
#n_estimators: for base estimator algo. how many models you want to create. Its a hyper parameter
#base estimator is same for all algorithm.

In [10]:
bg.fit(xtrain,ytrain)
ypred=bg.predict(xtest)
print(classification_report(ytest,ypred))
print('Train Accuracy: ',bg.score(xtrain,ytrain))
print('Test Accuracy: ',bg.score(xtest,ytest))

              precision    recall  f1-score   support

           1       1.00      0.93      0.96        14
           2       0.88      1.00      0.93        14
           3       1.00      0.88      0.93         8

    accuracy                           0.94        36
   macro avg       0.96      0.93      0.94        36
weighted avg       0.95      0.94      0.94        36

Train Accuracy:  1.0
Test Accuracy:  0.9444444444444444


In [11]:
#as we are doing same for diff algorithms so create a function which will build model and print accuracy
# write prediction function
def predictor(model):
  model.fit(xtrain,ytrain)
  ypred=model.predict(xtest)
  print('Train Accuracy: ',model.score(xtrain,ytrain))
  print('Test Accuracy: ',model.score(xtest,ytest))
  print(classification_report(ytest,ypred))

In [12]:
# Call to function predictor.
predictor(DecisionTreeClassifier())

Train Accuracy:  1.0
Test Accuracy:  0.9444444444444444
              precision    recall  f1-score   support

           1       0.93      0.93      0.93        14
           2       0.93      1.00      0.97        14
           3       1.00      0.88      0.93         8

    accuracy                           0.94        36
   macro avg       0.95      0.93      0.94        36
weighted avg       0.95      0.94      0.94        36



In [13]:
predictor(BaggingClassifier(DecisionTreeClassifier()))

Train Accuracy:  1.0
Test Accuracy:  0.9166666666666666
              precision    recall  f1-score   support

           1       0.93      0.93      0.93        14
           2       0.87      0.93      0.90        14
           3       1.00      0.88      0.93         8

    accuracy                           0.92        36
   macro avg       0.93      0.91      0.92        36
weighted avg       0.92      0.92      0.92        36



In [14]:
predictor(LogisticRegression())

Train Accuracy:  0.9647887323943662
Test Accuracy:  0.9722222222222222
              precision    recall  f1-score   support

           1       1.00      0.93      0.96        14
           2       0.93      1.00      0.97        14
           3       1.00      1.00      1.00         8

    accuracy                           0.97        36
   macro avg       0.98      0.98      0.98        36
weighted avg       0.97      0.97      0.97        36



In [15]:
predictor(KNeighborsClassifier()) # No feature scaling done so less accuracy

Train Accuracy:  0.7535211267605634
Test Accuracy:  0.7222222222222222
              precision    recall  f1-score   support

           1       0.86      0.86      0.86        14
           2       0.79      0.79      0.79        14
           3       0.38      0.38      0.38         8

    accuracy                           0.72        36
   macro avg       0.67      0.67      0.67        36
weighted avg       0.72      0.72      0.72        36



In [16]:
predictor(BaggingClassifier(KNeighborsClassifier())) # may be after bagging (for default 10 KNN algorithms) accuracy will increase

Train Accuracy:  0.7816901408450704
Test Accuracy:  0.7222222222222222
              precision    recall  f1-score   support

           1       0.86      0.86      0.86        14
           2       0.90      0.64      0.75        14
           3       0.42      0.62      0.50         8

    accuracy                           0.72        36
   macro avg       0.72      0.71      0.70        36
weighted avg       0.78      0.72      0.74        36



# **Boosting** **Classifier**

In [17]:
predictor(AdaBoostClassifier())
# huge difference between train and test accuracy - uses stump

Train Accuracy:  0.9225352112676056
Test Accuracy:  0.9166666666666666
              precision    recall  f1-score   support

           1       1.00      0.86      0.92        14
           2       0.82      1.00      0.90        14
           3       1.00      0.88      0.93         8

    accuracy                           0.92        36
   macro avg       0.94      0.91      0.92        36
weighted avg       0.93      0.92      0.92        36



In [18]:
predictor(GradientBoostingClassifier())# much better than adaboost - as uses fully grown tree, it works on residuals (tries to correct previous errors)
# this is only on one train and test data. Try for kfold

Train Accuracy:  1.0
Test Accuracy:  0.9444444444444444
              precision    recall  f1-score   support

           1       0.93      1.00      0.97        14
           2       0.93      0.93      0.93        14
           3       1.00      0.88      0.93         8

    accuracy                           0.94        36
   macro avg       0.95      0.93      0.94        36
weighted avg       0.95      0.94      0.94        36



K-Fold Cross Validation

In [19]:
kf=KFold(n_splits=10)
score=cross_val_score(GradientBoostingClassifier(),x,y,cv=kf)
score

array([0.94444444, 0.88888889, 0.94444444, 0.88888889, 0.83333333,
       1.        , 0.94444444, 0.83333333, 0.88235294, 1.        ])

In [20]:
# gradient boosting is performing well. Final accuracy will be avg of all
score.mean()

0.9160130718954248

In [21]:
predictor(XGBClassifier()) # some may get error.
# in target column, Customer_Segment we have class numbers as 1,2,3
#new version requires classification should start from 0. It expects class as 0,1,2


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2], got [1 2 3]

In [None]:
# to change 1,2,3 to 0,1,2 perform label encoding
le = LabelEncoder()
y=le.fit_transform(y)
y

In [24]:
df['Customer_Segment']=y
x=df.iloc[:,:-1] # all rows, all columns except last column
y=df['Customer_Segment']
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2,random_state=1)

In [25]:
predictor(XGBClassifier())

Train Accuracy:  1.0
Test Accuracy:  0.9722222222222222
              precision    recall  f1-score   support

           0       0.93      1.00      0.97        14
           1       1.00      0.92      0.96        13
           2       1.00      1.00      1.00         9

    accuracy                           0.97        36
   macro avg       0.98      0.97      0.98        36
weighted avg       0.97      0.97      0.97        36



# **Voting and Stacking Classifier**

In [26]:
# create a list of algorithms
models=[]
models.append(('lr',LogisticRegression()))
models.append(('dt',DecisionTreeClassifier()))
models.append(('dt1',DecisionTreeClassifier(criterion='entropy')))
models.append(('knn',KNeighborsClassifier()))
models.append(('rf',RandomForestClassifier()))

In [27]:
predictor(VotingClassifier(estimators=models))


Train Accuracy:  1.0
Test Accuracy:  0.9444444444444444
              precision    recall  f1-score   support

           0       0.93      0.93      0.93        14
           1       0.92      0.92      0.92        13
           2       1.00      1.00      1.00         9

    accuracy                           0.94        36
   macro avg       0.95      0.95      0.95        36
weighted avg       0.94      0.94      0.94        36



In [28]:
predictor(StackingClassifier(estimators=models,final_estimator=RandomForestClassifier()))
#suppose we have x,y and we are using M1,M2,M3. Outputs of these models are say y1,y2,y3
# if you are using final model as Random Forest model then it will use y and y1,y2,y3 as x variables


Train Accuracy:  1.0
Test Accuracy:  0.9722222222222222
              precision    recall  f1-score   support

           0       0.93      1.00      0.97        14
           1       1.00      0.92      0.96        13
           2       1.00      1.00      1.00         9

    accuracy                           0.97        36
   macro avg       0.98      0.97      0.98        36
weighted avg       0.97      0.97      0.97        36

