In [84]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
from sklearn import tree
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    if train:
        pred = clf.predict(X_train)
        clf_report = pd.DataFrame(classification_report(y_train, pred, output_dict=True))
        print("Train Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(y_train, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_train, pred)}\n")
        
    elif train==False:
        pred = clf.predict(X_test)
        clf_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
        print("Test Result:\n================================================")        
        print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

## STACKING  OF AN HETROGENOUS MODEL

In [2]:
df = pd.read_csv("winequality-red.csv")
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
x = df.drop(columns=['quality'])
y = df.quality

In [4]:
train , val_train , test , val_test  = train_test_split(x,y, test_size = .50 , random_state = 42)
print("train shape" , train.shape)
print("val_train shape" , val_train.shape)
print("test shape" , test.shape)
print(val_test.shape)

train shape (799, 11)
val_train shape (800, 11)
test shape (799,)
(800,)


In [5]:
x_train , x_test , y_train, y_test = train_test_split(train ,test, random_state = 42 , test_size = .20)
print("x_train shape" , x_train.shape)
print("x_test shape" , x_test.shape)
print("y_train shape" , y_train.shape)
print("y_test", y_test.shape)

x_train shape (639, 11)
x_test shape (160, 11)
y_train shape (639,)
y_test (160,)


## KNN 

In [6]:
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)

In [7]:
print("TEST SCORE : - " , knn.score(x_test , y_test)*100)    ## TEST SCORE
print("TRAIN SCORE :- " , knn.score(x_train, y_train)*100)   ### TRAIN SCORE

TEST SCORE : -  47.5
TRAIN SCORE :-  64.7887323943662


## SVC

In [8]:
svc = SVC()
svc.fit(x_train,y_train)

In [9]:
print("TEST SCORE : -" , svc.score(x_test, y_test)*100)   ### TEST SCORE
print("TRAIN SCORE  : -", svc.score(x_train , y_train)*100) ### TRAIN SCORE

TEST SCORE : - 52.5
TRAIN SCORE  : - 51.4866979655712


## PREDICTED DATA  : -

In [10]:
predcition_knn = knn.predict(val_train)  ## PREDICTION OF VALIDATION DATA BY USING KNN MODEL
prediction_svc = svc.predict(val_train)  ## PREDICTION OF  VALIDATION DATA BY USING SVC MODEL

In [11]:
# predcition_knn

In [12]:
# prediction_svc

## STACKING OF THE PREDICTED DATA (prediction_knn , prediction_svc)

In [13]:
input3 = np.column_stack((predcition_knn , prediction_svc))  # (PREDICTION KNN + PREDICTION SVC)

In [14]:
output = val_test   ###  NOW WE USE OUTPUT DATA FOR THE PREDICTION BY USING THE STACKING MODEL (PREDICTION_KNN + PREDICTION_SVC)

In [15]:
output   # SEE THE OUTPUT DATA 

803     6
124     5
350     6
682     5
1326    6
       ..
519     5
931     5
867     6
182     5
1088    7
Name: quality, Length: 800, dtype: int64

In [16]:
pd.DataFrame(input3)   ## VISUALIZE THE STACKING MODEL 

Unnamed: 0,0,1
0,5,5
1,5,5
2,6,6
3,5,6
4,6,6
...,...,...
795,6,5
796,6,6
797,6,6
798,5,6


In [17]:
rf = RandomForestClassifier()

In [18]:
rf.fit(input3,output)   ##  FITTING THE INPUT (predcition_knn , prediction_svc) AND THE OUTPUT (output = val_test)

In [19]:
# rf.score(x_test,y_test)    ## GIVES THE FOLLOWING ERROR : - 
                        ## X has 11 features, but RandomForestClassifier is expecting 2 features as input.

In [20]:
knn_output = knn.predict(x_test)
svc_output = svc.predict(x_test)


In [21]:
knn_output

array([6, 6, 5, 6, 6, 5, 5, 5, 5, 5, 6, 7, 6, 6, 5, 5, 5, 5, 5, 5, 6, 6,
       6, 5, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 6, 6, 5, 4, 6, 6, 5, 6, 6, 5,
       6, 5, 5, 6, 5, 6, 5, 6, 5, 5, 5, 5, 6, 5, 6, 6, 5, 5, 5, 5, 5, 6,
       5, 5, 6, 5, 6, 5, 6, 5, 5, 5, 5, 6, 6, 5, 5, 5, 6, 5, 6, 6, 6, 5,
       6, 5, 6, 6, 5, 5, 7, 6, 7, 7, 6, 6, 6, 5, 6, 5, 6, 6, 6, 5, 6, 6,
       6, 7, 5, 6, 6, 5, 6, 6, 6, 5, 5, 5, 5, 5, 5, 6, 5, 6, 5, 6, 5, 5,
       6, 5, 6, 5, 6, 5, 6, 5, 5, 5, 6, 6, 6, 6, 5, 6, 5, 5, 5, 6, 5, 5,
       5, 5, 6, 5, 5, 6], dtype=int64)

In [22]:
svc_output

array([6, 6, 6, 6, 5, 6, 5, 6, 6, 5, 6, 6, 6, 5, 5, 5, 5, 6, 5, 6, 5, 6,
       6, 5, 6, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 5, 6, 6, 6, 6, 6, 5, 6, 6, 5, 5, 5, 6, 5, 6, 6, 6, 5, 6, 5, 6,
       6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 6, 6, 6, 5, 5, 5,
       6, 6, 5, 5, 6, 5, 6, 6, 6, 5, 5, 5, 5, 5, 5, 6, 5, 6, 5, 6, 6, 6,
       6, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 6, 6,
       6, 5, 6, 6, 6, 6], dtype=int64)

In [23]:
output_stack1 = np.column_stack ((knn_output,svc_output))  

In [24]:
rf.predict(output_stack1)

array([6, 6, 5, 6, 5, 5, 5, 5, 5, 5, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 6,
       6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 5, 7, 6, 6, 5, 6, 6, 5,
       6, 5, 5, 6, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, 5, 5, 5, 6,
       5, 5, 6, 5, 6, 5, 6, 5, 5, 5, 5, 6, 6, 5, 5, 5, 6, 5, 6, 6, 6, 5,
       6, 5, 6, 6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 6, 6, 6, 5, 5, 5,
       6, 6, 5, 5, 6, 5, 6, 6, 6, 5, 5, 5, 5, 5, 5, 6, 5, 6, 5, 6, 5, 5,
       6, 5, 6, 5, 6, 5, 6, 5, 5, 5, 6, 6, 6, 6, 5, 6, 5, 5, 5, 5, 5, 5,
       5, 5, 6, 5, 5, 6], dtype=int64)

In [25]:
print("SCORE OF THE STACKING MODE : -" ,rf.score(output_stack1 , y_test)*100)

SCORE OF THE STACKING MODE : - 50.0


In [46]:
####################################################################################

In [47]:
df = pd.read_csv("winequality-red.csv")
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [48]:
x = df.drop(columns=['quality'])
y = df.quality

In [49]:
train , val_train , test , val_test  = train_test_split(x,y, test_size = .50 , random_state = 42)
print("train shape" , train.shape)
print("val_train shape" , val_train.shape)
print("test shape" , test.shape)
print(val_test.shape)

train shape (799, 11)
val_train shape (800, 11)
test shape (799,)
(800,)


In [50]:
x_train , x_test , y_train, y_test = train_test_split(train ,test, random_state = 42 , test_size = .20)
print("x_train shape" , x_train.shape)
print("x_test shape" , x_test.shape)
print("y_train shape" , y_train.shape)
print("y_test", y_test.shape)

x_train shape (639, 11)
x_test shape (160, 11)
y_train shape (639,)
y_test (160,)


In [64]:
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
svc = SVC()
svc.fit(x_train, y_train)
y_knn =knn.predict(val_train)
y_svc = knn.predict(val_train)

In [69]:
input3 = np.column_stack((y_knn , y_svc)) 

In [70]:
output = val_test
rf = RandomForestClassifier()
rf.fit(input3, output)

In [71]:
knn_output = knn.predict(x_test)
svc_output = knn.predict(x_test)

In [76]:
output_stack = np.column_stack((knn_output, svc_output))


In [77]:
rf.predict(output_stack)

array([6, 6, 5, 6, 6, 5, 5, 5, 5, 5, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 6, 6,
       6, 5, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 6, 6, 5, 7, 6, 6, 5, 6, 6, 5,
       6, 5, 5, 6, 5, 6, 5, 6, 5, 5, 5, 5, 6, 5, 6, 6, 5, 5, 5, 5, 5, 6,
       5, 5, 6, 5, 6, 5, 6, 5, 5, 5, 5, 6, 6, 5, 5, 5, 6, 5, 6, 6, 6, 5,
       6, 5, 6, 6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 6, 6, 6, 5, 6, 6,
       6, 6, 5, 6, 6, 5, 6, 6, 6, 5, 5, 5, 5, 5, 5, 6, 5, 6, 5, 6, 5, 5,
       6, 5, 6, 5, 6, 5, 6, 5, 5, 5, 6, 6, 6, 6, 5, 6, 5, 5, 5, 6, 5, 5,
       5, 5, 6, 5, 5, 6], dtype=int64)

In [79]:
y_test

187     5
406     6
249     6
1014    6
71      5
       ..
219     5
1459    7
387     6
913     7
1056    7
Name: quality, Length: 160, dtype: int64

In [86]:
print_score(rf, input3, output, output_stack, y_test, train=True)
print_score(rf, input3, output, output_stack, y_test, train=False)

Train Result:
Accuracy Score: 48.50%
_______________________________________________
CLASSIFICATION REPORT:
             3     4           5           6           7     8  accuracy  \
precision  0.0   0.0    0.546366    0.423559    0.500000   0.0     0.485   
recall     0.0   0.0    0.614085    0.559603    0.009901   0.0     0.485   
f1-score   0.0   0.0    0.578249    0.482168    0.019417   0.0     0.485   
support    3.0  28.0  355.000000  302.000000  101.000000  11.0     0.485   

            macro avg  weighted avg  
precision    0.244987      0.465468  
recall       0.197265      0.485000  
f1-score     0.179973      0.441068  
support    800.000000    800.000000  
_______________________________________________
Confusion Matrix: 
 [[  0   0   1   2   0   0]
 [  0   0  14  13   1   0]
 [  0   0 218 137   0   0]
 [  0   0 133 169   0   0]
 [  0   0  30  70   1   0]
 [  0   0   3   8   0   0]]

Test Result:
Accuracy Score: 48.75%
_______________________________________________
CLASS

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
