# PROBLEM 0

In [1]:
# Importing necessary libraries
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import AdaBoostClassifier

# Fetch the wine dataset as pandas objects (as_frame=True)
data = datasets.load_wine(as_frame=True)

# Feature table and class labels
X, y = data.data, data.target

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=22,
)

# PROBLEM 1
Fit and evaluate the `AdaBoostClassifier` from `sklearn.ensemble` on the wine dataset. Use the `evaluate` function defined below to print the results.

In [2]:
# Helpers to print evaluation results after training a particular model
def _print_split_report(title, y_true, y_pred):
    """Print the confusion matrix, accuracy and classification report for one data split."""
    print(f"{title} RESULTS: \n===============================")
    clf_report = pd.DataFrame(classification_report(y_true, y_pred, output_dict=True))
    print(f"CONFUSION MATRIX:\n{confusion_matrix(y_true, y_pred)}")
    print(f"ACCURACY SCORE:\n{accuracy_score(y_true, y_pred):.4f}")
    print(f"CLASSIFICATION REPORT:\n{clf_report}")


def evaluate(model, X_train, X_test, y_train, y_test):
    """Print training and testing metrics for an already fitted classifier.

    Parameters
    ----------
    model : object
        Fitted estimator; only its ``predict`` method is used.
    X_train, X_test :
        Feature sets passed to ``model.predict``.
    y_train, y_test :
        True labels matching ``X_train`` and ``X_test``.

    Returns
    -------
    None
        The results are printed, not returned.
    """
    # Predict on both splits up front, in the same order as before, so a failing
    # predict() aborts before anything is printed.
    y_test_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)

    _print_split_report("TRAINING", y_train, y_train_pred)
    _print_split_report("TESTING", y_test, y_test_pred)
    
# Seed the ensemble so the fitted model, and therefore the scores printed by
# evaluate(), are reproducible across runs (same seed as the train/test split).
abc = AdaBoostClassifier(n_estimators=30, random_state=22)
model = abc.fit(X_train, y_train)
evaluate(model, X_train, X_test, y_train, y_test)

TRAINIG RESULTS: 
CONFUSION MATRIX:
[[46  0  0]
 [ 0 54  0]
 [ 0  0 33]]
ACCURACY SCORE:
1.0000
CLASSIFICATION REPORT:
              0     1     2  accuracy  macro avg  weighted avg
precision   1.0   1.0   1.0       1.0        1.0           1.0
recall      1.0   1.0   1.0       1.0        1.0           1.0
f1-score    1.0   1.0   1.0       1.0        1.0           1.0
support    46.0  54.0  33.0       1.0      133.0         133.0
TESTING RESULTS: 
CONFUSION MATRIX:
[[12  1  0]
 [ 0 16  1]
 [ 0  2 13]]
ACCURACY SCORE:
0.9111
CLASSIFICATION REPORT:
                   0          1          2  accuracy  macro avg  weighted avg
precision   1.000000   0.842105   0.928571  0.911111   0.923559      0.916541
recall      0.923077   0.941176   0.866667  0.911111   0.910307      0.911111
f1-score    0.960000   0.888889   0.896552  0.911111   0.915147      0.911986
support    13.000000  17.000000  15.000000  0.911111  45.000000     45.000000


# PROBLEM 2
Retrieve the frequent itemsets using the `apriori` function from `mlxtend.frequent_patterns`. The code below builds `basket_sets`, a one-hot encoded transaction-by-item table, which is then passed as input to `apriori`.

In [3]:
# Read the Bread Basket CSV from the notebook's input directory
BREAD_BASKET_CSV = '../input/affinity-analysis/BreadBasket_DMS.csv'
df = pd.read_csv(BREAD_BASKET_CSV)

In [4]:
# Give every row a quantity of 1 so items can later be counted by summing this column
df = df.assign(Quantity=1)
df.head(7)

Unnamed: 0,Date,Time,Transaction,Item,Quantity
0,2016-10-30,09:58:11,1,Bread,1
1,2016-10-30,10:05:34,2,Scandinavian,1
2,2016-10-30,10:05:34,2,Scandinavian,1
3,2016-10-30,10:07:57,3,Hot chocolate,1
4,2016-10-30,10:07:57,3,Jam,1
5,2016-10-30,10:07:57,3,Cookies,1
6,2016-10-30,10:08:41,4,Muffin,1


In [5]:
# Sum the quantity of each item within each transaction ...
item_counts = df.groupby(['Transaction', 'Item'])['Quantity'].sum()
# ... then pivot items into columns; items absent from a transaction become 0
basket = item_counts.unstack().fillna(0)
# There are a lot of zeros in the data, but we also need to make sure any positive value is
# converted to 1 and anything that is not positive is set to 0. This step completes the
# one-hot encoding of the data.
def encode_units(x):
    """Collapse a purchase count into a 0/1 presence flag.

    Parameters
    ----------
    x : number
        Count for one (transaction, item) cell.

    Returns
    -------
    int
        1 if ``x`` is strictly positive, otherwise 0.
    """
    # A single comparison covers every input: fractional positives map to 1 and
    # NaN (for which the comparison is False) maps to 0, so None is never returned.
    return 1 if x > 0 else 0

# Vectorized one-hot encoding: positive counts -> 1, everything else -> 0. For the
# non-negative counts held in `basket` this gives the same table as applying
# encode_units to every cell, without relying on DataFrame.applymap (deprecated
# since pandas 2.1) or making one Python call per cell.
basket_sets = basket.gt(0).astype(int)
basket_sets

Item,Adjustment,Afternoon with the baker,Alfajores,Argentina Night,Art Tray,Bacon,Baguette,Bakewell,Bare Popcorn,Basket,...,The BART,The Nomad,Tiffin,Toast,Truffles,Tshirt,Valentine's card,Vegan Feast,Vegan mincepie,Victorian Sponge
Transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9680,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9681,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
9682,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9683,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Minimum support an itemset needs in order to be kept by apriori
MIN_SUPPORT = 0.03

# use_colnames=True reports itemsets by item name
frq_items = apriori(
    basket_sets,
    min_support=MIN_SUPPORT,
    use_colnames=True,
)
print(frq_items)

     support             itemsets
0   0.036093          (Alfajores)
1   0.324940              (Bread)
2   0.039765            (Brownie)
3   0.103137               (Cake)
4   0.475081             (Coffee)
5   0.054034            (Cookies)
6   0.038926         (Farm House)
7   0.057916      (Hot chocolate)
8   0.038296              (Juice)
9   0.061379          (Medialuna)
10  0.038191             (Muffin)
11  0.079005               (NONE)
12  0.085510             (Pastry)
13  0.071346           (Sandwich)
14  0.034309              (Scone)
15  0.034204               (Soup)
16  0.141643                (Tea)
17  0.033365              (Toast)
18  0.089393      (Coffee, Bread)
19  0.054349       (Cake, Coffee)
20  0.034939  (Medialuna, Coffee)
21  0.042073       (NONE, Coffee)
22  0.047214     (Pastry, Coffee)
23  0.037981   (Sandwich, Coffee)
24  0.049523        (Tea, Coffee)




# PROBLEM 3
Now use the `association_rules` function, passing the frequent itemsets obtained in Problem 2 as input. Use `.head()` to display the top five rules.

In [7]:
# Derive rules with lift >= 1, then rank them by confidence with lift as the
# tie-breaker, both in descending order
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
print(rules.head(5))

   antecedents consequents  antecedent support  consequent support   support  \
2  (Medialuna)    (Coffee)            0.061379            0.475081  0.034939   
6     (Pastry)    (Coffee)            0.085510            0.475081  0.047214   
4       (NONE)    (Coffee)            0.079005            0.475081  0.042073   
8   (Sandwich)    (Coffee)            0.071346            0.475081  0.037981   
0       (Cake)    (Coffee)            0.103137            0.475081  0.054349   

   confidence      lift  leverage  conviction  
2    0.569231  1.198175  0.005779    1.218561  
6    0.552147  1.162216  0.006590    1.172079  
4    0.532537  1.120938  0.004539    1.122908  
8    0.532353  1.120551  0.004086    1.122468  
0    0.526958  1.109196  0.005350    1.109667  
