In [1]:
import pandas as pd
from pycaret.classification import *

# Load the wine data from a CSV file into a DataFrame.
# NOTE(review): the path is relative, so the notebook has to be started from
# the directory that contains wine_data.csv.
df = pd.read_csv("wine_data.csv")

In [2]:
# Initial model: predict wine_type from the free-text description alone.

# Keep only the text feature and the target column.
data = df[['description','wine_type']]

# Columns PyCaret should treat as text and vectorise.
features = ['description']

# Train on a 1,000-row sample to keep the experiment fast.
# The draw is seeded so a fresh "Restart & Run All" picks the same rows; the
# session_id given to setup() below is applied only after the rows are drawn.
sample = data.sample(n=1000, random_state=123)

# Initialise the PyCaret experiment: TF-IDF on the text column, fixed seed.
exp1 = setup(sample, target='wine_type',text_features=features,text_features_method='tf-idf',session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,wine_type
2,Target type,Multiclass
3,Target mapping,"other: 0, red: 1, white: 2"
4,Original data shape,"(1000, 2)"
5,Transformed data shape,"(1000, 3553)"
6,Transformed train set shape,"(700, 3553)"
7,Transformed test set shape,"(300, 3553)"
8,Text features,1
9,Preprocess,True


In [3]:
# Inspect the transformed training features held by the current PyCaret
# experiment (one column per TF-IDF term, prefixed with the source column).
get_config('X_train_transformed')

Unnamed: 0,description_000,description_10,description_100,description_11,description_115,description_12,description_13,description_14,description_15,description_16,...,description_zin,description_zinfandel,description_zing,description_zingy,description_zins,description_zippy,description_zonin,description_zweigelt,description_église,description_émilion
120910,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30672,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59930,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8438,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2159,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39478,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
94104,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33475,0.202992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
92252,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Train and cross-validate (5 folds) every model in PyCaret's model library,
# display the comparison table, and keep the top-ranked model as `best`.
best = compare_models(fold=5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.8957,0.0,0.8957,0.8557,0.8746,0.7775,0.7834,1.178
et,Extra Trees Classifier,0.8943,0.9678,0.8943,0.8555,0.8734,0.7743,0.7812,1.632
rf,Random Forest Classifier,0.8886,0.9579,0.8886,0.8503,0.8676,0.7614,0.7691,1.566
gbc,Gradient Boosting Classifier,0.8771,0.957,0.8771,0.8629,0.8648,0.7417,0.7464,6.054
svm,SVM - Linear Kernel,0.8686,0.0,0.8686,0.8543,0.8515,0.7213,0.7316,1.02
lightgbm,Light Gradient Boosting Machine,0.8643,0.9392,0.8643,0.827,0.845,0.7154,0.7188,2.076
lr,Logistic Regression,0.86,0.9691,0.86,0.8264,0.8367,0.691,0.7091,5.054
knn,K Neighbors Classifier,0.8086,0.8873,0.8086,0.7845,0.7919,0.5892,0.5984,3.012
dt,Decision Tree Classifier,0.8029,0.814,0.8029,0.7992,0.7984,0.6019,0.6065,1.096
nb,Naive Bayes,0.7857,0.7487,0.7857,0.7528,0.7625,0.5268,0.5425,1.09


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

In [5]:
# Open PyCaret's interactive widget for exploring diagnostic plots of `best`.
# The widget needs a live kernel; a saved notebook only shows its text repr.
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [6]:
# Score `best` on the hold-out split (no data argument is passed) and keep the
# returned predictions; the metrics table (accuracy, recall, precision, F1, ...)
# is displayed as a side effect.
holdout_pred = predict_model(best)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ridge Classifier,0.88,0,0.88,0.8387,0.8589,0.7475,0.7513


In [7]:
# Launch a basic Gradio app (a package for building web UIs around ML models)
# that predicts the wine type from a description entered by the user.
# The app is served locally, as the "Running on local URL" output shows.
create_app(best)

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [8]:
# Optimization Attempt #1: add "province" as a second text feature.

# Keep only the needed columns and drop rows with a missing value in any of them.
data2 = df[['description','wine_type','province']].dropna()
features = ['description','province']

# Train on a 1,000-row sample to keep the experiment fast.
# The draw is seeded so a fresh "Restart & Run All" picks the same rows; the
# session_id given to setup() below is applied only after the rows are drawn.
sample2 = data2.sample(n=1000, random_state=123)

# Initialise the PyCaret experiment: TF-IDF on both text columns, fixed seed.
exp1 = setup(sample2, target='wine_type',text_features=features,text_features_method='tf-idf',session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,wine_type
2,Target type,Multiclass
3,Target mapping,"other: 0, red: 1, white: 2"
4,Original data shape,"(1000, 3)"
5,Transformed data shape,"(1000, 3538)"
6,Transformed train set shape,"(700, 3538)"
7,Transformed test set shape,"(300, 3538)"
8,Text features,2
9,Preprocess,True


In [9]:
# Inspect the transformed training features of the current experiment; the
# matrix now contains province_* columns alongside the description_* columns.
get_config('X_train_transformed')

Unnamed: 0,description_000,description_04,description_10,description_100,description_11,description_12,description_13,description_14,description_15,description_150,...,province_veneto,province_verde,province_victoria,province_vinho,province_virginia,province_wachau,province_waipara,province_washington,province_wellington,province_york
1208,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.728305,0.0,0.685254,0.0,0.0,0.0,0.0,0.0,0.0
20944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
55064,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
96381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
17017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79856,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
8462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
33432,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
109539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Train and cross-validate (5 folds) every model in PyCaret's model library,
# display the comparison table, and keep the top-ranked model as `best`.
# This overwrites the `best` chosen for the initial model.
best = compare_models(fold=5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
svm,SVM - Linear Kernel,0.91,0.0,0.91,0.9074,0.9033,0.8075,0.8113,1.186
ridge,Ridge Classifier,0.9071,0.0,0.9071,0.8975,0.8964,0.7993,0.8053,1.22
et,Extra Trees Classifier,0.9029,0.9779,0.9029,0.8716,0.8824,0.7869,0.7945,1.63
gbc,Gradient Boosting Classifier,0.8943,0.9773,0.8943,0.8905,0.8857,0.7713,0.7781,6.346
rf,Random Forest Classifier,0.8871,0.9755,0.8871,0.8467,0.8646,0.7504,0.7601,1.546
lr,Logistic Regression,0.8743,0.9698,0.8743,0.8722,0.8583,0.718,0.7352,1.756
lightgbm,Light Gradient Boosting Machine,0.8729,0.9557,0.8729,0.841,0.8552,0.7294,0.7325,2.168
dt,Decision Tree Classifier,0.8386,0.8396,0.8386,0.8376,0.8355,0.6638,0.6669,1.344
nb,Naive Bayes,0.7857,0.7437,0.7857,0.7543,0.7595,0.512,0.534,1.236
knn,K Neighbors Classifier,0.7786,0.8659,0.7786,0.7752,0.7636,0.5063,0.5232,1.452


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

In [11]:
# Open PyCaret's interactive widget for exploring diagnostic plots of `best`.
# The widget needs a live kernel; a saved notebook only shows its text repr.
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [12]:
# Score `best` on the hold-out split (no data argument is passed) and keep the
# returned predictions; the metrics table (accuracy, recall, precision, F1, ...)
# is displayed as a side effect.
holdout_pred = predict_model(best)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,SVM - Linear Kernel,0.9167,0,0.9167,0.9114,0.9127,0.825,0.8256


In [19]:
# Optimization Attempt #2: add "designation" and "province" as text features.

# Keep only the needed columns and drop rows with a missing value in any of them.
data3 = df[['description','wine_type','province','designation']].dropna()
features = ['description','province','designation']

# Train on a sample to keep the experiment fast.
# The draw is seeded so a fresh "Restart & Run All" picks the same rows; the
# session_id given to setup() below is applied only after the rows are drawn.
# NOTE(review): the output saved with this cell reports an original data shape
# of (10000, 4), i.e. it was produced with a 10,000-row sample rather than the
# 1,000 rows requested here. Re-run the cell (or restore the intended size) so
# the recorded results match the code.
sample3 = data3.sample(n=1000, random_state=123)

# Initialise the PyCaret experiment: TF-IDF on all three text columns, fixed seed.
exp1 = setup(sample3, target='wine_type',text_features=features,text_features_method='tf-idf',session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,wine_type
2,Target type,Multiclass
3,Target mapping,"other: 0, red: 1, white: 2"
4,Original data shape,"(10000, 4)"
5,Transformed data shape,"(10000, 15634)"
6,Transformed train set shape,"(7000, 15634)"
7,Transformed test set shape,"(3000, 15634)"
8,Text features,3
9,Preprocess,True


In [20]:
# Inspect the transformed training features of the current experiment; the
# matrix now also contains designation_* columns.
get_config('X_train_transformed')

Unnamed: 0,description_000,description_03,description_04,description_05,description_06,description_07,description_08,description_08s,description_09,description_10,...,designation_âme,designation_édition,designation_élevéen,designation_élu,designation_éléments,designation_équinoxe,designation_étoile,designation_öküzgözü,designation_öreg,designation_ürziger
49115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55120,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
92119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55390,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22397,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32647,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8041,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
110869,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1081,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Train and cross-validate (5 folds) every model in PyCaret's model library,
# display the comparison table, and keep the top-ranked model as `best`.
# This overwrites the `best` chosen in the previous attempt.
best = compare_models(fold=5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.7466,0.7903,0.7466,0.7442,0.7428,0.6909,0.692,14.662
dummy,Dummy Classifier,0.6207,0.5,0.6207,0.3853,0.4755,0.0,0.0,1.404
lightgbm,Light Gradient Boosting Machine,0.5679,0.5946,0.5679,0.5665,0.5665,0.5352,0.5355,7.494
svm,SVM - Linear Kernel,0.3764,0.0,0.3764,0.3759,0.3759,0.3529,0.353,2.748
ridge,Ridge Classifier,0.3733,0.0,0.3733,0.3723,0.3722,0.3459,0.3462,6.122
gbc,Gradient Boosting Classifier,0.3714,0.3944,0.3714,0.3701,0.3701,0.342,0.3423,113.962
rf,Random Forest Classifier,0.3669,0.3947,0.3669,0.359,0.3582,0.3314,0.333,4.414
knn,K Neighbors Classifier,0.3157,0.3407,0.3157,0.3144,0.3127,0.2229,0.2261,6.56
nb,Naive Bayes,0.2957,0.2948,0.2957,0.2933,0.2935,0.1884,0.1886,3.234
et,Extra Trees Classifier,0.186,0.1976,0.186,0.1867,0.1834,0.1712,0.1718,5.892


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

In [24]:
# Open PyCaret's interactive widget for exploring diagnostic plots of `best`.
# The widget needs a live kernel; a saved notebook only shows its text repr.
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [25]:
# Score `best` on the hold-out split (no data argument is passed) and keep the
# returned predictions; the metrics table (accuracy, recall, precision, F1, ...)
# is displayed as a side effect.
holdout_pred = predict_model(best)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.9393,0.9894,0.9393,0.9366,0.936,0.8767,0.8776


In [18]:
# Optimization Attempt #3: use every available row (no sampling) with
# "designation" and "province" as additional text features.
# NOTE(review): the original heading says "all 90k rows", but dropna() removes
# rows with a missing value in any selected column, so fewer rows may reach
# setup() — confirm with len(data4).
# NOTE(review): the recorded run of this cell failed with
#   MemoryError: Unable to allocate 9.47 GiB for an array with shape (19681, 64553)
# so the experiment was never initialised and the cells after it have no output.
# TODO: shrink the row count or the TF-IDF vocabulary before re-running.
data4 = df[['description','wine_type','province','designation']].dropna()
features = ['description','province','designation']

# Initializing the training environment
exp1 = setup(data4, target='wine_type',text_features=features,text_features_method='tf-idf',session_id=123)

MemoryError: Unable to allocate 9.47 GiB for an array with shape (19681, 64553) and data type float64

In [None]:
# Inspect the transformed training features of the current experiment.
# NOTE(review): saved without an execution count; it only reflects the
# full-data attempt if the setup() cell before it completes successfully.
get_config('X_train_transformed')

In [None]:
# Train and cross-validate (5 folds) every model in PyCaret's model library
# and keep the top-ranked model as `best`.
# NOTE(review): saved without an execution count; needs a successful setup()
# for the full-data attempt first.
best = compare_models(fold=5)

In [None]:
# Open PyCaret's interactive widget for exploring diagnostic plots of `best`.
# NOTE(review): saved without an execution count; if run as-is, `best` would be
# whatever model an earlier cell left in the kernel.
evaluate_model(best)

In [None]:
# Score `best` on the hold-out split (no data argument is passed) and keep the
# returned predictions; the metrics table (accuracy, recall, precision, F1, ...)
# is displayed as a side effect.
# NOTE(review): saved without an execution count; no results exist for the
# full-data attempt.
holdout_pred = predict_model(best)