In [2]:
import pandas as pd
from pycaret.classification import *

# Load the raw wine dataset. The path is relative to the notebook's working
# directory. Later cells read the `description`, `wine_type`, `province` and
# `designation` columns from this frame and never modify `df` itself.
df = pd.read_csv("wine_data.csv")

In [None]:
# Initial model (baseline): predict `wine_type` from the free-text
# `description` column only.

# Keep only the columns this experiment needs and drop rows with a missing
# description or label, mirroring the `.dropna()` used by the later attempts
# so the baseline is built the same way they are.
data = df[['description', 'wine_type']].dropna()

# Columns PyCaret should vectorise as text.
features = ['description']

# Work on a small sample to keep model comparison fast.
# - random_state: the draw happens before setup(), so setup's session_id
#   cannot seed it; without an explicit seed every Restart & Run All would
#   train on different rows and the reported scores would not be reproducible.
# - min(...): pandas raises ValueError when asked for more rows than exist.
sample = data.sample(n=min(1000, len(data)), random_state=123)

# Initialise the PyCaret training environment: `wine_type` is the label, the
# text features are encoded with TF-IDF, and session_id fixes PyCaret's seed.
exp1 = setup(
    sample,
    target='wine_type',
    text_features=features,
    text_features_method='tf-idf',
    session_id=123,
)

In [None]:
# Inspect the training features after PyCaret's preprocessing has been applied
# (per the PyCaret docs, `X_train_transformed` is the transformed training set).
# This reads from the most recently run setup() experiment, and the bare
# expression is what the notebook displays as this cell's output.
get_config('X_train_transformed')

In [None]:
# Train and cross-validate every estimator in PyCaret's model library using
# 5 folds (fold=5) and keep the top-ranked one in `best`.
# Note: `best` is reassigned by every later compare_models() cell, so it always
# refers to the winner of whichever experiment was run most recently.
best = compare_models(fold=5)

In [None]:
# Display PyCaret's interactive widget for analysing the trained model held in
# `best` (the result of the preceding compare_models() call).
evaluate_model(best)

In [None]:
# Score `best` on the hold-out split. With no `data` argument, predict_model()
# uses the hold-out set created by setup() (per the PyCaret docs), displays the
# metrics (accuracy, recall, precision, F1, ...) and returns the predictions,
# which are kept in `holdout_pred`.
holdout_pred = predict_model(best)

In [None]:
# Create a basic Gradio app (a package for building UIs around ML models) that
# predicts with `best` from user-entered descriptors.
# NOTE(review): presumably requires the optional `gradio` dependency and may
# keep the cell running while the app is served — confirm before Run All.
create_app(best)

In [None]:
# Optimization Attempt #1: add "province" as a second feature.

# Keep only the columns this experiment needs; rows missing any of them are
# dropped so every sampled row has a description, a label and a province.
data2 = df[['description', 'wine_type', 'province']].dropna()

# Columns PyCaret should vectorise as text.
# NOTE(review): `province` is passed as a text feature, so it is TF-IDF
# tokenised rather than encoded as a categorical — confirm this is intended.
features = ['description', 'province']

# Work on a small sample to keep model comparison fast.
# - random_state: an explicit seed makes the draw independent of whatever
#   global RNG state earlier cells left behind, so re-running the notebook
#   (or just this cell) trains on the same rows and gives comparable scores.
# - min(...): pandas raises ValueError when asked for more rows than exist.
sample2 = data2.sample(n=min(1000, len(data2)), random_state=123)

# Initialise the PyCaret training environment: `wine_type` is the label, the
# text features are encoded with TF-IDF, and session_id fixes PyCaret's seed.
# This replaces the previously active experiment.
exp1 = setup(
    sample2,
    target='wine_type',
    text_features=features,
    text_features_method='tf-idf',
    session_id=123,
)

In [None]:
# Inspect the training features after PyCaret's preprocessing has been applied
# (per the PyCaret docs, `X_train_transformed` is the transformed training set).
# This reads from the most recently run setup() experiment, and the bare
# expression is what the notebook displays as this cell's output.
get_config('X_train_transformed')

In [None]:
# Train and cross-validate every estimator in PyCaret's model library using
# 5 folds (fold=5) and keep the top-ranked one in `best`.
# Note: this overwrites the `best` model selected by any earlier experiment.
best = compare_models(fold=5)

In [None]:
# Display PyCaret's interactive widget for analysing the trained model held in
# `best` (the result of the preceding compare_models() call).
evaluate_model(best)

In [None]:
# Score `best` on the hold-out split. With no `data` argument, predict_model()
# uses the hold-out set created by setup() (per the PyCaret docs), displays the
# metrics (accuracy, recall, precision, F1, ...) and returns the predictions.
# `holdout_pred` is overwritten here, replacing any earlier experiment's result.
holdout_pred = predict_model(best)

In [None]:
# Optimization Attempt #2: add "designation" and "province" as features.

# Keep only the columns this experiment needs; rows missing any of them are
# dropped so every sampled row has all three features and a label.
data3 = df[['description', 'wine_type', 'province', 'designation']].dropna()

# Columns PyCaret should vectorise as text.
# NOTE(review): `province` and `designation` are passed as text features, so
# they are TF-IDF tokenised rather than encoded as categoricals — confirm
# this is intended.
features = ['description', 'province', 'designation']

# Work on a small sample to keep model comparison fast.
# - random_state: an explicit seed makes the draw independent of whatever
#   global RNG state earlier cells left behind, so re-running the notebook
#   (or just this cell) trains on the same rows and gives comparable scores.
# - min(...): pandas raises ValueError when asked for more rows than exist.
sample3 = data3.sample(n=min(1000, len(data3)), random_state=123)

# Initialise the PyCaret training environment: `wine_type` is the label, the
# text features are encoded with TF-IDF, and session_id fixes PyCaret's seed.
# This replaces the previously active experiment.
exp1 = setup(
    sample3,
    target='wine_type',
    text_features=features,
    text_features_method='tf-idf',
    session_id=123,
)

In [None]:
# Inspect the training features after PyCaret's preprocessing has been applied
# (per the PyCaret docs, `X_train_transformed` is the transformed training set).
# This reads from the most recently run setup() experiment, and the bare
# expression is what the notebook displays as this cell's output.
get_config('X_train_transformed')

In [None]:
# Train and cross-validate every estimator in PyCaret's model library using
# 5 folds (fold=5) and keep the top-ranked one in `best`.
# Note: this overwrites the `best` model selected by any earlier experiment.
best = compare_models(fold=5)

In [None]:
# Display PyCaret's interactive widget for analysing the trained model held in
# `best` (the result of the preceding compare_models() call).
evaluate_model(best)

In [None]:
# Score `best` on the hold-out split. With no `data` argument, predict_model()
# uses the hold-out set created by setup() (per the PyCaret docs), displays the
# metrics (accuracy, recall, precision, F1, ...) and returns the predictions.
# `holdout_pred` is overwritten here, replacing any earlier experiment's result.
holdout_pred = predict_model(best)

In [None]:
# Optimization Attempt #3: same features as before ("description", "province",
# "designation") but trained on the full dataset instead of a 1,000-row sample
# (about 90k rows according to the original note).

# Select the needed columns and discard any row with a missing value in them.
data4 = df.loc[:, ['description', 'wine_type', 'province', 'designation']].dropna()

# Every selected column except the label is treated as a text feature.
features = [col for col in data4.columns if col != 'wine_type']

# Initialise the PyCaret training environment on the complete frame; the text
# features are TF-IDF encoded and session_id fixes PyCaret's random seed.
exp1 = setup(
    data4,
    target='wine_type',
    text_features=features,
    text_features_method='tf-idf',
    session_id=123,
)

In [None]:
# Inspect the training features after PyCaret's preprocessing has been applied
# (per the PyCaret docs, `X_train_transformed` is the transformed training set).
# This reads from the most recently run setup() experiment, and the bare
# expression is what the notebook displays as this cell's output.
get_config('X_train_transformed')

In [None]:
# Train and cross-validate every estimator in PyCaret's model library using
# 5 folds (fold=5) and keep the top-ranked one in `best`.
# Note: this overwrites the `best` model selected by any earlier experiment.
# NOTE(review): the active experiment here uses the unsampled data, so this
# cell is presumably much slower than the 1,000-row runs — confirm the cost.
best = compare_models(fold=5)

In [None]:
# Display PyCaret's interactive widget for analysing the trained model held in
# `best` (the result of the preceding compare_models() call).
evaluate_model(best)

In [None]:
# Score `best` on the hold-out split. With no `data` argument, predict_model()
# uses the hold-out set created by setup() (per the PyCaret docs), displays the
# metrics (accuracy, recall, precision, F1, ...) and returns the predictions.
# `holdout_pred` is overwritten here, replacing any earlier experiment's result.
holdout_pred = predict_model(best)