In [1]:
import os

import numpy as np
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from TextPreProcessing.preProcessing import TextPreProcessing

# Import Data

In [2]:
# Read the reviews CSV and keep only its first 10 rows.
# NOTE(review): the 10-row cap looks like a quick-iteration limit — confirm
# before running the notebook on the full dataset.
df = pd.read_csv('data/Womens Clothing E-Commerce Reviews.csv').head(10)

# Split

In [3]:
# First carve off 30% of the rows, then divide that hold-out in half.
# Both splits use the same fixed random_state so the partition is repeatable.
train, test = train_test_split(df, test_size=0.3, random_state=7)
test, val = train_test_split(test, test_size=0.5, random_state=7)

# reset_index() without drop=True keeps the previous index as an extra column.
train, val, test = (part.reset_index() for part in (train, val, test))

# Show the (rows, columns) of each partition.
for part in (train, val, test):
    print(part.shape)

(7, 12)
(2, 12)
(1, 12)


In [4]:
# The full frame is no longer needed once train/val/test exist; drop the name.
del df

# Text Preprocessing 

In [5]:
# Build one TextPreProcessing object per split (train, val, test — in that order)
# from the 'Review Text' column, then run each one and wrap its result in a Series.
_cleaners = [
    TextPreProcessing(sentences=split['Review Text'])
    for split in (train, val, test)
]

pre_processing_train, pre_processing_val, pre_processing_test = (
    pd.Series(cleaner.preprocessing()) for cleaner in _cleaners
)

# The preprocessing objects themselves are not used afterwards.
del _cleaners

In [6]:
def _features_and_labels(features, source):
    """Pair preprocessed text with the 'Recommended IND' column of `source`.

    Returns a two-column DataFrame: 'features' (the given text Series) and
    'label' (source['Recommended IND']).
    """
    return pd.DataFrame({'features': features,
                         'label': source['Recommended IND']})


df_train = _features_and_labels(pre_processing_train, train)
df_val = _features_and_labels(pre_processing_val, val)
df_test = _features_and_labels(pre_processing_test, test)

In [7]:
# Placeholder string that marks a review with no usable text.
# NOTE(review): presumably emitted by TextPreProcessing — confirm.
EMPTY_REVIEW_MARKER = '- E M P T Y -'


def _drop_empty_reviews(frame):
    """Return `frame` without rows whose 'features' is missing or the empty marker.

    The original filter compared with `!= np.nan`, which is True for every
    value (NaN never compares equal to anything, itself included), so missing
    reviews were never removed. `.notna()` performs the intended check.
    The result gets a fresh 0..n-1 index.
    """
    has_text = frame['features'].notna() & (frame['features'] != EMPTY_REVIEW_MARKER)
    return frame[has_text].reset_index(drop=True)


df_train = _drop_empty_reviews(df_train)
df_val = _drop_empty_reviews(df_val)
df_test = _drop_empty_reviews(df_test)

In [8]:
# Drop the intermediate Series and the raw splits; only df_train / df_val /
# df_test are used from here on.
del pre_processing_train, pre_processing_val, pre_processing_test
del train, test, val

# TF-IDF

In [9]:

# Learn the vocabulary and IDF weights on the training text only, then reuse
# the fitted vectorizer to encode the validation and test text.
tfidf_vectorizer = TfidfVectorizer()
bow_train = tfidf_vectorizer.fit_transform(df_train['features'])

bow_val, bow_test = (
    tfidf_vectorizer.transform(frame['features'])
    for frame in (df_val, df_test)
)

# Training with MLflow

In [10]:
import mlflow

from sklearn.naive_bayes import MultinomialNB
import xgboost as xgb
from sklearn import svm

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, make_scorer

In [11]:
import socket
import subprocess
import time

# Address the tracking server listens on; must match the tracking URI used
# by the following cells (http://127.0.0.1:5000).
MLFLOW_HOST = "127.0.0.1"
MLFLOW_PORT = 5000

# Launch the server as a background process. os.system() would wait for the
# command to finish, i.e. block this cell for as long as the server runs.
# Host and port are passed as separate options (--host takes only the address).
mlflow_server = subprocess.Popen([
    "mlflow", "server",
    "--backend-store-uri", "sqlite:///mlflow.db",
    "--default-artifact-root", "./artifacts",
    "--host", MLFLOW_HOST,
    "--port", str(MLFLOW_PORT),
])

# Wait (up to ~30 s) until the port accepts connections so the next cells
# do not race the server start-up.
_deadline = time.monotonic() + 30
while True:
    try:
        with socket.create_connection((MLFLOW_HOST, MLFLOW_PORT), timeout=1):
            break
    except OSError:
        if time.monotonic() >= _deadline:
            raise RuntimeError(
                f"MLflow server did not become reachable on {MLFLOW_HOST}:{MLFLOW_PORT}"
            )
        time.sleep(0.5)

In [11]:
# Point the MLflow client at the local tracking server (127.0.0.1, port 5000).
mlflow.set_tracking_uri("http://127.0.0.1:5000")
# Alternative kept for reference: track straight into a local SQLite file, no server.
# mlflow.set_tracking_uri("sqlite:///mlflow_tracking.sqlite")

In [12]:
experiment = "Bag of Words"
# set_experiment() activates the experiment and creates it first when it does
# not exist yet, so a separate create_experiment() call is unnecessary. The
# previous bare `except:` also hid unrelated failures (e.g. an unreachable
# tracking server) before retrying the very same call.
mlflow.set_experiment(experiment)

# Naive Bayes

In [13]:
# Train a Multinomial Naive Bayes classifier on the TF-IDF matrix and log
# f1 / precision / recall / accuracy for every split plus the fitted model
# in one MLflow run.
run_name = 'NaiveBayes'
with mlflow.start_run(run_name=run_name):
    model = MultinomialNB()
    model.fit(bow_train, df_train['label'])

    train_pred = model.predict(bow_train)
    val_pred = model.predict(bow_val)
    test_pred = model.predict(bow_test)

    # Validation and test predictions were previously computed but never
    # logged; log them with the same metric naming already used for train.
    evaluated_splits = (
        ('train', df_train['label'], train_pred),
        ('val', df_val['label'], val_pred),
        ('test', df_test['label'], test_pred),
    )
    for split_name, labels, predictions in evaluated_splits:
        mlflow.log_metric(f'f1 score - {split_name}', f1_score(labels, predictions))
        mlflow.log_metric(f'precision - {split_name}', precision_score(labels, predictions))
        mlflow.log_metric(f'recall - {split_name}', recall_score(labels, predictions))
        mlflow.log_metric(f'accuracy - {split_name}', accuracy_score(labels, predictions))

    mlflow.sklearn.log_model(model, "NaiveBayes")

# SVM

In [14]:
def _svm_split_metrics(labels, predictions, split):
    """Return f1 / precision / recall / accuracy keyed as '<metric>-<split>'."""
    return {
        f"f1-{split}": f1_score(labels, predictions),
        f"precision-{split}": precision_score(labels, predictions),
        f"recall-{split}": recall_score(labels, predictions),
        f"accuracy-{split}": accuracy_score(labels, predictions),
    }


run_name = 'SVM_Classifier'

# Hyper-parameter grid explored by the search below.
parans = dict(
    C=[0.1, 1, 10, 100],
    kernel=['linear', 'rbf', 'poly'],
    degree=[2],
)

with mlflow.start_run(run_name=run_name):
    # Score each candidate on four metrics; the final estimator is refit
    # with the parameters that give the best f1.
    model = GridSearchCV(
        estimator=svm.SVC(),
        param_grid=parans,
        scoring=['f1', 'precision', 'recall', 'accuracy'],
        refit='f1',
    )
    model.fit(bow_train, df_train['label'])

    best_svm = model.best_estimator_
    train_pred = best_svm.predict(bow_train)
    val_pred = best_svm.predict(bow_val)
    test_pred = best_svm.predict(bow_test)

    # Metrics are computed and logged one split at a time: train, val, test.
    metrics_train = _svm_split_metrics(df_train['label'], train_pred, "train")
    mlflow.log_metrics(metrics_train)

    metrics_val = _svm_split_metrics(df_val['label'], val_pred, "val")
    mlflow.log_metrics(metrics_val)

    metrics_test = _svm_split_metrics(df_test['label'], test_pred, "test")
    mlflow.log_metrics(metrics_test)

    # Record the winning hyper-parameters and the refit estimator.
    mlflow.log_params(model.best_params_)
    mlflow.sklearn.log_model(best_svm, 'svm')

# XGBoost

In [15]:
# Grid-search an XGBoost classifier on the TF-IDF matrix and log the best
# model, its parameters and per-split metrics in one MLflow run.
run_name = 'XGBoost_Classifier'

# Search grid: 2 values for each of max_depth, n_estimators, reg_lambda,
# reg_alpha, gamma and subsample (64 combinations); the rest are fixed.
parans = {
    'learning_rate' : [0.01],
    'max_depth' : [5,10],
    'n_estimators' : [200,500],
    'objective' : ['binary:logistic'],
    # 'mlogloss' is the multiclass log-loss and does not match the binary
    # objective above; 'logloss' is the binary counterpart.
    'eval_metric':['logloss'],
    'seed' : [42],
    'reg_lambda' : [5,8],
    'reg_alpha' : [2,4],
    'gamma' : [3,5],
    'subsample': [0.4,0.8],
    }
with mlflow.start_run(run_name= run_name):
    # Score each candidate on four metrics; the final estimator is refit
    # with the parameters that give the best f1.
    model = GridSearchCV(estimator=xgb.XGBClassifier(),
                    param_grid=parans,
                    scoring=['f1',
                            'precision',
                            'recall',
                            'accuracy'],
                    refit='f1')

    model.fit(bow_train,df_train['label'])


    train_pred = model.best_estimator_.predict(bow_train)
    val_pred = model.best_estimator_.predict(bow_val)
    test_pred = model.best_estimator_.predict(bow_test)

    metrics_train = {
        "f1-train": f1_score(df_train['label'], train_pred),
        "precision-train": precision_score(df_train['label'], train_pred),
        "recall-train": recall_score(df_train['label'], train_pred),
        "accuracy-train": accuracy_score(df_train['label'], train_pred)
    }
    mlflow.log_metrics(metrics_train)


    metrics_val = {
        "f1-val": f1_score(df_val['label'], val_pred),
        "precision-val": precision_score(df_val['label'], val_pred),
        "recall-val": recall_score(df_val['label'], val_pred),
        "accuracy-val": accuracy_score(df_val['label'], val_pred)
    }
    mlflow.log_metrics(metrics_val)

    metrics_test = {
        "f1-test": f1_score(df_test['label'], test_pred),
        "precision-test": precision_score(df_test['label'], test_pred),
        "recall-test": recall_score(df_test['label'], test_pred),
        "accuracy-test": accuracy_score(df_test['label'], test_pred)
    }
    mlflow.log_metrics(metrics_test)
    # Record the winning hyper-parameters alongside the metrics.
    mlflow.log_params(model.best_params_)

    mlflow.sklearn.log_model(model.best_estimator_,'xgboost')

# Load Models

In [19]:
# Load a model through a 'models:/<name>/<stage>' URI.
# NOTE(review): this assumes a model named 'xgboost' has been registered and
# moved to the staging stage outside this notebook — confirm.
#
# The result is bound to `xgb_model`, not `xgb`: `xgb` is the alias of the
# xgboost module imported above, and rebinding it would break a re-run of the
# XGBoost training cell (the same applies to `svm` for the SVM cell).
# nb_model = mlflow.sklearn.load_model('models:/NaiveBayes/Staging')
# svm_model = mlflow.sklearn.load_model('models:/svm/Staging')
xgb_model = mlflow.sklearn.load_model('models:/xgboost/staging')

# Deploy with the MLflow API

In [27]:
# Shell command kept as a bare string for reference (it is not executed here):
# serves the logged XGBoost artifact on 0.0.0.0, port 8001, with --no-conda.
"""
mlflow models serve --host 0.0.0.0 -p 8001 -m './artifacts/1/7df17ebcdb2749eab45b80ee0036a6a0/artifacts/xgboost' --no-conda
"""

1


In [None]:
# Artifact directory of the logged XGBoost model. The bare path was not valid
# Python (the cell could not run); keep it as a named string instead.
XGBOOST_ARTIFACT_PATH = 'mlflow_women_clothing/artifacts/1/7df17ebcdb2749eab45b80ee0036a6a0/artifacts/xgboost'