In [44]:
import pandas as pd
from pathlib import Path
import re
import fasttext
import csv

In [45]:
file_path = Path('.').resolve() / 'data' / 'explo_file.xlsx'
df= pd.read_excel(file_path).dropna()
df['score'] = df['score'].astype(str)

In [46]:
def mask_numbers(value):
    return re.sub(r'[0-9]+?','q',value)

def return_product_and_price(text):
    if re.search(r'[^\-]\s\d+[,.]+?\d\d\s?[0OABC]\s',text) and len(text) > 25:
        product, price = re.findall(r'(.+)(\d+[,. ]+?\d+\s?[ABC0O]{1})$', text.strip())[0]
        formatted_price = price.strip().replace(" ",'')[:-1]
        return mask_numbers(product) + 'qqqqq', formatted_price
    return mask_numbers(text), None
df['masaked_text'] = df['text'].apply(return_product_and_price).str[0]
df['price'] = df['text'].apply(return_product_and_price).str[1]

In [47]:
from sklearn.model_selection import train_test_split
X = df['masaked_text']
Y = df['score']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=42)

In [48]:
# NLP Preprocessing
from gensim.utils import simple_preprocess

# Importing the dataset
dataset = pd.DataFrame({'value':X_train,'score':y_train})
ds = pd.DataFrame({'value':X_test,'score':y_test})

# NLP Preprocess
dataset.iloc[:, 0] = dataset.iloc[:, 0].apply(lambda x: ' '.join(simple_preprocess(x)))
ds.iloc[:, 0] = ds.iloc[:, 0].apply(lambda x: ' '.join(simple_preprocess(x)))

# Prefixing each row of the category column with '__label__'
dataset.iloc[:, 1] = dataset.iloc[:, 1].apply(lambda x: '__label__' + x)
ds.iloc[:, 1] = ds.iloc[:, 1].apply(lambda x: '__label__' + x)

In [49]:
# Saving the CSV file as a text file to train/test the classifier
train_path = (Path('data') / 'train.txt').resolve()
test_path = (Path('data') / 'test.txt').resolve()

dataset[['value', 'score']].to_csv(train_path, 
                                          index = False, 
                                          sep = ' ',
                                          header = None, 
                                          quoting = csv.QUOTE_NONE, 
                                          quotechar = "", 
                                          escapechar = " ")

ds[['value', 'score']].to_csv(test_path, 
                                     index = False, 
                                     sep = ' ',
                                     header = None, 
                                     quoting = csv.QUOTE_NONE, 
                                     quotechar = "", 
                                     escapechar = " ")

In [50]:
# Training the fastText classifier
model = fasttext.train_supervised(str(train_path), wordNgrams = 2)

# Evaluating performance on the entire test file
model.test(str(test_path))                      

(124, 0.9516129032258065, 0.9516129032258065)

In [51]:
ds['predictions'] = [model.predict(text) for text in ds['value'].str.lower().tolist()]

In [52]:
def evaluate_model(model_output):
    thresh = 0.6
    score = model_output[0][0][-1]
    accuracy = model_output[1][0]
    if score == '1' and accuracy < thresh:
        return 0
    if score == '1' and accuracy > thresh:
        return 1
    if score == '0' and accuracy < thresh:
        return 1
    if score == '0' and accuracy > thresh:
        return 0

In [53]:
ds['predicted_value'] = ds['predictions'].apply(evaluate_model)

In [54]:
from sklearn.metrics import classification_report, confusion_matrix

predictions = ds['predictions'].apply(evaluate_model).values.reshape((-1,1))
score =  ds['score'].str.replace('__label__','').astype(int).values.reshape((-1,1))
print(classification_report(score,predictions))
print(confusion_matrix(score, predictions))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97        89
           1       1.00      0.83      0.91        35

    accuracy                           0.95       124
   macro avg       0.97      0.91      0.94       124
weighted avg       0.95      0.95      0.95       124

[[89  0]
 [ 6 29]]
