In [1]:
import pandas as pd
from pathlib import Path
import re
import fasttext
import csv

ModuleNotFoundError: No module named 'fasttext'

# Data import and introduction

In [None]:
# Load the labelled receipt lines; rows containing any missing value are dropped.
file_path = Path('.').resolve() / 'data' / 'recipt_content.xlsx'
recipe_content = pd.read_excel(file_path).dropna()

# Work on a copy so that `recipe_content` is not modified by the later steps.
df = recipe_content.copy()
# Labels are kept as strings: they are later prefixed with '__label__'.
df['score'] = df['score'].astype(str)

# Share of positive rows among ALL rows (not the ones/zeros ratio).
# Comparing numerically avoids depending on the frequency order returned by
# value_counts() and on the label column holding exactly two classes.
# NOTE(review): assumes the labels are numeric-like strings such as '0'/'1'
# or '0.0'/'1.0' - confirm against the spreadsheet.
is_one = df['score'].astype(float).eq(1)
ones = int(is_one.sum())
zeros = int((~is_one).sum())
print(f'Percent of "ones" in dataset {100 * is_one.mean():.2f}%')

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

# Data manipulation and number masking

In [None]:
# Extract the price into a separate column because it can be useful later on, and mask every digit with 'q', which I found to be one of the best letters to use here.
def mask_numbers(value):
    """Return ``value`` with every digit replaced by the letter 'q'.

    The quantifier in the pattern is lazy, so each match consumes a single
    digit: a run of k digits becomes k 'q' characters.
    """
    digit_pattern = re.compile(r'[0-9]+?')
    masked = digit_pattern.sub('q', value)
    return masked

def return_product_and_price(text):
    """Mask the digits of a receipt line and extract its trailing price, if any.

    A line is considered a priced line when it is longer than 25 characters
    and contains a number with two decimals followed by a single character
    from ``0OABC`` and a whitespace character.

    Args:
        text: Raw receipt line.

    Returns:
        A ``(masked_text, price)`` tuple. ``masked_text`` is ``text`` with
        every digit replaced by 'q' (see ``mask_numbers``). ``price`` is the
        matched price with spaces and the trailing code character removed, or
        ``None`` when no price could be extracted.
    """
    masked_text = mask_numbers(text)

    if len(text) <= 25 or not re.search(r'\d+[,.]+?\d\d\s?[0OABC]\s', text):
        return masked_text, None

    # The detection pattern above may match anywhere in the line, while the
    # extraction pattern is anchored to the end of the stripped line. When the
    # price-like token is not the last thing on the line there is nothing to
    # extract, so fall back to "no price" instead of failing on an empty result.
    match = re.search(r'(.*?)(\d+[,. ]+?\d+\s?[ABC0O]{1})$', text.strip())
    if match is None:
        return masked_text, None

    # Drop inner spaces, then cut the final code character off the price.
    formatted_price = match.group(2).strip().replace(" ", '')[:-1]
    return masked_text, formatted_price
    
# Run the row-level extraction once and split the resulting (masked_text, price)
# tuples into two columns, instead of applying the function twice over the frame.
masked_and_price = df['text'].apply(return_product_and_price)
df['masaked_text'] = masked_and_price.str[0]
df['price'] = masked_and_price.str[1]

In [None]:
# Preview the first five rows, now including the masked text and price columns.
df.head()

# Model setup and data split

In [None]:
from sklearn.model_selection import train_test_split
from gensim.utils import simple_preprocess
def to_fasttext_frame(features, labels):
    """Build a ('text', 'score') frame for fastText from one side of the split.

    'text' is the masked text run through gensim's ``simple_preprocess`` and
    re-joined with single spaces; 'score' is the label prefixed with
    '__label__'.
    """
    return pd.DataFrame({
        'text': features['masaked_text'].apply(
            lambda line: ' '.join(simple_preprocess(line))
        ),
        'score': labels.apply(lambda label: '__label__' + label),
    })


X = df[['masaked_text', 'text']]
Y = df['score']
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42
)

# Training frame and held-out frame, prepared identically.
dataset = to_fasttext_frame(X_train, y_train)
ds = to_fasttext_frame(X_test, y_test)

In [None]:
# Write the train/test frames as plain text files for the classifier:
# one row per line, 'text' then 'score', separated by a space, with no header,
# no index and no quoting.
train_path = (Path('data') / 'train.txt').resolve()
test_path = (Path('data') / 'test.txt').resolve()

# Options shared by both exports.
fasttext_csv_options = dict(
    index=False,
    sep=' ',
    header=None,
    quoting=csv.QUOTE_NONE,
    quotechar="",
    escapechar=" ",
)

for frame, destination in ((dataset, train_path), (ds, test_path)):
    frame[['text', 'score']].to_csv(destination, **fasttext_csv_options)

In [None]:
# Fit the supervised fastText classifier on the training file, using word bigrams.
train_file = str(train_path)
model = fasttext.train_supervised(train_file, wordNgrams=2)

# Score the model on the whole test file; the result is shown as the cell output.
test_file = str(test_path)
model.test(test_file)

# Model results

In [None]:
def _label_to_float(label):
    """Convert a label such as '__label__1' (or an already numeric value) to float."""
    return float(str(label).removeprefix('__label__'))


# First label returned by the model for every test line.
ds['predicted_value'] = [model.predict(text)[0][0] for text in ds['text'].tolist()]

# Series.map is used instead of the deprecated DataFrame.applymap. Going through
# str() keeps the cell re-runnable: on a second run 'score' already holds floats,
# which have no .removeprefix method.
for column in ('score', 'predicted_value'):
    ds[column] = ds[column].map(_label_to_float)
ds

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
# True and predicted labels as column vectors.
score = ds['score'].to_numpy().reshape(-1, 1)
predictions = ds['predicted_value'].to_numpy().reshape(-1, 1)

# Per-class precision/recall report first, then the confusion matrix.
for metric in (classification_report, confusion_matrix):
    print(metric(score, predictions))

Model performance is quite good, but let's test a simpler approach.

In [None]:
# Custom rule that decides whether a receipt line has product properties or not.
def is_product(text):
    """Return 1 when ``text`` looks like a product line, otherwise 0.

    A line qualifies when it is longer than 25 characters and contains,
    after a whitespace character, a number with two decimals followed by a
    single character from ``0OABC846`` and another whitespace character.
    """
    if len(text) <= 25:
        return 0
    price_like = re.search(r'\s\d+[,.]+?\d\d\s?[0OABC846]\s', text)
    return 1 if price_like else 0

# assign() builds a new frame instead of writing a column into the slice
# returned by train_test_split, which avoids pandas' SettingWithCopyWarning
# and keeps the cell safe to re-run.
X_test = X_test.assign(determined_by_re=X_test['text'].apply(is_product))

# Labels are stored as strings; convert them once to ints so they are
# comparable with the 0/1 output of is_product.
y_true = y_test.astype(float).astype(int)
print(classification_report(y_true, X_test['determined_by_re']))
print(confusion_matrix(y_true, X_test['determined_by_re']))