In [69]:
import pandas as pd
from pathlib import Path
import re
import fasttext
import csv

# Data import and introduction

In [70]:
file_path = Path('.').resolve() / 'data' / 'recipt_content.xlsx'
recipe_content = pd.read_excel(file_path).dropna()
df = recipe_content.copy()
df['score'] = df['score'].astype(str)
zeros, ones = df['score'].value_counts().values
print(f'Percent of "ones" in dataset {100*ones/zeros:.2f}%')

Percent of "ones" in dataset 36.36%


In [71]:
df.head()

Unnamed: 0,text,score
0,Biscuska Hinc Spółka Jawna,0.0
1,ul. US. Rosochy 47,0.0
2,27-Www ustrowiec Świętokrzyski,0.0
3,lipieka Gemini,0.0
4,ul. Smugowa 6,0.0


# Data manipulation and number masking

In [72]:
def mask_numbers(value):
    return re.sub(r'[0-9]+?','q',value)

def return_product_and_price(text):
    if re.search(r'[^\-]\s\d+[,.]+?\d\d\s?[0OABC]\s',text) and len(text) > 25:
        product, price = re.findall(r'(.+)(\d+[,. ]+?\d+\s?[ABC0O]{1})$', text.strip())[0]
        formatted_price = price.strip().replace(" ",'')[:-1]
        return mask_numbers(product) + 'qqqqqq', formatted_price
    return mask_numbers(text), None
    
df['masaked_text'] = df['text'].apply(return_product_and_price).str[0]
df['price'] = df['text'].apply(return_product_and_price).str[1]

# Model setup and data split

In [73]:
from sklearn.model_selection import train_test_split
from gensim.utils import simple_preprocess
X = df[['masaked_text','text']]
Y = df['score']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=42)

dataset = pd.DataFrame({'text':X_train['masaked_text'],'score':y_train})
ds = pd.DataFrame({'text':X_test['masaked_text'],'score':y_test})

# NLP Preprocess
dataset.iloc[:, 0] = dataset.iloc[:, 0].apply(lambda x: ' '.join(simple_preprocess(x)))
ds.iloc[:, 0] = ds.iloc[:, 0].apply(lambda x: ' '.join(simple_preprocess(x)))

# Prefixing each row of the category column with '__label__'
dataset.iloc[:, 1] = dataset.iloc[:, 1].apply(lambda x: '__label__' + x)
ds.iloc[:, 1] = ds.iloc[:, 1].apply(lambda x: '__label__' + x)

In [74]:
# Saving the dataframes as a text file to train/test the classifier
train_path = (Path('data') / 'train.txt').resolve()
test_path = (Path('data') / 'test.txt').resolve()

dataset[['text', 'score']].to_csv(train_path, 
                                          index = False, 
                                          sep = ' ',
                                          header = None, 
                                          quoting = csv.QUOTE_NONE, 
                                          quotechar = "", 
                                          escapechar = " ")

ds[['text', 'score']].to_csv(test_path, 
                                     index = False, 
                                     sep = ' ',
                                     header = None, 
                                     quoting = csv.QUOTE_NONE, 
                                     quotechar = "", 
                                     escapechar = " ")

In [75]:
# Training the fastText classifier
model = fasttext.train_supervised(str(train_path), wordNgrams = 2)

# Evaluating performance on the entire test file
model.test(str(test_path))                      

(203, 0.9507389162561576, 0.9507389162561576)

# Model results

In [76]:
def evaluate_model(model_output):
    thresh = 0.9
    score = model_output[0][0][-1]
    accuracy = model_output[1][0]
    if score == '1' and accuracy > thresh:
        return 1
    if score == '1' and accuracy < thresh:
        return 0
    if score == '0' and accuracy > thresh:
        return 0
    if score == '0' and accuracy < thresh:
        return 1

ds['predictions'] = [model.predict(text) for text in ds['text'].str.lower().tolist()]
ds['predicted_value'] = ds['predictions'].apply(evaluate_model)

In [77]:
from sklearn.metrics import classification_report, confusion_matrix
predictions = ds['predictions'].apply(evaluate_model).values.reshape((-1,1))
score =  ds['score'].str.replace('__label__','').map(lambda x: int(float(x))).values.reshape((-1,1))
print(classification_report(score,predictions))
print(confusion_matrix(score, predictions))

              precision    recall  f1-score   support

           0       0.80      0.88      0.84       151
           1       0.51      0.37      0.43        52

    accuracy                           0.75       203
   macro avg       0.66      0.62      0.63       203
weighted avg       0.73      0.75      0.73       203

[[133  18]
 [ 33  19]]


As this model is not efficient yet its maybe better to use different approach

In [78]:
# use of custom function to determine whather element has product properties or not
def is_product(text):
    if re.search(r'\s\d+[,.]+?\d\d\s?[0OABC846]\s', text) and len(text) > 25:
        return 1
    return 0

X_test['determined_by_re'] = X_test['text'].apply(is_product)
print(classification_report(y_test.astype(float).astype(int),X_test['determined_by_re']))
print(confusion_matrix(y_test.astype(float).astype(int),X_test['determined_by_re']))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       151
           1       1.00      0.96      0.98        52

    accuracy                           0.99       203
   macro avg       0.99      0.98      0.99       203
weighted avg       0.99      0.99      0.99       203

[[151   0]
 [  2  50]]
