In [1]:
import pandas as pd
from pathlib import Path
import re
import fasttext
import csv

# Data import and introduction

In [2]:
file_path = Path('.').resolve() / 'data' / 'recipt_content.xlsx'
recipe_content = pd.read_excel(file_path).dropna()
df = recipe_content.copy()
df['score'] = df['score'].astype(str)
zeros, ones = df['score'].value_counts().values
print(f'Percent of "ones" in dataset {100*ones/zeros:.2f}%')

Percent of "ones" in dataset 36.36%


In [3]:
df.head()

Unnamed: 0,text,score
0,Biscuska Hinc Spółka Jawna,0.0
1,ul. US. Rosochy 47,0.0
2,27-Www ustrowiec Świętokrzyski,0.0
3,lipieka Gemini,0.0
4,ul. Smugowa 6,0.0


# Data manipulation and number masking

In [4]:
# extract price to another column bc it can be usefull later on, and mask all numbers with 'q' as i found it, as one of best letters to use here
def mask_numbers(value):
    return re.sub(r'[0-9]+?','q',value)

def return_product_and_price(text):
    if re.search(r'\d+[,.]+?\d\d\s?[0OABC]\s',text) and len(text) > 25:
        product, price = re.findall(r'(.*?)(\d+[,. ]+?\d+\s?[ABC0O]{1})$', text.strip())[0]
        formatted_price = price.strip().replace(" ",'')[:-1]
        return mask_numbers(text), formatted_price
    return mask_numbers(text), None
    
df['masaked_text'] = df['text'].apply(return_product_and_price).str[0]
df['price'] = df['text'].apply(return_product_and_price).str[1]

In [5]:
df.head(5)

Unnamed: 0,text,score,masaked_text,price
0,Biscuska Hinc Spółka Jawna,0.0,Biscuska Hinc Spółka Jawna,
1,ul. US. Rosochy 47,0.0,ul. US. Rosochy qq,
2,27-Www ustrowiec Świętokrzyski,0.0,qq-Www ustrowiec Świętokrzyski,
3,lipieka Gemini,0.0,lipieka Gemini,
4,ul. Smugowa 6,0.0,ul. Smugowa q,


# Model setup and data split

In [6]:
from sklearn.model_selection import train_test_split
from gensim.utils import simple_preprocess
X = df[['masaked_text','text']]
Y = df['score']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

dataset = pd.DataFrame({'text':X_train['masaked_text'],'score':y_train})
ds = pd.DataFrame({'text':X_test['masaked_text'],'score':y_test})

# NLP Preprocess
dataset.iloc[:, 0] = dataset.iloc[:, 0].apply(lambda x: ' '.join(simple_preprocess(x)))
ds.iloc[:, 0] = ds.iloc[:, 0].apply(lambda x: ' '.join(simple_preprocess(x)))

# Prefixing each row of the category column with '__label__'
dataset.iloc[:, 1] = dataset.iloc[:, 1].apply(lambda x: '__label__' + x)
ds.iloc[:, 1] = ds.iloc[:, 1].apply(lambda x: '__label__' + x)

In [7]:
# Saving the dataframes as a text file to train/test the classifier
train_path = (Path('data') / 'train.txt').resolve()
test_path = (Path('data') / 'test.txt').resolve()

dataset[['text', 'score']].to_csv(train_path, 
                                          index = False, 
                                          sep = ' ',
                                          header = None, 
                                          quoting = csv.QUOTE_NONE, 
                                          quotechar = "", 
                                          escapechar = " ")

ds[['text', 'score']].to_csv(test_path, 
                                     index = False, 
                                     sep = ' ',
                                     header = None, 
                                     quoting = csv.QUOTE_NONE, 
                                     quotechar = "", 
                                     escapechar = " ")

In [8]:
# Training the fastText classifier
model = fasttext.train_supervised(str(train_path), wordNgrams = 2)

# Evaluating performance on the entire test file
model.test(str(test_path))                      

(169, 0.9467455621301775, 0.9467455621301775)

# Model results

In [9]:
ds['predicted_value'] = [model.predict(text)[0][0] for text in ds['text'].tolist()]
ds[['score','predicted_value']] = ds[['score','predicted_value']].applymap(lambda x: float(x.removeprefix('__label__')))
ds

Unnamed: 0,text,score,predicted_value
398,qqqqq qqqqqqqq kierownik,0.0,0.0
303,suma ptu qq qq,0.0,0.0
675,karta millennium debit psn qq,0.0,0.0
543,curaorbit bag qq qqg,1.0,0.0
379,,0.0,0.0
...,...,...,...
328,,0.0,0.0
450,sprzedaz opodatkohana qq qq,0.0,0.0
390,beata teresa sildak,0.0,0.0
104,igor,0.0,0.0


In [10]:
from sklearn.metrics import classification_report, confusion_matrix
predictions = ds['predicted_value'].values.reshape((-1,1))
score =  ds['score'].values.reshape((-1,1))
print(classification_report(score,predictions))
print(confusion_matrix(score, predictions))

              precision    recall  f1-score   support

         0.0       0.95      0.98      0.97       129
         1.0       0.94      0.82      0.88        40

    accuracy                           0.95       169
   macro avg       0.95      0.90      0.92       169
weighted avg       0.95      0.95      0.95       169

[[127   2]
 [  7  33]]


Model efficiency is quite good but lets test simpler approach

In [11]:
# use of custom function to determine whather element has product properties or not
def is_product(text):
    if re.search(r'\s\d+[,.]+?\d\d\s?[0OABC846]\s', text) and len(text) > 25:
        return 1
    return 0

X_test['determined_by_re'] = X_test['text'].apply(is_product)
print(classification_report(y_test.astype(float).astype(int),X_test['determined_by_re']))
print(confusion_matrix(y_test.astype(float).astype(int),X_test['determined_by_re']))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       129
           1       1.00      0.95      0.97        40

    accuracy                           0.99       169
   macro avg       0.99      0.97      0.98       169
weighted avg       0.99      0.99      0.99       169

[[129   0]
 [  2  38]]
