In [None]:
import re
import string
import warnings

import pandas as pd

warnings.filterwarnings('ignore')
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Add the project-local NLTK data directory to the paths NLTK searches for its resources.
nltk.data.path.append("../data/nltk_data/")
# Shared Porter stemmer instance (not referenced by any other visible cell).
porter = PorterStemmer()
# Shared WordNet lemmatizer instance; Preprocessing.lemmatize reads this module-level name.
lemmatizer = WordNetLemmatizer()


In [None]:
# Load the UPC table; this extract only holds records from 2017 and 2018.
upc = pd.read_csv(filepath_or_buffer="../final_data/pd_pos_all1718_public.csv")  # upc

In [None]:
# Discard the leading column of the UPC table
# (presumably the row index written out with the CSV — TODO confirm).
upc = upc.drop(columns=upc.columns[0])

In [None]:
# Load the PPC table, discard its leading column, then preview three random rows.
ppc = pd.read_csv(filepath_or_buffer="../final_data/ppc20172018_publictest.csv")  # ppc
ppc = ppc.drop(columns=ppc.columns[0])
ppc.sample(n=3)

In [None]:
# Keep only the PPC rows whose 'ec' value is neither the string '-70' nor '-90'.
# NOTE(review): the comparison is against strings; if 'ec' was parsed as a
# numeric column this filter removes nothing — confirm the column dtype.
ppc = ppc.loc[~ppc['ec'].isin(['-70', '-90'])]

In [None]:
# Distinct UPC codes that remain in the filtered PPC table.
valid_upc = set(ppc['upc'])

In [None]:
# Keep only the UPCs that appear in the PPC table.
# The explicit copy makes the result an independent frame, so the column
# assignments in later cells do not operate on a filtered slice of the
# original table (which pandas flags with SettingWithCopyWarning).
upc = upc.loc[upc['upc'].isin(valid_upc)].copy()

In [None]:
# Every UPC description carries the UPC code as a '-'-separated suffix.
# Only the text before the first '-' is kept as the working description.
upc = upc.assign(upc_description=upc['upcdesc'].str.split('-', n=1).str[0])

In [None]:
# Replace every missing value in the table with an empty string so the text
# columns can be concatenated later without propagating NaN.
upc = upc.fillna(value='')

In [None]:
# More text columns are appended to the description to give it extra context.

# 'deptid' and 'aisle' keep only the second '-'-separated piece of their value
# (presumably "<code>-<name>" — TODO confirm the format).
# Values that are empty or contain no '-' have no second piece, so the lookup
# yields NaN; refill those with '' so they do not end up in the description as
# the literal text "nan" after the string cast below.
for column in ('deptid', 'aisle'):
    upc[column] = upc[column].str.split('-').str[1].fillna('')

upc = upc.astype({"flavor" : str, "deptid" : str, "aisle": str,'category' : str, 'brand':str, 'manufacturer':str, 'parent':str})

# Columns whose text is appended (space-separated, in this order) to the description.
column_list = ['deptid', 'aisle', 'category']

for column in column_list:
    upc['upc_description'] = upc['upc_description'] + ' ' + upc[column]

In [None]:
class Preprocessing():
    """Step-by-step cleaner for a single piece of text.

    The original input is kept in ``self.text``; every step reads the working
    value ``self.new_text``, stores the transformed value back into it and
    returns it. String steps (lowercase, character removal) are meant to run
    before ``tokenize``; token steps (``remove_letters``, ``remove_words``,
    ``lemmatize``) are meant to run after it.

    ``remove_letters``/``remove_words`` rely on the module-level NLTK
    ``stopwords`` corpus and ``lemmatize`` on the module-level ``lemmatizer``.
    """

    # Extra words dropped in addition to NLTK's English stopwords.
    EXTRA_STOPWORDS = ('rfg', 'regular', 'label', 'private')

    # Runs of characters that are neither ASCII letters nor digits.
    _NON_ALNUM = re.compile('[^A-Za-z0-9]+')

    # Lazily-built stopword set shared by all instances (see _stopword_set).
    _stopword_cache = None

    def __init__(self, text):
        """Store ``text`` and use it as the initial working value."""
        self.text = text
        # Seed the working value so any step can be the first one called;
        # previously only text_lowercase() created this attribute.
        self.new_text = text

    @classmethod
    def _stopword_set(cls):
        """Return the stopword set, building it on first use.

        A set gives constant-time membership tests, and caching avoids
        re-reading the NLTK stopword list for every processed text.
        """
        if cls._stopword_cache is None:
            cls._stopword_cache = frozenset(stopwords.words('english')) | frozenset(cls.EXTRA_STOPWORDS)
        return cls._stopword_cache

    def text_lowercase(self):
        """Restart from the original text, lowercased."""
        self.new_text = self.text.lower()
        return self.new_text

    def remove_numbers(self):
        """Replace each run of non-alphanumeric characters with one space.

        NOTE(review): despite its name this step keeps digits; it applies the
        same pattern as remove_characters(). Behaviour is left unchanged
        because the generated descriptions depend on it.
        """
        self.new_text = self._NON_ALNUM.sub(' ', self.new_text)
        return self.new_text

    def remove_punctuation(self):
        """Delete every character listed in ``string.punctuation``."""
        translator = str.maketrans("", "", string.punctuation)
        self.new_text = self.new_text.translate(translator)
        return self.new_text

    def remove_characters(self):
        """Replace each run of non-alphanumeric characters with one space."""
        self.new_text = self._NON_ALNUM.sub(' ', self.new_text)
        return self.new_text

    def remove_letters(self):
        """Keep only tokens longer than two characters that are not stopwords."""
        stopwords_ = self._stopword_set()
        self.new_text = [i for i in self.new_text if len(i) > 2 and i not in stopwords_]
        return self.new_text

    def lemmatize(self):
        """Lemmatize every token, treating it as a verb (POS tag ``'v'``)."""
        self.new_text = [lemmatizer.lemmatize(token, 'v') for token in self.new_text]
        return self.new_text

    def remove_words(self):
        """Drop every token that is a stopword."""
        stopwords_ = self._stopword_set()
        self.new_text = [token for token in self.new_text if token not in stopwords_]
        return self.new_text

    def tokenize(self):
        """Split the working text on single spaces.

        If the working value has no ``split`` method (for example it is
        ``None`` or already a list), the result is an empty list.
        """
        try:
            self.new_text = self.new_text.split(" ")
        except AttributeError:
            self.new_text = []
        return self.new_text
    
def create_tokens(phrase):
    """Clean ``phrase`` and return its remaining tokens joined by single spaces.

    The text is lowercased, runs of non-alphanumeric characters are replaced
    by a space (two passes), the result is split on spaces, tokens of two
    characters or fewer and stopwords are dropped, and the surviving tokens
    are lemmatized as verbs.
    """
    cleaner = Preprocessing(phrase)

    # Each step updates the cleaner's working text in place; order matters.
    pipeline = (
        cleaner.text_lowercase,
        cleaner.remove_numbers,
        cleaner.remove_characters,
        cleaner.tokenize,
        cleaner.remove_letters,
    )
    for step in pipeline:
        step()

    return " ".join(cleaner.lemmatize())

In [None]:
from tqdm import tqdm

# Clean every combined description, showing a progress bar while it runs.
upc_results = [create_tokens(description) for description in tqdm(upc["upc_description"])]

In [None]:
# Keep only the code and description for now.
# .assign() returns a new frame with the cleaned descriptions in place of the
# raw ones, instead of writing a column into a column-subset of `upc`
# (chained assignment, which pandas flags with SettingWithCopyWarning).
upc_cleaned = upc[['upc', 'upc_description']].assign(upc_description=upc_results)

In [None]:
# Show the cleaned description of the first row as a sanity check.
upc_cleaned['upc_description'].iloc[0]

In [None]:
# Rename 'upc' to 'upc_code' for consistency.
upc_cleaned = upc_cleaned.rename({'upc': 'upc_code'}, axis='columns')

In [None]:
# Number of distinct UPC codes (a missing code, if any, counts as one value).
upc_cleaned["upc_code"].nunique(dropna=False)

In [None]:
# The same UPC can appear with different descriptions across years.
# For now only the first row found for each UPC code is kept and the later
# duplicates are dropped.
upc_cleaned = upc_cleaned.drop_duplicates(subset='upc_code', keep='first')

In [None]:
# Write the cleaned UPC table to disk without the row index.
upc_cleaned.to_csv(path_or_buf='../final_data/1718upc_cleaned.csv', index=False)

In [None]:
# Write the filtered PPC table to disk without the row index.
ppc.to_csv(path_or_buf='../final_data/1718ppc_cleaned.csv', index=False)