In [None]:
import re
import string
import warnings

import pandas as pd

warnings.filterwarnings('ignore')
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Add the repository-local NLTK data directory to NLTK's resource search path.
nltk.data.path.append("../nltk_data/")
# Module-level stemmer/lemmatizer instances. `lemmatizer` is used by
# Preprocessing.lemmatize; `porter` is not referenced in the visible cells.
porter = PorterStemmer()
lemmatizer = WordNetLemmatizer()


In [None]:
# Main EC table.
ec = pd.read_csv("../final_data/ec.csv")
# Ingredient EC table; it is concatenated onto the main table further down.
ingre = pd.read_csv("../final_data/ingredient.csv")

In [None]:
# Keep only the first two columns (by position) of the main EC table.
ec = ec.iloc[:,:2]

In [None]:
# Give the two remaining columns fixed names:
# first column -> 'ec_code', second column -> 'ec_description'.
ec = ec.rename(columns={ec.columns[1]: 'ec_description', ec.columns[0]: 'ec_code'})

In [None]:
# Rename the ingredient table's first two columns to the same names used for
# `ec` ('ec_code', 'ec_description') so the two tables share column labels.
# NOTE(review): unlike `ec`, `ingre` is not truncated to its first two columns;
# any additional columns would be carried through the later concat - confirm
# that ingredient.csv only has two columns.
ingre = ingre.rename(columns = {ingre.columns[1]:"ec_description", ingre.columns[0]:'ec_code'})

In [None]:
class Preprocessing():
    """Incremental text cleaner for a single description string.

    The original input is kept on ``self.text``. Every step reads the working
    value ``self.new_text``, replaces it with the transformed value and
    returns it, so the steps are order-dependent:

    * ``text_lowercase``, ``remove_numbers``, ``remove_punctuation`` and
      ``remove_characters`` operate on a string;
    * ``tokenize`` turns that string into a list of tokens;
    * ``remove_letters``, ``remove_words`` and ``lemmatize`` operate on the
      token list.
    """

    # Project-specific words dropped in addition to NLTK's English stopwords.
    EXTRA_STOPWORDS = ('rfg', 'regular', "label", "private")

    # Matches any run of characters that are not ASCII letters or digits.
    _NON_ALPHANUMERIC = re.compile('[^A-Za-z0-9]+')

    # Built on first use by _stopword_set() so the NLTK corpus is read once
    # instead of once per call.
    _stopword_cache = None

    def __init__(self, text):
        """Store ``text`` and seed the working text.

        Args:
            text: The raw description. Non-string values (e.g. ``None`` or a
                float NaN) are treated as empty text by the cleaning steps.
        """
        self.text = text
        # Seed the working value so any step can be called first without
        # raising AttributeError on a missing ``new_text``.
        self.new_text = text if isinstance(text, str) else ""

    @classmethod
    def _stopword_set(cls):
        """Return the cached set of NLTK English stopwords plus EXTRA_STOPWORDS."""
        if cls._stopword_cache is None:
            cls._stopword_cache = (
                frozenset(stopwords.words('english')) | frozenset(cls.EXTRA_STOPWORDS)
            )
        return cls._stopword_cache

    def text_lowercase(self):
        """Reset the working text to the lowercased original input.

        Returns:
            The lowercased string, or ``""`` when the original input is not a
            string (it has no ``lower`` method to call).
        """
        self.new_text = self.text.lower() if isinstance(self.text, str) else ""
        return self.new_text

    def remove_numbers(self):
        """Replace each run of non-alphanumeric characters with one space.

        Despite its name this step keeps digits: the pattern only removes
        characters outside ``A-Za-z0-9``, exactly like ``remove_characters``.
        """
        self.new_text = self._NON_ALPHANUMERIC.sub(' ', self.new_text)
        return self.new_text

    def remove_punctuation(self):
        """Delete every character listed in ``string.punctuation``."""
        translator = str.maketrans("", "", string.punctuation)
        self.new_text = self.new_text.translate(translator)
        return self.new_text

    def remove_characters(self):
        """Replace each run of non-alphanumeric characters with one space."""
        self.new_text = self._NON_ALPHANUMERIC.sub(' ', self.new_text)
        return self.new_text

    def remove_letters(self):
        """Drop tokens of two characters or fewer and all stopwords.

        Returns:
            The filtered token list (also stored on ``self.new_text``).
        """
        blocked = self._stopword_set()
        self.new_text = [
            token for token in self.new_text
            if len(token) > 2 and token not in blocked
        ]
        return self.new_text

    def lemmatize(self):
        """Lemmatize each token as a verb (POS ``'v'``) with the module-level ``lemmatizer``."""
        self.new_text = [lemmatizer.lemmatize(token, 'v') for token in self.new_text]
        return self.new_text

    def remove_words(self):
        """Drop stopwords from the token list, whatever their length."""
        blocked = self._stopword_set()
        self.new_text = [token for token in self.new_text if token not in blocked]
        return self.new_text

    def tokenize(self):
        """Split the working text on single spaces.

        Returns:
            The list of tokens, or ``[]`` when the working value is not a
            string (for example when it has already been tokenized).
        """
        if isinstance(self.new_text, str):
            self.new_text = self.new_text.split(" ")
        else:
            # Same fallback the previous blanket ``except`` produced for
            # values that cannot be split.
            self.new_text = []
        return self.new_text
    
def create_tokens(phrase):
    """Clean ``phrase`` and return its surviving tokens joined by single spaces.

    Runs the ``Preprocessing`` steps in a fixed order: lowercase, two
    non-alphanumeric-to-space passes (``remove_numbers`` followed by
    ``remove_characters``), split on spaces, drop short tokens and stopwords,
    and finally lemmatize what is left.

    Args:
        phrase: The raw description text.

    Returns:
        A single string with the lemmatized tokens separated by one space.
    """
    cleaner = Preprocessing(phrase)
    # Each step updates the cleaner's working text in place, so only the
    # order of the calls matters; intermediate return values are not needed.
    pipeline = (
        cleaner.text_lowercase,
        cleaner.remove_numbers,
        cleaner.remove_characters,
        cleaner.tokenize,
        cleaner.remove_letters,
    )
    for step in pipeline:
        step()
    return " ".join(cleaner.lemmatize())

In [None]:
from tqdm import tqdm

# Stack the ingredient rows underneath the main EC rows.
ec_cleaned = pd.concat([ec, ingre])

# Run every description through create_tokens (lowercase, letters and digits
# only, short tokens and stopwords removed); tqdm shows progress.
ec_results = [
    create_tokens(description)
    for description in tqdm(ec_cleaned["ec_description"])
]
ec_cleaned["ec_description"] = ec_results

# The same EC code can appear with different descriptions across years, and
# ingredient records can be duplicated. For now only one row per ec_code is
# kept (drop_duplicates keeps the first occurrence by default).
ec_cleaned = ec_cleaned.drop_duplicates('ec_code')

In [None]:
# PPC table (the file name suggests the 2017-2018 public test set - confirm).
ppc = pd.read_csv("../final_data/ppc20172018_publictest.csv")
# Distinct values of the PPC 'ec' column; used below to filter ec_cleaned.
valid_ec = set(ppc['ec'].tolist())

In [None]:
# Keep only the EC codes that also occur in the PPC data.
# Negative EC codes do not need to be filtered out explicitly here because
# they do not exist in the EC table in the first place.
ec_cleaned = ec_cleaned.loc[ec_cleaned['ec_code'].isin(valid_ec)]

In [None]:
# Write the filtered, cleaned EC table to disk without the DataFrame index.
ec_cleaned.to_csv('../final_data/1718ec_cleaned.csv', index=False)