In [1]:
# Linear algebra
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# NLP
import nltk

from tqdm import tqdm

In [2]:
ADDITIONAL_STOPWORDS = ['covfefe']

# N-grams extraction

## Load Data

In [3]:
# Load the whole clean dataset
file ='../raw_data/ocr_labeled.csv'
off_df_base = pd.read_csv(file)

In [4]:
# Deep copy of the dataframe to avoid to reload it
df = off_df_base.copy()  # Renew the DataFrame

In [14]:
# Output final
    # Un pattern
    # Type de pattern
    # OCR dans lequel il est présent
    # Catégorie
    # Occurence dans chaque catégorie
    # Occurence totale

# Pour chaque OCR Clean
    # Convertir en liste de mots
    # Extraire les unigrams et leur occurences

## N_grams extraction function

In [7]:
def get_ngrams(text, gram_size=1):
    '''Generates a list of n-grams for a given text with a given n_gram_size
       Returns a tuple: (n_gram_size, list of n_grams)'''
    
    # convert ocr to list
    words = text.split()
    
    # extract n_grams
    patterns = list(nltk.ngrams(words, gram_size))
    
    # return n_grams strings instead of tuples              
    if gram_size == 1:
        n_grams = [p[0] for p in patterns]
    else: n_grams = [
                    ' '.join(p[i] for i in range(len(p))) \
                    for p in patterns
                    ]    

    # return gram_size and list of n_grams
    return (gram_size, n_grams)


#df['patterns'] = df.clean_text.head(2).apply(lambda x: get_ngrams(x, gram_size=1))

## Generation of the DataFrame

In [8]:
indexes = []
patterns = []
n_gram_size = []

# Iterate over each OCR
for i,text in tqdm(df['clean_text'].iteritems(), total=df.shape[0]):
    
    # Initialize a vocab length
    len_vocab = 0

    # Iterate over each n_gram size
    for k in range(10):
        results = get_ngrams(text, gram_size=k+1)  # extract n_grams
        patterns.extend(results[1])                # store them in patterns
        len_vocab = len_vocab + len(results[1])    # store number of patterns
        n_gram_size.extend([f'{k+1}-grams'] * len(results[1])) # store size
    indexes.extend([i] * len_vocab)

100%|██████████| 434896/434896 [05:48<00:00, 1249.32it/s]


In [9]:
len(indexes)

235933703

In [10]:
# Convert list to pd.Series
print('start')
indexes = pd.Series(indexes)
print('indexes')
patterns = pd.Series(patterns)
print('patterns')
n_gram_size = pd.Series(n_gram_size)
print('n_gram_size')

start
indexes
patterns
n_gram_size


In [11]:
# Initialize a DataFrame
data = pd.DataFrame()

In [12]:
indexes.head(1)

0    0
dtype: int64

In [14]:
# Create the Pattern DataFrame
data[['index','size','pattern']] = pd.concat([indexes, n_gram_size, patterns], axis=1)

In [17]:
data.head(1)

Unnamed: 0_level_0,size,pattern
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1-grams,ne


In [19]:
# Retrieve categories, barcode, clean_text
#data.set_index('index', inplace=True)

patterns_df = pd.DataFrame()
patterns_df = pd.merge(
    data,
    df[['pnns_groups_2','clean_text','barcode','source']],
    left_index=True,
    right_index=True,
)

In [20]:
print(patterns_df.shape)
patterns_df.tail(3)

(235933703, 6)


Unnamed: 0,size,pattern,pnns_groups_2,clean_text,barcode,source
434895,8-grams,ingredients tomate oignorn huile dolive persil...,dressings and sauces,ingredients tomate oignorn huile dolive persil...,3474320000113,/347/432/000/0113/2.json
434895,8-grams,tomate oignorn huile dolive persil sel et poivre,dressings and sauces,ingredients tomate oignorn huile dolive persil...,3474320000113,/347/432/000/0113/2.json
434895,9-grams,ingredients tomate oignorn huile dolive persil...,dressings and sauces,ingredients tomate oignorn huile dolive persil...,3474320000113,/347/432/000/0113/2.json


## Calculate frequencies

In [30]:
df.head(1)

Unnamed: 0,barcode,clean_text,fr_text,source,pnns_groups_1,pnns_groups_2
0,3199660476748,ne eleve abattu en bretagne les eleveurs de br...,NE\nELEVE\nABATTU\nEN BRETAGNE\nLES ÉLEVEURS\n...,/319/966/047/6748/1.json,fish meat eggs,meat


In [32]:
categories = sorted(list(df.pnns_groups_2.unique()))

In [41]:
ex_1 = patterns_df.head(10009).tail(2)

In [42]:
ex_2 = patterns_df.head(2)

In [73]:
ex = pd.concat([ex_1,ex_2])

In [74]:
ex

Unnamed: 0,size,pattern,pnns_groups_2,clean_text,barcode,source
20,1-grams,incorp,sweets,cons a conse dans s se cone apres avan sucre a...,3196203800022,/319/620/380/0022/5.json
20,1-grams,le,sweets,cons a conse dans s se cone apres avan sucre a...,3196203800022,/319/620/380/0022/5.json
0,1-grams,ne,meat,ne eleve abattu en bretagne les eleveurs de br...,3199660476748,/319/966/047/6748/1.json
0,1-grams,eleve,meat,ne eleve abattu en bretagne les eleveurs de br...,3199660476748,/319/966/047/6748/1.json


In [80]:
from sklearn.preprocessing import OneHotEncoder
ohen = OneHotEncoder(sparse = False)
pnns_encoded = ohen.fit_transform(ex[['pnns_groups_2']])
categories = sorted(list(ex.pnns_groups_2.unique()))
for i,cat in enumerate(categories):
    print(pnns_encoded[:,i])
    ex[f'{cat}'] = pnns_encoded[:,i]


[0. 0. 1. 1.]
[1. 1. 0. 0.]


In [84]:
ex.groupby('pattern').sum(categories)

Unnamed: 0_level_0,barcode,meat,sweets
pattern,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
eleve,3199660476748,1.0,0.0
incorp,3196203800022,0.0,1.0
le,3196203800022,0.0,1.0
ne,3199660476748,1.0,0.0


## Load CSV

In [None]:
patterns_df.to_csv('../raw_data/patterns_df.csv', chunksize=100_000, compression='gzip', index=False)