## 0. Notebook Parameters

---

### Notebook Settings

In [None]:
"""Google Colab settings"""
# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

In [4]:
"""Jupyter settings"""
# Enable autoreload
%load_ext autoreload
%autoreload 2

# Pylint parameters
%config Completer.use_jedi = False

# Measure Runtime
# !pip install ipython-autotime
%load_ext autotime

time: 287 µs (started: 2021-03-03 10:55:24 +01:00)


### Imported Packages

#### Packages Usually Needed

In [5]:
"""Packages for manipulation of vectors, arrays, dataframes"""
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None) # Change display settings of pandas

"""Packages for data visualization"""
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

time: 1.21 s (started: 2021-03-03 10:55:25 +01:00)


#### Packages Specific to the Notebook

In [None]:
# natural language processing: n-gram ranking
import re
import unicodedata
import nltk
from nltk.corpus import stopwords
# add appropriate words that will be ignored in the analysis
ADDITIONAL_STOPWORDS = ['covfefe', 'dont', 'e', 'g', 'kj', 'kcal',]

import matplotlib.pyplot as plt

## 1. Calculate Frequencies of N-grams

### Import the Datasets

In [None]:
# Load the datasets of ngrams

# Datasets with more ngrams (noisier)
df_ngrams_ext_1_base = pd.read_csv('../raw_data/ngrams_extracted_1.csv')
df_ngrams_ext_2_base = pd.read_csv('../raw_data/ngrams_extracted_2.csv')

# Datasets with less ngrams 
# filtered with additional stopwords: ['e', 'g', 'kj', 'kcal', 'dont']
df_ngrams_red_1_base = pd.read_csv('../raw_data/ngrams_extracted_reduced_1.csv')
df_ngrams_red_2_base = pd.read_csv('../raw_data/ngrams_extracted_reduced_2.csv')

In [None]:
# Deep copy of the dataframe to avoid to reload it
ngrams_ext_1 = df_ngrams_ext_1_base.copy()
ngrams_ext_2 = df_ngrams_ext_2_base.copy()

In [None]:
# Deep copy of the dataframe to avoid to reload it
ngrams_red_1 = df_ngrams_red_1_base.copy()
ngrams_red_2 = df_ngrams_red_2_base.copy()

In [None]:
# Brief look at the dataset
print(f"""Shape of the dataset: {ngrams_ext_1.shape}
""")
print(f"""Columns types of the dataset: 
{ngrams_ext_1.dtypes}
""")
print(f"""Head of the dataset:""")
display(ngrams_ext_1.head())

### Concatenate the datasets

In [None]:
# Concatenate the extended datasets
ngrams_ext = pd.concat([ngrams_ext_1, ngrams_ext_2], ignore_index=True)

In [None]:
# Save ngrams_ext dataset to csv
ngrams_ext.to_csv('../raw_data/ngrams_extracted.csv', index = False)

In [None]:
print(f"""
- length of concatenated df: {len(ngrams_ext)},
- length of ngrams_ext_1: {len(ngrams_ext_1)},
- length of ngrams_ext_2: {len(ngrams_ext_2)},
""")

In [None]:
# Concatenate the reduced datasets
ngrams_reduced = pd.concat([ngrams_red_1, ngrams_red_2], ignore_index=True)

In [None]:
# Save ngrams_ext dataset to csv
#ngrams_reduced.to_csv('../raw_data/ngrams_extracted_reduced.csv', index = False)

### Calculate Frequencies

In [3]:
# Load the whole clean dataset
file ='../raw_data/ngrams_extracted_reduced.csv'
ngrams_reduced_base = pd.read_csv(file)

time: 48 ms (started: 2021-03-03 09:55:33 +01:00)


In [20]:
ngrams_reduced = ngrams_reduced_base.copy()

time: 51.6 ms (started: 2021-03-03 10:27:27 +01:00)


In [21]:
# Save ngrams_reduced dataset to csv
ngrams_reduced.to_csv('../raw_data/ngrams_reduced.csv', index = False)

time: 113 ms (started: 2021-03-03 10:27:34 +01:00)


In [5]:
# Brief look at the dataset
print(f"""Shape of the dataset: {ngrams_reduced.shape}
""")
print(f"""Columns types of the dataset: 
{ngrams_reduced.dtypes}
""")
print(f"""Head of the dataset:""")
display(ngrams_reduced.head())

Shape of the dataset: (38867, 12)

Columns types of the dataset: 
n_gram_size                object
pattern                    object
global_occurences           int64
fish meat eggs              int64
sugary snacks               int64
cereals and potatoes        int64
milk and dairy products     int64
fat and sauces              int64
fruits and vegetables       int64
salty snacks                int64
beverages                   int64
composite foods             int64
dtype: object

Head of the dataset:


Unnamed: 0,n_gram_size,pattern,global_occurences,fish meat eggs,sugary snacks,cereals and potatoes,milk and dairy products,fat and sauces,fruits and vegetables,salty snacks,beverages,composite foods
0,1-grams,"('sucre',)",268175,23256,85227,23156,34745,11009,23536,14603,19897,32746
1,1-grams,"('sel',)",262603,44526,38631,22952,22495,14217,16096,25072,8107,70507
2,1-grams,"('ingredient',)",175998,23229,38021,18434,19928,8964,15270,13398,13272,25482
3,1-grams,"('gras',)",153330,22344,26717,14921,18967,9457,10093,18211,8582,24038
4,1-grams,"('lait',)",152570,7755,41706,9879,58044,3496,1572,6425,1115,22578


time: 13.6 ms (started: 2021-03-03 09:55:35 +01:00)


In [6]:
ngrams_reduced.columns

Index(['n_gram_size', 'pattern', 'global_occurences', 'fish meat eggs',
       'sugary snacks', 'cereals and potatoes', 'milk and dairy products',
       'fat and sauces', 'fruits and vegetables', 'salty snacks', 'beverages',
       'composite foods'],
      dtype='object')

time: 2.22 ms (started: 2021-03-03 09:55:44 +01:00)


In [7]:
#Calculate frequencies of occurences of pattern per categories
categories_PNNS_1 = ['fish meat eggs', 'sugary snacks', 'cereals and potatoes', 'milk and dairy products', 'fat and sauces', 'fruits and vegetables', 'salty snacks', 'beverages', 'composite foods']

time: 501 µs (started: 2021-03-03 09:55:45 +01:00)


In [8]:
# Calculate frequencies of patterns per categories

def loop_frequencies_calculation(df):
    """Calculate relative frequencies of each pattern per category"""

    # Loop the operation over each category in the df
    for cat in categories_PNNS_1:
        # Loop over each row of the category
        cat_frequencies = []

        for i in range(len(df)):
            row = df.iloc[i]
            # Calculate the relative frequency of the pattern in this category
            # -> Divide its number of occurences in this category,
            #    by its global number of occurences
            cat_frequencies.append(round((row[cat] / row.global_occurences), 3))

        # Create a new column with the frequencies of this category
        df[str('freq - ' + cat)] = pd.Series(cat_frequencies)

    return df


# Apply the function
loop_frequencies_calculation(ngrams_reduced)

Unnamed: 0,n_gram_size,pattern,global_occurences,fish meat eggs,sugary snacks,cereals and potatoes,milk and dairy products,fat and sauces,fruits and vegetables,salty snacks,...,composite foods,freq - fish meat eggs,freq - sugary snacks,freq - cereals and potatoes,freq - milk and dairy products,freq - fat and sauces,freq - fruits and vegetables,freq - salty snacks,freq - beverages,freq - composite foods
0,1-grams,"('sucre',)",268175,23256,85227,23156,34745,11009,23536,14603,...,32746,0.087,0.318,0.086,0.130,0.041,0.088,0.054,0.074,0.122
1,1-grams,"('sel',)",262603,44526,38631,22952,22495,14217,16096,25072,...,70507,0.170,0.147,0.087,0.086,0.054,0.061,0.095,0.031,0.268
2,1-grams,"('ingredient',)",175998,23229,38021,18434,19928,8964,15270,13398,...,25482,0.132,0.216,0.105,0.113,0.051,0.087,0.076,0.075,0.145
3,1-grams,"('gras',)",153330,22344,26717,14921,18967,9457,10093,18211,...,24038,0.146,0.174,0.097,0.124,0.062,0.066,0.119,0.056,0.157
4,1-grams,"('lait',)",152570,7755,41706,9879,58044,3496,1572,6425,...,22578,0.051,0.273,0.065,0.380,0.023,0.010,0.042,0.007,0.148
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38862,10-grams,"('sature', 'glucides', 'sucre', 'fibre', 'alimentaires', 'proteines', 'sel', 'valeurs', 'nutritionnelles', 'moyennes')",15,2,3,1,2,1,1,3,...,2,0.133,0.200,0.067,0.133,0.067,0.067,0.200,0.133,0.133
38863,10-grams,"('acides', 'gras', 'sature', 'glucides', 'sucre', 'fibre', 'alimentaires', 'proteines', 'sel', 'valeurs')",15,2,3,1,2,1,1,3,...,2,0.133,0.200,0.067,0.133,0.067,0.067,0.200,0.133,0.133
38864,10-grams,"('acides', 'gras', 'satures', 'glucides', 'sucre', 'fibre', 'alimentaires', 'proteines', 'sel', 'con')",15,3,2,1,1,1,3,2,...,2,0.200,0.133,0.067,0.067,0.067,0.200,0.133,0.067,0.133
38865,10-grams,"('acides', 'gras', 'satures', 'glucides', 'sucre', 'fibre', 'alimentaires', 'proteines', 'sel', 'er')",13,1,2,2,1,1,1,2,...,2,0.077,0.154,0.154,0.077,0.077,0.077,0.154,0.077,0.154


time: 51.8 s (started: 2021-03-03 09:56:26 +01:00)


In [10]:
# Brief look at the dataset
print(f"""Shape of the dataset: {ngrams_reduced.shape}
""")
print(f"""Columns types of the dataset: 
{ngrams_reduced.dtypes}
""")
print("Head of the dataset:")
display(ngrams_reduced.head())

Shape of the dataset: (38867, 21)

Columns types of the dataset: 
n_gram_size                        object
pattern                            object
global_occurences                   int64
fish meat eggs                      int64
sugary snacks                       int64
cereals and potatoes                int64
milk and dairy products             int64
fat and sauces                      int64
fruits and vegetables               int64
salty snacks                        int64
beverages                           int64
composite foods                     int64
freq - fish meat eggs             float64
freq - sugary snacks              float64
freq - cereals and potatoes       float64
freq - milk and dairy products    float64
freq - fat and sauces             float64
freq - fruits and vegetables      float64
freq - salty snacks               float64
freq - beverages                  float64
freq - composite foods            float64
dtype: object

Head of the dataset:


Unnamed: 0,n_gram_size,pattern,global_occurences,fish meat eggs,sugary snacks,cereals and potatoes,milk and dairy products,fat and sauces,fruits and vegetables,salty snacks,...,composite foods,freq - fish meat eggs,freq - sugary snacks,freq - cereals and potatoes,freq - milk and dairy products,freq - fat and sauces,freq - fruits and vegetables,freq - salty snacks,freq - beverages,freq - composite foods
0,1-grams,"('sucre',)",268175,23256,85227,23156,34745,11009,23536,14603,...,32746,0.087,0.318,0.086,0.13,0.041,0.088,0.054,0.074,0.122
1,1-grams,"('sel',)",262603,44526,38631,22952,22495,14217,16096,25072,...,70507,0.17,0.147,0.087,0.086,0.054,0.061,0.095,0.031,0.268
2,1-grams,"('ingredient',)",175998,23229,38021,18434,19928,8964,15270,13398,...,25482,0.132,0.216,0.105,0.113,0.051,0.087,0.076,0.075,0.145
3,1-grams,"('gras',)",153330,22344,26717,14921,18967,9457,10093,18211,...,24038,0.146,0.174,0.097,0.124,0.062,0.066,0.119,0.056,0.157
4,1-grams,"('lait',)",152570,7755,41706,9879,58044,3496,1572,6425,...,22578,0.051,0.273,0.065,0.38,0.023,0.01,0.042,0.007,0.148


time: 24.5 ms (started: 2021-03-03 09:58:08 +01:00)


In [11]:
ngrams_reduced.columns

Index(['n_gram_size', 'pattern', 'global_occurences', 'fish meat eggs',
       'sugary snacks', 'cereals and potatoes', 'milk and dairy products',
       'fat and sauces', 'fruits and vegetables', 'salty snacks', 'beverages',
       'composite foods', 'freq - fish meat eggs', 'freq - sugary snacks',
       'freq - cereals and potatoes', 'freq - milk and dairy products',
       'freq - fat and sauces', 'freq - fruits and vegetables',
       'freq - salty snacks', 'freq - beverages', 'freq - composite foods'],
      dtype='object')

time: 3.57 ms (started: 2021-03-03 10:02:05 +01:00)


In [12]:
# Drop columns
ngrams_frequencies_columns = ['n_gram_size', 
                              'pattern', 
                              'freq - fish meat eggs', 
                              'freq - sugary snacks', 
                              'freq - cereals and potatoes', 
                              'freq - milk and dairy products', 
                              'freq - fat and sauces', 
                              'freq - fruits and vegetables', 
                              'freq - salty snacks', 
                              'freq - beverages', 
                              'freq - composite foods',
                             ]
ngrams_frequencies = ngrams_reduced[ngrams_frequencies_columns].copy()

time: 9.66 ms (started: 2021-03-03 10:04:28 +01:00)


In [18]:
# Brief look at the dataset
print(f"""Shape of the dataset: {ngrams_frequencies.shape}
""")
print(f"""Columns types of the dataset: 
{ngrams_frequencies.dtypes}
""")
print("Head of the dataset:")
display(ngrams_frequencies.head())

Shape of the dataset: (38867, 11)

Columns types of the dataset: 
n_gram_size                        object
pattern                            object
freq - fish meat eggs             float64
freq - sugary snacks              float64
freq - cereals and potatoes       float64
freq - milk and dairy products    float64
freq - fat and sauces             float64
freq - fruits and vegetables      float64
freq - salty snacks               float64
freq - beverages                  float64
freq - composite foods            float64
dtype: object

Head of the dataset:


Unnamed: 0,n_gram_size,pattern,freq - fish meat eggs,freq - sugary snacks,freq - cereals and potatoes,freq - milk and dairy products,freq - fat and sauces,freq - fruits and vegetables,freq - salty snacks,freq - beverages,freq - composite foods
0,1-grams,"('sucre',)",0.087,0.318,0.086,0.13,0.041,0.088,0.054,0.074,0.122
1,1-grams,"('sel',)",0.17,0.147,0.087,0.086,0.054,0.061,0.095,0.031,0.268
2,1-grams,"('ingredient',)",0.132,0.216,0.105,0.113,0.051,0.087,0.076,0.075,0.145
3,1-grams,"('gras',)",0.146,0.174,0.097,0.124,0.062,0.066,0.119,0.056,0.157
4,1-grams,"('lait',)",0.051,0.273,0.065,0.38,0.023,0.01,0.042,0.007,0.148


time: 151 ms (started: 2021-03-03 10:24:57 +01:00)


In [19]:
# Save ngrams_frequencies dataset to csv
ngrams_frequencies.to_csv('../raw_data/ngrams_frequencies.csv', index = False)

time: 377 ms (started: 2021-03-03 10:25:42 +01:00)


## 2. Transpose datasets of N-grams (Tidy Data shape)

file ='../OpenFoodFacts/data/ngrams_frequencies.csv'
ngrams_frequencies_base = pd.read_csv(file)

ngrams_frequencies = ngrams_frequencies_base.copy()
ngrams_frequencies.head()

ngrams_frequencies.columns

ngrams_frequencies_col = ['freq - fish meat eggs', 'freq - sugary snacks', 'freq - cereals and potatoes', 'freq - milk and dairy products', 'freq - fat and sauces', 'freq - fruits and vegetables', 'freq - salty snacks', 'freq - beverages', 'freq - composite foods']

pattern_frequencies = pd.DataFrame(ngrams_frequencies.set_index(['pattern', 'n_gram_size']).stack())

type(ngrams_frequencies)

type(pattern_frequencies)

pattern_frequencies[:10]

pattern_frequencies.head()

pattern_frequencies.unstack()