<h1 align="center"><strong>Design an application for public health - Project 3</strong></h1>
<h2 align="center">| Cleaning notebook |</h2>
<h3 align="center">Data Scientist course - OpenClassrooms</h3>

<div style="background-color: #506AB9;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">1. Libraries and functions</h3>
</div>

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">1.1. Libraries and functions</h4>
</div>

In [1]:
import os
import io
import gc
import math
from math import prod
from collections import Counter
import time as time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
import missingno as msno
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.validation import check_is_fitted
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

%matplotlib inline

sns.set_theme(style="darkgrid")

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">1.2. Functions declaration</h4>
</div>

In [6]:
def df_analysis(df, name_df, columns, *args, **kwargs):
    """
    Method used for analyzing on the DataFrame.

    Parameters:
    -----------------
        df (pandas.DataFrame): Dataset to analyze
        name_df (str): Dataset name
        columns (list): Dataframe keys in list format
        
        *args, **kwargs:
        -----------------
            flag (str): Flag to show complete information about the dataset to analyse
                        "complete" shows all information about the dataset

    Returns:
    -----------------
        None. 
        Print the analysis on the Dataset. 
    """
    
    # Getting the variables
    flag = kwargs.get("flag", None)
    
    ORDERING_COMPLETE = [
        "name", "type", "records", "unique", "# NaN", "% NaN", "mean", "min", "25%", "50%", "75%", "max", "std"
    ]
    
    # Calculating the memory usage based on dataframe.info()
    buf = io.StringIO()
    df.info(buf=buf)
    memory_usage = buf.getvalue().split('\n')[-2]
    
    if df.empty:
        print("The", name_df, "dataset is empty. Please verify the file.")
    else:
        empty_cols = [col for col in df.columns if df[col].isna().all()] # identifying empty columns
        df_rows_duplicates = df[df.duplicated()] #identifying full duplicates rows
        
        # Creating a dataset based on Type object and records by columns
        type_cols = df.dtypes.apply(lambda x: x.name).to_dict() 
        df_resume = pd.DataFrame(list(type_cols.items()), columns = ["name", "type"])
        df_resume["records"] = list(df.count())
        df_resume["# NaN"] = list(df.isnull().sum())
        df_resume["% NaN"] = list(((df.isnull().sum() / len(df.index))*100).round(2))
        
        print("\nAnalysis of", name_df, "dataset")
        print("--------------------------------------------------------------------")
        print("- Dataset shape:                 ", df.shape[0], "rows and", df.shape[1], "columns")
        print("- Total of NaN values:           ", df.isna().sum().sum())
        print("- Percentage of NaN:             ", round((df.isna().sum().sum() / prod(df.shape)) * 100, 2), "%")
        print("- Total of full duplicates rows: ", df_rows_duplicates.shape[0])
        print("- Total of empty rows:           ", df.shape[0] - df.dropna(axis="rows", how="all").shape[0]) if df.dropna(axis="rows", how="all").shape[0] < df.shape[0] else \
                    print("- Total of empty rows:            0")
        print("- Total of empty columns:        ", len(empty_cols))
        print("  + The empty column is:         ", empty_cols) if len(empty_cols) == 1 else \
                    print("  + The empty column are:         ", empty_cols) if len(empty_cols) >= 1 else None
        print("- Unique indexes:                ", df.index.is_unique)
        
        print("\n- The key(s):", columns, "is not present multiple times in the dataframe.\n  It CAN be used as a primary key.") if df.size == df.drop_duplicates(columns).size else \
            print("\n- The key(s):", columns, "is present multiple times in the dataframe.\n  It CANNOT be used as a primary key.")
        
        pd.set_option("display.max_rows", None) # show full of showing rows
        pd.set_option("display.max_columns", None) # show full of showing cols
        pd.set_option("max_colwidth", None) # show full width of showing cols
        
        if flag is None or flag != "complete":
            print("\n- Type object and records by columns   (",memory_usage,")")
            print("--------------------------------------------------------------------")
        elif flag == "complete":
            df_resume["unique"] = list(df.nunique())
            df_desc = pd.DataFrame(df.describe().T).reset_index()
            df_desc = df_desc.rename(columns={"index": "name"})
            df_resume = df_resume.merge(right=df_desc[["name", "mean", "min", "25%", "50%", "75%", "max", "std"]], on="name", how="left")
            df_resume = df_resume[ORDERING_COMPLETE]
            print("\n- Type object and records by columns                                                                   (",memory_usage,")")
            print("---------------------------------------------------------------------------------------------------------------------------------")
        
        display(df_resume.sort_values("records", ascending=False))
        
        pd.reset_option("display.max_rows") # reset max of showing rows
        pd.reset_option("display.max_columns") # reset max of showing cols
        pd.reset_option("display.max_colwidth") # reset width of showing cols
        
        # deleting dataframe to free memory
        if flag == "complete":
            del [[df_resume, df_desc]]
            gc.collect()
            df_resume, df_desc = (pd.DataFrame() for i in range(2))
        else:
            del df_resume
            gc.collect()
            df_resume = pd.DataFrame()
            
               
def plot_values_missingno(df, first_col, last_col, *args, **kwargs):
    """
    Plotting missing values with missingno

    Parameters
    -----------------
        df (pandas.DataFrame): Dataset to analyze.
        first_col (int): First column to graph.
        last_col (int): Last column to graph.
        numbers_plot (int): Total numbers of graphs.
        plot_number (int): Number of the graph.
        
    Returns:
    -----------------
        None. 
        Plotting the missing values with missingno
    """
    
    # Getting the variables
    numbers_plot = kwargs.get("numbers_plot", None)
    plot_number = kwargs.get("plot_number", None)
    
    
    fig, axs = plt.subplots(2,1)
    msno.matrix(df.iloc[:, first_col:last_col], sparkline=False, fontsize=14, ax=axs[0])
    msno.bar(df.iloc[:, first_col:last_col], ax=axs[1], fontsize=14)            

    for ax in axs:
        labels = [item.get_text() for item in ax.get_xticklabels()]
        short_labels = [s[:8] + "..." + s[-8:] if len(s) > 16 else s for s in labels]
        ax.axes.set_xticklabels(short_labels)

    fig.set_size_inches(18,14)
    [ax.grid() for ax in axs.flatten()];
    [sns.despine(ax=ax, right=False, left=False, top=False, bottom=False) for ax in axs.flatten()];

    plt.subplots_adjust(hspace=0.3)
    plt.tight_layout(rect=[0, 0.05, 1, 0.92])
    
    if plot_number == None:
        fig.suptitle("Missing data overview", fontweight="bold", fontsize=20)
    else:
        fig.suptitle("Missing data overview\n(part " + str(plot_number) + "/" + str(numbers_plot) + ")", fontweight="bold", fontsize=20)       

    plt.show()
                        
            
def plot_missing_values(df, numbers_col):
    """
    Method used for plotting missing values

    Parameters
    -----------------
        df (pandas.DataFrame): Dataset to analyze.
        numbers_col (int): Number of columns to show in each image.
        
    Returns:
    -----------------
        None. 
    """
    
    first_col = 0
    
    if df.shape[1] // numbers_col != 0:
        
        for i in range (1, df.shape[1] // numbers_col + 1):
            
            if i == df.shape[1] // numbers_col:
                last_col = None
            else:
                last_col = i * numbers_col

            plot_values_missingno(df, first_col, last_col, numbers_plot=(df.shape[1] // numbers_col), plot_number=i)

            if i == data.shape[1] // 30:
                # deleting dataframe to free memory
                del [df]
                gc.collect()
                df = pd.DataFrame()

                break            
            else:
                first_col = last_col + 1
    else:
        
        plot_values_missingno(df, first_col, None)
            
        del [df]
        gc.collect()
        df = pd.DataFrame()
        

def encode(df):
    """
    Function to encode non-null data and replace it in the original data.

    Parameters:
    -----------------
        df (pandas.DataFrame): Dataset to analyze

    Returns:
    -----------------
        df (pandas.DataFrame): DataFrame comparison.
        
    """
    
    # Instante package to use
    encoder = OrdinalEncoder()

    # Retains only non-null values
    no_nulls = np.array(df.dropna())
    
    # Reshapes the df for encoding
    impute_reshape = no_nulls.reshape(-1,1)
    
    # Encode df
    impute_ordinal = encoder.fit_transform(impute_reshape)
    
    # Assign back encoded values to non-null values
    df.loc[df.notnull()] = np.squeeze(impute_ordinal)
    
    return df
        

class GroupImputer(BaseEstimator, TransformerMixin):
    """ 
    Class used for imputing missing values in a pd.DataFrame using either mean or median of a group.
    
    Parameters:
    -----------------  
        group_cols (list) : List of columns used for calculating the aggregated value 
        strategy (str) : The strategy to be used for remplacement, can be one of ["mean", "median", "mode"]
        
    Returns:
    -----------------
        X (array-like) : The array with imputed values in the target column
   """    
    
    def __init__(self, group_cols, target, strategy="mean"):
        
        assert strategy in ["mean", "median"], "Unrecognized value for metric, should be mean/median"
        assert type(group_cols) == list, "group_cols should be a list of columns"
        assert type(target) == list, "target should be a string"
        
        self.group_cols = group_cols
        self.target = target
        self.strategy = strategy
    
    def fit(self, X, y=None):
        
        assert pd.isnull(X[self.group_cols]).any(axis=None) == False, "There are missing values in group_cols"
        
        impute_map = X.groupby(self.group_cols)[self.target].agg(self.strategy) \
                                                            .reset_index(drop=False)
        
        self.impute_map_ = impute_map
        
        return self 
    
    def transform(self, X, y=None):
        
        # make sure that the imputer was fitted
        check_is_fitted(self, "impute_map_")
        
        X = X.copy()
        
        for index, row in self.impute_map_.iterrows():
            ind = (X[self.group_cols] == row[self.group_cols]).all(axis=1)
            X.loc[ind, self.target] = X.loc[ind, self.target].fillna(row[self.target])
        
        return X.values

<div style="background-color: #506AB9;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">2. Importing files</h3>
</div>

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">2.1. Importing and preparing files</h4>
</div>

<div class="alert alert-block alert-info">
Reading data in <b>chunks of 1 million rows</b> at a time
</div>

In [3]:
start = time.time()
chunk = pd.read_csv("datasets/en.openfoodfacts.org.products.csv", chunksize=1000000, sep="\t", encoding="UTF-8")
data = pd.concat(chunk)
end = time.time()
print("Read csv with chunks: ",(end-start),"sec")

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Read csv with chunks:  96.31827020645142 sec


<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">2.2. Missing data overview</h4>
</div>

<div class="alert alert-block alert-info">
Making <b>the initial analysis</b>
</div>

In [4]:
df_analysis(data, "data", "code")


Analysis of data dataset
--------------------------------------------------------------------
- Dataset shape:                  1760097 rows and 186 columns
- Total of NaN values:            260478039
- Percentage of NaN:              79.56 %
- Total of full duplicates rows:  1
- Total of empty rows:            0
- Total of empty columns:         5
  + The empty column are:          ['cities', 'allergens_en', 'no_nutriments', 'ingredients_from_palm_oil', 'ingredients_that_may_be_from_palm_oil']
- Unique indexes:                 True

- The key(s): code is present multiple times in the dataframe.
  It CANNOT be used as a primary key.

- Type object and records by columns   ( memory usage: 2.4+ GB )
--------------------------------------------------------------------


Unnamed: 0,name,type,records,# NaN,% NaN
0,code,object,1760097,0,0.0
6,last_modified_datetime,object,1760097,0,0.0
63,states_en,object,1760097,0,0.0
62,states_tags,object,1760097,0,0.0
61,states,object,1760097,0,0.0
1,url,object,1760097,0,0.0
5,last_modified_t,int64,1760097,0,0.0
4,created_datetime,object,1760097,0,0.0
3,created_t,int64,1760097,0,0.0
2,creator,object,1760093,4,0.0


<div class="alert alert-block alert-warning">
    Based on the function <b>df_analysis</b> we got the following facts:
    <ul style="list-style-type: square;">
        <li>Almost <b>80%</b> of the data present in data set are <b>missing values</b></li>
        <li>There are <b>5 empty columns</b> that we can delete</li>
        <li>There is a lot of <b>memory usage</b> with this dataset</li>
    </ul>
</div>

<div class="alert alert-block alert-info">
Plotting missing values with <b>missingno</b>
</div>

In [5]:
plot_missing_values(data, 30)

NameError: name 'plot_missing_values' is not defined

<div style="background-color: #506AB9;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">3. Initial cleaning</h3>
</div>

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">3.1. Deleting NaN columns and rows, and duplicated rows</h4>
</div>

In [7]:
data = data.dropna(axis="columns", how="all").dropna(axis="rows", how="all")

In [8]:
data = data.drop_duplicates()

In [9]:
df_analysis(data, "data", "code")


Analysis of data dataset
--------------------------------------------------------------------
- Dataset shape:                  1760096 rows and 181 columns
- Total of NaN values:            251677390
- Percentage of NaN:              79.0 %
- Total of full duplicates rows:  0
- Total of empty rows:            0
- Total of empty columns:         0
- Unique indexes:                 True

- The key(s): code is present multiple times in the dataframe.
  It CANNOT be used as a primary key.

- Type object and records by columns   ( memory usage: 2.4+ GB )
--------------------------------------------------------------------


Unnamed: 0,name,type,records,# NaN,% NaN
0,code,object,1760096,0,0.0
58,states_en,object,1760096,0,0.0
3,created_t,int64,1760096,0,0.0
4,created_datetime,object,1760096,0,0.0
5,last_modified_t,int64,1760096,0,0.0
6,last_modified_datetime,object,1760096,0,0.0
57,states_tags,object,1760096,0,0.0
1,url,object,1760096,0,0.0
56,states,object,1760096,0,0.0
2,creator,object,1760092,4,0.0


<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">3.2. Fixing the columns types</h4>
</div>

<div class="alert alert-block alert-warning">
There are some columns with column type wrong. We can see this information based on:
<ul style="list-style-type: square;">
    <li>Open Food Facts data-fileds: <b>https://world.openfoodfacts.org/data/data-fields.txt</b></li>
    <li>At the momento to <b>import the *.csv</b></li>
    <li>As a result of the function <b>df_initial_analysis</b></li>
</ul>

Due to that, we are going to proceed to fix them. 
</div>
<div class="alert alert-block alert-info">
<b>Fixing column types float64</b><br>
Based on Open Food Facts data-fileds: <b>https://world.openfoodfacts.org/data/data-fields.txt</b>, we know the following
<ul style="list-style-type: square;">
    <li>fields that end with <b>_100g</b> correspond to <b>the amount of a nutriment</b> (in g, or kJ for energy) for <b>100 g</b> or <b>100 ml</b> of product</li>
    <li>fields that end with <b>_serving</b> correspond to <b>the amount of a nutriment</b> (in g, or kJ for energy) for <b>1</b> serving of the product</li>
</ul>

Finally, we are going to make the <b>downcast</b> from <b>float64</b> to <b>float32</b> 
</div>

In [10]:
for col in data.columns:
    if (col.endswith("_100g") or col.endswith("_serving")) and data[col].dtypes != "float64":
        print("Column to update the column type:", col, data[col].dtypes)
        data[col] = data[col].astype("float64")

Column to update the column type: -butyric-acid_100g object
Column to update the column type: -capric-acid_100g object


In [11]:
for col in data.columns:
    if data[col].dtypes == "float64":
        data[col] = data[col].astype("float32")

<div class="alert alert-block alert-info">
<b>Fixing column types datetime</b><br>
Based on Open Food Facts data-fileds: <b>https://world.openfoodfacts.org/data/data-fields.txt</b>, we know the following:
<ul style="list-style-type: square;">
    <li>fields that end with <b>_t</b> are dates in <b>the UNIX timestamp format</b> (number of seconds since Jan 1st 1970)</b></li>
    <li>fields that end with <b>_datetime</b> are dates in <b>the iso8601 format: yyyy-mm-ddThh:mn:ssZ</b></li>
</ul>

In this case, both group of columns have the same amount of data. Due to that, we will only keep the columns ended by <b>_datetime</b>
</div>

In [12]:
for col in data.columns:
    if col.endswith("_datetime") and data[col].dtypes != "datetime64":
        print("Column to update the column type:", col, data[col].dtypes)
        data[col] = data[col].astype("datetime64")

Column to update the column type: created_datetime object
Column to update the column type: last_modified_datetime object


In [13]:
for col in data.columns:
    if col.endswith("_t"):
        print("Column to delete:", col, data[col].dtypes)
        data = data.drop(col, axis=1)  

Column to delete: created_t int64
Column to delete: last_modified_t int64


<div class="alert alert-block alert-info">
Checking the usage of memory
</div>

In [14]:
data.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1760096 entries, 0 to 1760096
Columns: 179 entries, code to carnitine_100g
dtypes: datetime64[ns](2), float32(118), object(59)
memory usage: 8.1 GB


<div class="alert alert-block alert-info">
If the number of unique values in a specific column is less than the half numbers of rows in the same column,<br>
we are going to change the type from <b>object</b> to <b>category</b> to improve the usage of memory
</div>

In [15]:
for col in data.columns:
    if data[col].dtypes == "object":
        if len(data[col].unique()) / len(data[col]) < 0.5:
            data[col] = data[col].astype("category")
        else:
            data[col] = data[col].astype("object")

<div class="alert alert-block alert-info">
Checking the usage of memory
</div>

In [16]:
data.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1760096 entries, 0 to 1760096
Columns: 179 entries, code to carnitine_100g
dtypes: category(54), datetime64[ns](2), float32(118), object(5)
memory usage: 2.9 GB


In [17]:
df_analysis(data, "data", "code")


Analysis of data dataset
--------------------------------------------------------------------
- Dataset shape:                  1760096 rows and 179 columns
- Total of NaN values:            251677390
- Percentage of NaN:              79.88 %
- Total of full duplicates rows:  0
- Total of empty rows:            0
- Total of empty columns:         0
- Unique indexes:                 True

- The key(s): code is present multiple times in the dataframe.
  It CANNOT be used as a primary key.

- Type object and records by columns   ( memory usage: 1.4+ GB )
--------------------------------------------------------------------


Unnamed: 0,name,type,records,# NaN,% NaN
0,code,object,1760096,0,0.0
54,states,category,1760096,0,0.0
56,states_en,category,1760096,0,0.0
55,states_tags,category,1760096,0,0.0
1,url,object,1760096,0,0.0
4,last_modified_datetime,datetime64[ns],1760096,0,0.0
3,created_datetime,datetime64[ns],1760096,0,0.0
2,creator,category,1760092,4,0.0
53,pnns_groups_2,category,1759729,367,0.02
31,countries,category,1754603,5493,0.31


<div style="background-color: #506AB9;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">4. Filtering information</h3>
</div>

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">4.1. Filtering information by country France</h4>
</div>

<div class="alert alert-block alert-info">
<b>Countries where the product is sold</b><br>
Based on Open Food Facts data-fileds: <b>https://world.openfoodfacts.org/data/data-fields.txt</b>, we know the following
<ul style="list-style-type: square;">
    <li> There are three fields about countries where the product is sold</li>
        <ul style="list-style-type: disc;">
            <li>countries</li>
            <li>countries_en</li>
            <li>countries_tags</li>
        </ul>
</ul>

After analyzing the columns, we have decided to used the columns called <b>countries</b>.<br><br>
We are going to filter considering only <b>France</b> and its overseas department
</div>

In [18]:
FILTERING_BY_FRANCE = [
    "France", "fr", "Francia", "French", "Frankreich", "Nouvelle-Calédonie", "Martinique", "Guadeloupe", "Polynésie Française", "Mayotte"
]

<div class="alert alert-block alert-success">
    At this point, we have reduced the amount of data in the dataset filtering by country <b>France</b>
</div>

In [19]:
data = pd.DataFrame(data[data["countries"].str.contains("|".join(FILTERING_BY_FRANCE), case = False, na=False).any(level=0)])

In [20]:
data.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 798009 entries, 0 to 1760096
Columns: 179 entries, code to carnitine_100g
dtypes: category(54), datetime64[ns](2), float32(118), object(5)
memory usage: 1.8 GB


In [21]:
df_analysis(data, "data", "code")


Analysis of data dataset
--------------------------------------------------------------------
- Dataset shape:                  798009 rows and 179 columns
- Total of NaN values:            114437088
- Percentage of NaN:              80.11 %
- Total of full duplicates rows:  0
- Total of empty rows:            0
- Total of empty columns:         4
  + The empty column are:          ['-lignoceric-acid_100g', '-dihomo-gamma-linolenic-acid_100g', '-elaidic-acid_100g', 'water-hardness_100g']
- Unique indexes:                 True

- The key(s): code is present multiple times in the dataframe.
  It CANNOT be used as a primary key.

- Type object and records by columns   ( memory usage: 710.3+ MB )
--------------------------------------------------------------------


Unnamed: 0,name,type,records,# NaN,% NaN
0,code,object,798009,0,0.0
54,states,category,798009,0,0.0
3,created_datetime,datetime64[ns],798009,0,0.0
4,last_modified_datetime,datetime64[ns],798009,0,0.0
55,states_tags,category,798009,0,0.0
56,states_en,category,798009,0,0.0
1,url,object,798009,0,0.0
33,countries_en,category,798009,0,0.0
32,countries_tags,category,798009,0,0.0
31,countries,category,798009,0,0.0


<div class="alert alert-block alert-info">
Deleting the complete null rows and columns
</div>

In [22]:
data = data.dropna(axis="columns", how="all").dropna(axis="rows", how="all")

<div class="alert alert-block alert-info">
Exectuting the <b>describe()</b> function on the dataframe
</div>

In [23]:
pd.set_option("display.max_columns", None) # show full of showing cols
data.describe()

  subtract(b, diff_b_a * (1 - t), out=lerp_interpolation, where=t>=0.5)


Unnamed: 0,serving_quantity,additives_n,ingredients_from_palm_oil_n,ingredients_that_may_be_from_palm_oil_n,nutriscore_score,nova_group,ecoscore_score_fr,energy-kj_100g,energy-kcal_100g,energy_100g,energy-from-fat_100g,fat_100g,saturated-fat_100g,-butyric-acid_100g,-caproic-acid_100g,-caprylic-acid_100g,-capric-acid_100g,-lauric-acid_100g,-myristic-acid_100g,-palmitic-acid_100g,-stearic-acid_100g,-arachidic-acid_100g,-behenic-acid_100g,-cerotic-acid_100g,-montanic-acid_100g,-melissic-acid_100g,monounsaturated-fat_100g,polyunsaturated-fat_100g,omega-3-fat_100g,-alpha-linolenic-acid_100g,-eicosapentaenoic-acid_100g,-docosahexaenoic-acid_100g,omega-6-fat_100g,-linoleic-acid_100g,-arachidonic-acid_100g,-gamma-linolenic-acid_100g,omega-9-fat_100g,-oleic-acid_100g,-gondoic-acid_100g,-mead-acid_100g,-erucic-acid_100g,-nervonic-acid_100g,trans-fat_100g,cholesterol_100g,carbohydrates_100g,sugars_100g,-sucrose_100g,-glucose_100g,-fructose_100g,-lactose_100g,-maltose_100g,-maltodextrins_100g,starch_100g,polyols_100g,fiber_100g,-soluble-fiber_100g,-insoluble-fiber_100g,proteins_100g,casein_100g,serum-proteins_100g,nucleotides_100g,salt_100g,sodium_100g,alcohol_100g,vitamin-a_100g,beta-carotene_100g,vitamin-d_100g,vitamin-e_100g,vitamin-k_100g,vitamin-c_100g,vitamin-b1_100g,vitamin-b2_100g,vitamin-pp_100g,vitamin-b6_100g,vitamin-b9_100g,folates_100g,vitamin-b12_100g,biotin_100g,pantothenic-acid_100g,silica_100g,bicarbonate_100g,potassium_100g,chloride_100g,calcium_100g,phosphorus_100g,iron_100g,magnesium_100g,zinc_100g,copper_100g,manganese_100g,fluoride_100g,selenium_100g,chromium_100g,molybdenum_100g,iodine_100g,caffeine_100g,taurine_100g,ph_100g,fruits-vegetables-nuts_100g,fruits-vegetables-nuts-dried_100g,fruits-vegetables-nuts-estimate_100g,collagen-meat-protein-ratio_100g,cocoa_100g,chlorophyl_100g,carbon-footprint_100g,carbon-footprint-from-meat-or-fish_100g,nutrition-score-fr_100g,nutrition-score-uk_100g,glycemic-index_100g,choline_100g,phylloquinone_100g,beta-glucan_100g,inositol_100g,carnitine_100g
count,79357.0,243846.0,243846.0,243846.0,278981.0,204621.0,255512.0,71905.0,595516.0,623090.0,131.0,617113.0,620073.0,3.0,2.0,1.0,2.0,7.0,1.0,4.0,2.0,29.0,12.0,1.0,2.0,4.0,3134.0,3157.0,1348.0,340.0,63.0,99.0,292.0,203.0,31.0,4.0,49.0,21.0,3.0,1.0,2.0,2.0,3647.0,3677.0,617058.0,619338.0,24.0,29.0,53.0,228.0,13.0,57.0,290.0,765.0,139641.0,188.0,181.0,618992.0,39.0,31.0,15.0,603563.0,603562.0,9209.0,3396.0,49.0,1611.0,1785.0,178.0,4833.0,2188.0,1798.0,1786.0,1978.0,1462.0,238.0,1597.0,403.0,928.0,77.0,231.0,2683.0,424.0,8408.0,1618.0,5291.0,2472.0,930.0,372.0,392.0,197.0,281.0,47.0,45.0,343.0,111.0,57.0,130.0,6300.0,307.0,10488.0,296.0,4409.0,3.0,319.0,11632.0,278984.0,5.0,4.0,31.0,29.0,18.0,24.0,14.0
mean,118.894379,1.615872,0.048883,0.114092,9.532871,3.41138,48.509956,1103.681,283.2928,1178.383,432.374054,14.309522,5.446947,2.582482e+20,24.014999,97.0,inf,27.35339,0.12,1.532558,2.1,3.947972,1.257143,0.0,31.0,0.5264027,20.867825,9.733136,3.332329,2.166044,2.646438,0.844953,17.107796,3.695531,0.532343,0.82375,42.33134,30.364353,11.001293,0.00057,0.0155003,7.0,0.035219,0.048734,27.3657,13.761168,17.400002,13.02966,29.481323,8.522764,8.162395,5.817037,32.900581,44.481972,3.022539,2.005319,4.316906,9.138214,4.165897,4.316871,0.796109,1.310894,0.52446,6.043071,0.54604,0.869422,0.1870809,0.187911,0.001249,0.229725,0.900481,0.050388,0.163426,0.579851,0.334001,0.001656,0.03638954,0.377905,0.033736,0.052891,1.284708,1.123691,0.078861,0.9673,0.297043,0.06382,0.889342,0.172939,0.01012,0.013097,0.01148145,0.186706,0.008913,0.418667,0.182235,0.434897,1.348675,6.58399,39.357758,16.623857,46.186378,14.982096,50.513916,1.452667,279.281067,622.308594,9.532973,12.2,34.174999,0.096719,0.407549,3.431667,0.034034,0.048264
std,778.501587,2.375888,0.218628,0.400777,8.777076,0.968259,26.653727,4161.248,3785.576,15538.6,704.321167,117.423164,29.066158,inf,33.91991,,,25.47459,,2.978924,2.969848,17.195362,4.265184,,42.426407,1.049067,25.062717,14.689106,6.277025,7.441453,11.680713,2.250437,18.865404,6.631429,2.649692,0.715255,25.87613,26.712978,19.051439,,0.02191988,9.899494,0.213637,0.919162,151.49057,21.925907,28.19067,13.93188,27.443153,17.41291,14.384439,17.094442,24.662666,35.666641,5.748063,2.866919,5.575809,220.102295,2.948128,5.55038,2.353545,32.95063,13.180311,10.267507,14.705605,5.424057,3.626899,2.712475,0.011315,3.081298,14.274102,1.034617,2.349888,11.352082,5.099961,0.020308,0.5839894,5.405313,0.321212,0.203069,9.252451,18.116951,0.184351,16.842674,1.382632,1.057952,13.548791,3.148226,0.066075,0.093077,0.0652572,1.702784,0.058307,2.802404,1.986521,1.063725,4.762463,1.644223,36.253693,31.006561,28.991516,3.21627,23.350037,0.798559,850.399353,6142.984863,8.777094,10.034939,15.620153,0.170812,2.191082,1.896605,0.031163,0.15075
min,0.0,0.0,0.0,0.0,-15.0,1.0,-23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00682,0.03,97.0,0.006,0.002,0.12,9.2e-08,0.0,0.0,0.0,0.0,1.0,8.1e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.095,1.6e-08,0.0015,1e-06,0.00057,6.1e-07,1e-06,0.0,0.0,-1.0,-1.0,0.0,2e-08,0.0,-0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7,0.3,3.6e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1e-07,0.0,2e-06,0.0,0.0,0.0,0.00035,0.0058,0.0,0.0,0.0,3.7,0.0,0.6,0.0,0.049,-15.0,1.0,14.0,0.0,0.0,0.4,0.002,0.0065
25%,29.0,0.0,0.0,0.0,2.0,3.0,29.0,435.0,111.0,464.0,6.5,1.0,0.2,0.29341,12.0225,97.0,inf,0.235865,0.12,0.000174023,1.05,9.2e-05,1.1e-05,0.0,16.0,0.0006077025,2.2,1.5,0.582,0.06775,0.2,0.061,2.075,0.438,0.00175,0.47375,27.0,6.9,0.001941,0.00057,0.007750458,3.500001,0.0,0.0,2.6,0.6,0.275,0.9,1.5,0.0,0.01,0.098,9.4,8.0,0.1,0.0,0.0,1.5,1.4,0.4075,0.0198,0.06,0.024,0.0,0.0,1.2e-05,7.55e-07,0.0018,5e-06,0.0,0.00018,0.00021,0.0029,0.00026,3.1e-05,2.1e-05,3.8e-07,7e-06,0.0009,0.0015,0.0246,0.0585,0.0012,0.03,0.12,0.00072,0.02,0.0009,0.000167,5.9e-05,1.5e-05,4e-06,1e-05,7e-06,1.6e-05,0.02,0.036,6.0,4.09,0.0,17.0,12.0,32.0,1.0875,0.0,111.0,2.0,3.0,26.0,0.0565,1.6e-05,1.7775,0.02,0.007
50%,80.0,1.0,0.0,0.0,10.0,4.0,43.0,977.0,265.0,1100.0,209.0,8.0,2.0,0.58,24.014999,97.0,,45.0,0.12,0.065116,2.1,0.00682,0.0018,0.0,31.0,0.002805,8.9,3.9,1.8,0.12,0.5,0.36,9.3,0.623,0.047,0.7,42.0,21.4,0.00388,0.00057,0.0155003,7.0,0.0,0.0,14.0,3.5,2.9,8.1,28.0,0.02,0.27,0.123,37.0,39.799999,1.7,1.0,2.0,6.2,2.9,2.6,0.022,0.51,0.204,0.5,3e-06,3.6e-05,1.5e-06,0.006,2e-05,0.0109,0.00056,0.000668,0.007844,0.0008,0.00011,6.6e-05,1.45e-06,1.2e-05,0.0023,0.0035,0.08,0.185,0.009,0.12,0.219,0.00257,0.079,0.00223,0.000387,0.0006,0.0001,1.4e-05,2e-05,3.9e-05,4.8e-05,0.032,0.33,7.2,30.630001,0.0,50.0,15.0,51.0,1.575,123.0,333.0,10.0,13.0,37.0,0.069,2.9e-05,3.75,0.025,0.0073
75%,140.0,2.0,0.0,0.0,16.0,4.0,71.0,1610.0,402.0,1674.0,533.5,22.0,8.0,3.873723e+20,36.0075,97.0,,48.5,0.12,1.5975,3.15,0.219,0.03925,0.0,46.0,0.5286,28.0,10.71,3.425,0.5925,0.885,0.8,24.549999,3.6,0.0625,1.05,63.42,53.700001,16.50194,0.00057,0.02325015,10.5,0.0,0.003,52.0,19.985256,28.424999,23.1,53.0,5.95,4.8,2.2,53.75,71.099998,3.67,3.0,7.0,12.9,5.95,5.6,0.024,1.3,0.52,8.2,0.0002,0.004522,6.4e-06,0.0137,3.8e-05,0.025,0.001,0.0012,0.0133,0.0013,0.00017,0.000192,2.125e-06,4e-05,0.0051,0.015,0.215775,0.428,0.054,0.186,0.331,0.0057,0.142,0.005,0.000843,0.002,0.00045,3.9e-05,6.4e-05,7.7e-05,0.000116,0.062,0.4,7.5,65.0,10.1,64.0,15.0,70.0,1.879,297.5,614.200012,16.0,22.0,45.174999,0.089,5.9e-05,4.9,0.03425,0.00925
max,100660.0,30.0,3.0,6.0,40.0,4.0,125.0,1094259.0,2910000.0,12200000.0,3740.0,91200.0,21900.0,7.747446e+20,48.0,97.0,inf,49.0,0.12,6.0,4.2,92.599998,14.8,0.0,61.0,2.1,87.0,77.0,105.0,75.0,85.0,18.0,72.0,36.099998,14.8,1.8,100.0,76.0,33.0,0.00057,0.031,14.0,7.6,32.0,117000.0,6880.0,92.800003,55.0,100.0,74.5,39.200001,120.0,87.5,100.0,1010.0,20.0,45.799999,173000.0,10.7,21.0,8.9,25000.0,10000.0,100.0,585.0,38.0,100.0,90.0,0.1429,100.0,571.0,41.0,69.0,450.0,117.0,0.311111,20.0,100.0,6.8,1.5,100.0,696.0,1.98,930.0,55.0,60.900002,506.0,93.919998,0.9,1.3,0.56,20.0,0.4,18.799999,29.0,5.882,33.0,8.56,100.0,100.0,100.0,25.0,100.0,2.183,13867.0,656298.625,40.0,22.0,48.700001,1.0,11.8,7.3,0.15,0.572


In [24]:
pd.reset_option("display.max_columns") # reset max of showing cols

<div style="background-color: #506AB9;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">5. Application idea fields</h3>
</div>

<div style="background-color: #6D83C5;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">5.1. Idenfiying Application idea fields</h4>
</div>

<div class="alert alert-block alert-info">
Based on <b>Santé publique France</b> we can get the fiedls to calculate the <b>Nutri-Score</b><br><br>
You can see the information in details in the link below<br>
<b>https://www.santepubliquefrance.fr/determinants-de-sante/nutrition-et-activite-physique/articles/nutri-score</b><br><br>
    
<b>Application fields</b>
<ul style="list-style-type: square;">
    <li>Energetic value</li>
    <li>Fat</li>
    <li>Saturated fat</li>
    <li>Carbohydrates</li>
    <li>Sugar</li>
    <li>Protein</li>
    <li>Salt / Sodium</li>
    <li>Fibers</li>
    <li>Fruits, vegetables, legumes, nuts, rapeseed, walnut and olive oils</li>
    <li>Nutri Score</li>
    <li>Nutri Score grade</li>
</ul>
</div>

In [24]:
pd.set_option("display.max_columns", None)
data.head(5)

Unnamed: 0,code,url,creator,created_datetime,last_modified_datetime,product_name,abbreviated_product_name,generic_name,quantity,packaging,packaging_tags,packaging_text,brands,brands_tags,categories,categories_tags,categories_en,origins,origins_tags,origins_en,manufacturing_places,manufacturing_places_tags,labels,labels_tags,labels_en,emb_codes,emb_codes_tags,first_packaging_code_geo,cities_tags,purchase_places,stores,countries,countries_tags,countries_en,ingredients_text,allergens,traces,traces_tags,traces_en,serving_size,serving_quantity,additives_n,additives,additives_tags,additives_en,ingredients_from_palm_oil_n,ingredients_from_palm_oil_tags,ingredients_that_may_be_from_palm_oil_n,ingredients_that_may_be_from_palm_oil_tags,nutriscore_score,nutriscore_grade,nova_group,pnns_groups_1,pnns_groups_2,states,states_tags,states_en,brand_owner,ecoscore_score_fr,ecoscore_grade_fr,main_category,main_category_en,image_url,image_small_url,image_ingredients_url,image_ingredients_small_url,image_nutrition_url,image_nutrition_small_url,energy-kj_100g,energy-kcal_100g,energy_100g,energy-from-fat_100g,fat_100g,saturated-fat_100g,-butyric-acid_100g,-caproic-acid_100g,-caprylic-acid_100g,-capric-acid_100g,-lauric-acid_100g,-myristic-acid_100g,-palmitic-acid_100g,-stearic-acid_100g,-arachidic-acid_100g,-behenic-acid_100g,-cerotic-acid_100g,-montanic-acid_100g,-melissic-acid_100g,monounsaturated-fat_100g,polyunsaturated-fat_100g,omega-3-fat_100g,-alpha-linolenic-acid_100g,-eicosapentaenoic-acid_100g,-docosahexaenoic-acid_100g,omega-6-fat_100g,-linoleic-acid_100g,-arachidonic-acid_100g,-gamma-linolenic-acid_100g,omega-9-fat_100g,-oleic-acid_100g,-gondoic-acid_100g,-mead-acid_100g,-erucic-acid_100g,-nervonic-acid_100g,trans-fat_100g,cholesterol_100g,carbohydrates_100g,sugars_100g,-sucrose_100g,-glucose_100g,-fructose_100g,-lactose_100g,-maltose_100g,-maltodextrins_100g,starch_100g,polyols_100g,fiber_100g,-soluble-fiber_100g,-insoluble-fiber_100g,proteins_100g,casein_100g,serum-proteins_100g,nucleotides_100g,salt_100g,sodium_100g,alcohol_100g,vitamin-a_100g,beta-carotene_100g,vitamin-d_100g,vitamin-e_100g,vitamin-k_100g,vitamin-c_100g,vitamin-b1_100g,vitamin-b2_100g,vitamin-pp_100g,vitamin-b6_100g,vitamin-b9_100g,folates_100g,vitamin-b12_100g,biotin_100g,pantothenic-acid_100g,silica_100g,bicarbonate_100g,potassium_100g,chloride_100g,calcium_100g,phosphorus_100g,iron_100g,magnesium_100g,zinc_100g,copper_100g,manganese_100g,fluoride_100g,selenium_100g,chromium_100g,molybdenum_100g,iodine_100g,caffeine_100g,taurine_100g,ph_100g,fruits-vegetables-nuts_100g,fruits-vegetables-nuts-dried_100g,fruits-vegetables-nuts-estimate_100g,collagen-meat-protein-ratio_100g,cocoa_100g,chlorophyl_100g,carbon-footprint_100g,carbon-footprint-from-meat-or-fish_100g,nutrition-score-fr_100g,nutrition-score-uk_100g,glycemic-index_100g,choline_100g,phylloquinone_100g,beta-glucan_100g,inositol_100g,carnitine_100g
0,17,http://world-en.openfoodfacts.org/product/0000...,kiliweb,2018-06-15 10:38:00,2019-06-25 11:55:18,Vitória crackers,,,,,,,,,,,,,,,,,,,,,,,,,,France,en:france,France,,,,,,,,,,,,,,,,,,,unknown,unknown,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,,,,https://static.openfoodfacts.org/images/produc...,https://static.openfoodfacts.org/images/produc...,https://static.openfoodfacts.org/images/produc...,https://static.openfoodfacts.org/images/produc...,,,,375.0,1569.0,,7.0,3.08,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,70.099998,15.0,,,,,,,,,,,,7.8,,,,1.4,0.56,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,31,http://world-en.openfoodfacts.org/product/0000...,isagoofy,2018-10-13 21:06:14,2018-10-13 21:06:57,Cacao,,,130 g,,,,,,,,,,,,,,,,,,,,,,,France,en:france,France,,,,,,,,,,,,,,,,,,,unknown,unknown,"en:to-be-completed, en:nutrition-facts-to-be-c...","en:to-be-completed,en:nutrition-facts-to-be-co...","To be completed,Nutrition facts to be complete...",,,,,,https://static.openfoodfacts.org/images/produc...,https://static.openfoodfacts.org/images/produc...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,100,http://world-en.openfoodfacts.org/product/0000...,del51,2015-10-11 14:09:21,2015-10-12 14:13:32,moutarde au moût de raisin,,,100g,,,,courte paille,courte-paille,"Epicerie, Condiments, Sauces, Moutardes","en:groceries,en:condiments,en:sauces,en:mustards","Groceries,Condiments,Sauces,Mustards",,,,,,Delois france,fr:delois-france,fr:delois-france,,,,,,courte paille,France,en:france,France,eau graines de téguments de moutarde vinaigre ...,en:mustard,,,,,,0.0,,,,0.0,,0.0,,18.0,d,,Fat and sauces,Dressings and sauces,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,60.0,b,en:mustards,Mustards,https://static.openfoodfacts.org/images/produc...,https://static.openfoodfacts.org/images/produc...,,,,,936.0,,936.0,,8.2,2.2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,29.0,22.0,,,,,,,,,0.0,,,5.1,,,,4.6,1.84,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,18.0,,,,,,,
5,1111111111,http://world-en.openfoodfacts.org/product/0000...,openfoodfacts-contributors,2019-06-08 18:56:13,2019-06-08 18:56:13,Sfiudwx,,,dgesc,,,,Watt,watt,Xsf,fr:xsf,fr:xsf,,,,,,,,,,,,,,,en:France,en:france,France,,,,,,,,,,,,,,,,,,,unknown,unknown,"en:to-be-completed, en:nutrition-facts-to-be-c...","en:to-be-completed,en:nutrition-facts-to-be-co...","To be completed,Nutrition facts to be complete...",,,,fr:xsf,fr:xsf,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,123,http://world-en.openfoodfacts.org/product/0000...,kiliweb,2018-08-31 17:53:02,2018-08-31 17:53:06,Sauce Sweety chili 0%,,,,,,,,,,,,,,,,,,,,,,,,,,France,en:france,France,,,,,,,,,,,,,,,,,,,unknown,unknown,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,,,,https://static.openfoodfacts.org/images/produc...,https://static.openfoodfacts.org/images/produc...,https://static.openfoodfacts.org/images/produc...,https://static.openfoodfacts.org/images/produc...,,,,21.0,88.0,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.8,0.4,,,,,,,,,,,,0.2,,,,2.04,0.816,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [25]:
pd.reset_option("display.max_columns")

<div class="alert alert-block alert-info">
After analyzing the columns we can identify the following fields for <b>Nutri-Score</b> <b>and App idea</b> 
</div>

<table style="border: 1px solid #FF7200; color: #FF7200;" align="left">
	<tr style="border: 1px solid #FF7200; color: #FF7200;">
		<th style="font-weight: bold;">Field</th>
        <th style="font-weight: bold;">Description</th>
        <th style="font-weight: bold;">Required</th>
		<th style="font-weight: bold;">Open Food Data field</th>
        <th style="font-weight: bold;">Commentary</th>
 	</tr>
 	<tr>
		<td align="left">Energetic value</td>
        <td align="center">Nutri-score</td>
		<td align="center">Yes</td>
        <td align="left">
            energy-kj_100g<br>
            energy-kcal_100g<br>
            energy_100g</td>
        <td align="left">
            The fields energy-kj_100g and energy_100g seem to be the same value.<br>
            Consider the measure units to work with the values.
        </td>
 	</tr>
 	<tr>
		<td align="left">Fat</td>
        <td align="center">Nutri-score</td>
		<td align="center">No</td>
        <td align="left">fat_100g</td>
        <td align="left"></td>
 	</tr>
 	<tr>
		<td align="left">Saturated fat</td>
        <td align="center">Nutri-score</td>
		<td align="center">Yes</td>
        <td align="left">saturated-fat_100g</td>
        <td align="left"></td>
 	</tr>
    <tr>
		<td align="left">Carbohydrates</td>
        <td align="center">Nutri-score</td>
		<td align="center">Not</td>
        <td align="left">carbohydrates_100g</td>
        <td align="left"></td>
 	</tr>
 	<tr>
		<td align="left">Sugar</td>
        <td align="center">Nutri-score</td>
		<td align="center">Yes</td>
        <td align="left">sugars_100g</td>
        <td align="left"></td>
 	</tr>
  	<tr>
		<td align="left">Protein</td>
        <td align="center">Nutri-score</td>
		<td align="center">Yes</td>
        <td align="left">proteins_100g</td>
        <td align="left"></td>
 	</tr>
  	<tr>
		<td align="left">Salt / Sodium</td>
        <td align="center">Nutri-score</td>
		<td align="center">Yes</td>
        <td align="left">
            salt_100g<br>
            sodium_100g</td>
        <td align="left">Consider the measure units to work with the values.</td>
 	</tr>
  	<tr>
		<td align="left">Fibers</td>
        <td align="center">Nutri-score</td>
		<td align="center">Yes</td>
        <td align="left">fiber_100g</td>
        <td align="left"></td>
 	</tr>
  	<tr>
		<td align="left">Nutri Score</td>
        <td align="center">Nutri-score</td>
		<td align="center">No</td>
        <td align="left">nutriscore_score<br/>nutrition-score-fr_100g</td>
        <td align="left">It is necessary to analyze these fields more in details. </td>
 	</tr>
  	<tr>
		<td align="left">Nutri Score grade</td>
        <td align="center">Nutri-score</td>
		<td align="center">No</td>
        <td align="left">nutriscore_grade</td>
        <td align="left"></td>
 	</tr>
  	<tr>
		<td align="left">
            Fruits, vegetables, legumes,<br>
            nuts, rapeseed, walnut<br> and olive oils
        </td>
        <td align="center">Nutri-score</td>
		<td align="center">Yes</td>
        <td align="left">
            fruits-vegetables-nuts_100g<br>
            fruits-vegetables-nuts-estimate_100g
        </td>
        <td align="left">
            It is necessary to analyze these fields more in details. 
        </td>
 	</tr>
    <tr>
		<td align="left">Code</td>
        <td align="center">App/Basic field</td>
		<td align="center">Yes</td>
        <td align="left">code</td>
        <td align="left"></td>
 	</tr>
    <tr>
		<td align="left">Last modified</td>
        <td align="center">App/Basic field</td>
		<td align="center">Yes</td>
        <td align="left">last_modified_datetime</td>
        <td align="left"></td>
 	</tr>
    <tr>
		<td align="left">Product name</td>
        <td align="center">App/Basic field</td>
		<td align="center">No</td>
        <td align="left">product_name</td>
        <td align="left"></td>
 	</tr>
    <tr>
		<td align="left">Category</td>
        <td align="center">App/Basic field</td>
		<td align="center">No</td>
        <td align="left">
            categories<br>
            main_category<br>
            main_category_en<br>
            categories_tags<br>
            categories_en
        </td>
        <td align="left">It is necessary to analyze these fields more in details.</td>
 	</tr>
    <tr>
		<td align="left">Brand</td>
        <td align="center">App/Basic field</td>
		<td align="center">No</td>
        <td align="left">
            brands<br>
            brands_tags
        </td>
        <td align="left">It is necessary to analyze these fields more in details.</td>
 	</tr>
    <tr>
		<td align="left">Image</td>
        <td align="center">App/Basic field</td>
		<td align="center">No</td>
        <td align="left">image_url</td>
        <td align="left"></td>
 	</tr>
</table>