In [1]:
import pandas as pd
import numpy as np
import warnings 
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import seaborn as sns
from sklearn.decomposition import PCA

In [2]:
warnings.filterwarnings('ignore')

In [3]:
food = pd.read_csv('openfoodfacts.csv', engine = 'python', sep = '\t', on_bad_lines = 'skip')

In [4]:
food.shape

(320767, 162)

1. Nettoyage des variables avec trop peu de valeurs
- il y a beaucoup de variables avec très peu de valeurs, on va retenir les variables avec plus 10% individus avec valeurs .

In [5]:
food.notna().sum()/len(food)

code                       0.999928
url                        0.999928
creator                    0.999994
created_t                  0.999991
created_datetime           0.999972
                             ...   
carbon-footprint_100g      0.000835
nutrition-score-fr_100g    0.689616
nutrition-score-uk_100g    0.689616
glycemic-index_100g        0.000000
water-hardness_100g        0.000000
Length: 162, dtype: float64

In [6]:
food_1 = food
for var in food:
    if food[var].isna().sum()/len(food) > 0.9:
        food_1 = food_1.drop([var], axis = 1)

In [7]:
food_1.shape

(320767, 62)

2. Nettoyage des variables inutiles pour l'application : Notre application utilisent les variables quantitive suivantes pour chaque type de population, les autres variabales quantitatives sont eliminées : Energie (kcal)	vitamine_A (microGrammes)	vitamine C (mg)	calcium (mg), fer (mg)	protein (g)	glucide/carbonhydrate (g)	lipide/gras (g)	sel max (g)	sucre max (g)

    - eliminer les variables temps: 
    - éliminer les variables doublons qui termine par '_tags', 'url',  '_fr' (sauf grade_fr)
    - éliminer les variables non signficatives pour l'application

In [8]:
food_2 = food_1.drop(['created_t', 'last_modified_t', 'last_modified_datetime', 'created_datetime'], axis = 1)

In [9]:
for var in food_2:
    if (var[-3:] == 'url' or var[-5:] == '_tags') :
        food_2 = food_2.drop([var], axis =1)

In [10]:
for var in food_2:
    if (var[-3:] == '_fr' and var[-8:] != 'grade_fr') :
        food_2 = food_2.drop([var], axis =1)

In [11]:
food_2 = food_2.drop(['creator','categories', 'quantity', 'generic_name','packaging','manufacturing_places',
                           'labels','ingredients_text','purchase_places','main_category',
                           'stores','ingredients_from_palm_oil_n','ingredients_that_may_be_from_palm_oil_n',
                           'additives_n','additives','states', 'pnns_groups_1','pnns_groups_2',
                           'nutrition-score-uk_100g', 'saturated-fat_100g', 'trans-fat_100g', 'cholesterol_100g', 'sodium_100g', 'fiber_100g'], axis =1)

In [12]:
food_2.to_csv('food_2.csv')

In [13]:
food_2.shape

(320767, 17)

3. Nettoyage des individus qui ne peuvent pas être utilisés par l'application:  
        - éliminier les individus avec code ou product_name vide ou 'serving_size' vide 

In [14]:
food_2['code'].isna().sum() | food_2['product_name'].isna().sum()

17783

In [15]:
food_2['serving_size'].isna().sum()

109441

In [16]:
food_3= food_2.loc[food_2['product_name'].notna() & food_2['code'].notna() & food_2['serving_size'].notna()]

In [17]:
food_3.shape

(209386, 17)

4. Remplacer les individus dont des valeurs de nutrition sont invalides (< 0, ou > 100) par la moyenne

In [18]:
food_4 = food_3

In [19]:
food_4['fat_100g'] = food_4['fat_100g'].apply(lambda x: food_4['fat_100g'].mean() if (x > 100 or x < 0) else x)
food_4['carbohydrates_100g'] = food_4['carbohydrates_100g'].apply(lambda x: food_4['carbohydrates_100g'].mean() if (x > 100 or x < 0) else x)
food_4['sugars_100g'] = food_4['sugars_100g'].apply(lambda x: food_4['sugars_100g'].mean() if (x > 100 or x < 0) else x)
food_4['vitamin-a_100g'] = food_4['vitamin-a_100g'].apply(lambda x: food_4['vitamin-a_100g'].mean() if (x > 100 or x < 0) else x)
food_4['vitamin-c_100g'] = food_4['vitamin-c_100g'].apply(lambda x: food_4['vitamin-c_100g'].mean() if (x > 100 or x < 0) else x)
food_4['calcium_100g'] = food_4['calcium_100g'].apply(lambda x: food_4['calcium_100g'].mean() if (x > 100 or x < 0) else x)
food_4['iron_100g'] = food_4['iron_100g'].apply(lambda x: food_4['iron_100g'].mean() if (x > 100 or x < 0) else x)
food_4['nutrition-score-fr_100g'] = food_4['nutrition-score-fr_100g'].apply(lambda x: food_4['nutrition-score-fr_100g'].mean() if x > 100 else x)
food_4['salt_100g'] = food_4['salt_100g'].apply(lambda x: food_4['salt_100g'].mean() if (x > 100 or x < 0) else x)
food_4['proteins_100g'] = food_4['proteins_100g'].apply(lambda x: food_4['proteins_100g'].mean() if (x > 100 or x < 0) else x)

In [20]:
food_4.shape

(209386, 17)

In [21]:
food_4.to_csv('food_4.csv')

5. traitement des valeurs manquantes soit par la moyenne, par la médiane et par la relation 

In [22]:
food_5 = food_4

In [23]:
food_5.isnull().sum().sum()

383567

In [24]:
food_5['energy_100g'].fillna(food_5['energy_100g'].mean(), inplace=True)
food_5['fat_100g'].fillna(food_5['fat_100g'].mean(), inplace=True)
food_5['carbohydrates_100g'].fillna(food_5['carbohydrates_100g'].mean(), inplace=True)
food_5['sugars_100g'].fillna(food_5['sugars_100g'].mean(), inplace=True)
food_5['vitamin-a_100g'].fillna(food_5['vitamin-a_100g'].mean(), inplace=True)
food_5['vitamin-c_100g'].fillna(food_5['vitamin-c_100g'].mean(), inplace=True)
food_5['calcium_100g'].fillna(food_5['calcium_100g'].mean(), inplace=True)
food_5['iron_100g'].fillna(food_5['iron_100g'].mean(), inplace=True)
food_5['salt_100g'].fillna(food_5['salt_100g'].mean(), inplace=True)
food_5['proteins_100g'].fillna(food_5['proteins_100g'].mean(), inplace=True)

In [25]:
food_5.isnull().sum().sum()

75365

In [26]:
food_5['nutrition-score-fr_100g'].fillna(food_5['nutrition-score-fr_100g'].median(), inplace=True)

In [27]:
food_5.isnull().sum().sum()

38744

remplace la valeur manquante de nutrition_grade_fr via la relation entre nutrition_grade_fr et nutrition-score-fr_100g : 
    a < 0, 0<=b < 3, 3<=c <=10, 10 < d < 20, e >= 20. 

In [28]:
def condition(x):
    if x < 0:
        return "a"
    elif x>=0 and x<3:
        return "b"
    elif x>=3 and x<=10:
        return "c"
    elif x>10 and x<20:
        return "d"
    else:
        return "e"
food_5['nutrition_grade_fr'] = food_5['nutrition-score-fr_100g'].apply(condition)

In [29]:
food_5 = food_5.drop(['nutrition-score-fr_100g'], axis = 1)

In [30]:
food_5.isnull().sum().sum()

2123

In [31]:
food_5.to_csv('food_5.csv')

In [32]:
food_5.shape

(209386, 16)

6: nettoyage:   convertir valeur serving_size en numérique : garder les individus dont serving_size contient 'g' 'ml' ou 'G' et extrait la valeur numérique pour uniformiser la valeur en gramme, écarter les individus qui ne contiennet pas 'g', 'G' ou 'ml'. 

In [33]:
food_6 = food_5

In [34]:
i = 0
j = len(food_6) - 1
while i < j: 
# convertir le content en mots séparés,  prendre le premier mot comme valeur en gramme.
    mylist = food_6['serving_size'].iloc[i].split(" ")
    if len(mylist) > 1:
        if mylist[1] == 'g' or mylist[1] == 'ml' or mylist[1] == 'G' : 
            food_6['serving_size'].iloc[i] = mylist[0]
            i+=1
        else: 
            food_6 = food_6.drop(index=food_6.iloc[i].name)
            j-=1
    else: 
        food_6 = food_6.drop(index=food_6.iloc[i].name)
        j-=1
    

In [35]:
food_6.shape

(199562, 16)

In [36]:
food_6.to_csv('food_6.csv')

7:  traitement des outliers

In [37]:
Q1 = food_6.quantile(0.25)
Q3 = food_6.quantile(0.75)
IQR = Q3 - Q1

# Remove outliers
food_7 = food_6[~((food_6 < (Q1 - 1.5 * IQR)) |(food_6 > (Q3 + 1.5 * IQR))).any(axis=1)]

In [38]:
food_7.shape

(145582, 16)

In [39]:
food_7.duplicated().sum()

0

In [40]:
food_7.notna().sum()/len(food_7)

code                  1.000000
product_name          1.000000
brands                0.989525
countries             0.999876
serving_size          1.000000
nutrition_grade_fr    1.000000
energy_100g           1.000000
fat_100g              1.000000
carbohydrates_100g    1.000000
sugars_100g           1.000000
proteins_100g         1.000000
salt_100g             1.000000
vitamin-a_100g        1.000000
vitamin-c_100g        1.000000
calcium_100g          1.000000
iron_100g             1.000000
dtype: float64

In [44]:
food_7.to_csv('food_7.csv')

In [42]:
food_8 = pd.read_csv('food_7.csv')

In [43]:
def conditions(y):
    asia = [ "Thailand","Phlippine","India","Indonesia","en:CH","Singapore","Chine","Hong Kong","Japon"]
    europe = ["Fran","Espa","Spain","Germany","land","Allemagne","Belg","Portugal","Pol","Norway", "Austria", \
              "UK","Royaume-uni","Ita","uisse","witzerland","Pays-Bas","United Kingdom",  \
              "en:DE","en:FR", "en:ES", "en:GB","en:IT","en:PT","en:EU", "en:BE","Danemark", "European"]
    america = ["US","en:US","en:CA","United States","Bra","Canada","USA","xico","Chil"]

    if any([x in y for x in asia]):
        return "Asia"
    elif any([x in y for x in europe]):
        return "Europe"
    elif any([x in y for x in america]): 
        return "America"
    else: 
        return "World"
    
food_8['continent'] = food_8['countries'].apply(conditions)

TypeError: argument of type 'float' is not iterable

In [None]:
food_8.to_csv('food_8.csv')