In [1]:
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection

import seaborn as sns

In [2]:
def correlation_graph(pca, 
                      x_y, 
                      features) : 
    """Affiche le graphe des correlations

    Positional arguments : 
    -----------------------------------
    pca : sklearn.decomposition.PCA : notre objet PCA qui a été fit
    x_y : list ou tuple : le couple x,y des plans à afficher, exemple [0,1] pour F1, F2
    features : list ou tuple : la liste des features (ie des dimensions) à représenter
    """

    # Extrait x et y 
    x,y=x_y

    # Taille de l'image (en inches)
    fig, ax = plt.subplots(figsize=(10, 9))

    # Pour chaque composante : 
    for i in range(0, pca.components_.shape[1]):

        # Les flèches
        ax.arrow(0,0, 
                pca.components_[x, i],  
                pca.components_[y, i],  
                head_width=0.07,
                head_length=0.07, 
                width=0.02, )

        # Les labels
        plt.text(pca.components_[x, i] + 0.05,
                pca.components_[y, i] + 0.05,
                features[i])
        
    # Affichage des lignes horizontales et verticales
    plt.plot([-1, 1], [0, 0], color='grey', ls='--')
    plt.plot([0, 0], [-1, 1], color='grey', ls='--')

    # Nom des axes, avec le pourcentage d'inertie expliqué
    plt.xlabel('F{} ({}%)'.format(x+1, round(100*pca.explained_variance_ratio_[x],1)))
    plt.ylabel('F{} ({}%)'.format(y+1, round(100*pca.explained_variance_ratio_[y],1)))

    # J'ai copié collé le code sans le lire
    plt.title("Cercle des corrélations (F{} et F{})".format(x+1, y+1))

    # Le cercle 
    an = np.linspace(0, 2 * np.pi, 100)
    plt.plot(np.cos(an), np.sin(an))  # Add a unit circle for scale

    # Axes et display
    plt.axis('equal')
    plt.show(block=False)

## 8. ACP & clustering
----- Data cleaning and preparation

In [3]:
### 8.1. Imports
_subdata_nutrigrades_notNaN=pd.read_csv("/Users/souha_kassab/OC_project2/static/csv/20230107_cleandata_for_nutriscore.csv", low_memory=False)

In [4]:
### 8.2. Data preparation
print( 'Data shape is', _subdata_nutrigrades_notNaN.shape) 
subdata_nutrigrades_notNaN.info

Data shape is (111213, 13)


<bound method DataFrame.info of         Unnamed: 0  additives_n  energy_100g  saturated_fat_100g  sugars_100g  \
0                3          2.0       1833.0                4.69        15.62   
1                4          1.0       2230.0                5.00         3.33   
2                9          1.0       2092.0                6.67        30.00   
3               11          2.0       2372.0                3.33         6.67   
4               12          3.0       1954.0                2.22        33.33   
...            ...          ...          ...                 ...          ...   
111208      221194          0.0        444.0                0.00         0.00   
111209      221197          1.0       2111.0                3.80         5.30   
111210      221198          1.0        660.0                0.50        16.70   
111211      221200          5.0       1031.0                1.28         0.10   
111212      221202          1.0       1393.0                2.78        30.56

In [5]:
# Missing and Duplicated values
subdata_nutrigrades_notNaN.isna().sum()
#_subdata_nutrigrades_notNaN.duplicated().sum()

Unnamed: 0                    0
additives_n                 549
energy_100g                   4
saturated_fat_100g            4
sugars_100g                   4
fiber_100g                  435
proteins_100g                 4
salt_100g                     4
sodium_100g                   4
nutrition_score_fr_100g       0
nutrition_grade_fr            0
additives_tags             1037
product_name                  0
dtype: int64

In [6]:
subdata_nutrigrades_notNaN.columns
subdata_nutrigrades_notNaN.rename(columns = {'Unnamed: 0':'Original_Index'}, inplace = True)

NameError: name 'subdata_nutrigrades_notNaN' is not defined

In [None]:
# Data description
print("data's unique values", _subdata_nutrigrades_notNaN.nunique())
subdata_nutrigrades_notNaN.describe(include='all')

In [None]:
# select some columns to work on20230102_cleandata_for_nutriscore
_subdata_nutrigrades_notNaN.dropna(inplace=True)

In [None]:
# Gerer les outliers!!!

### 8.4. Data separation 
Nous allons ensuite séparer nos données. D'un coté X la matrice des données et nos colonnes (features) dans une variable features :  : 

In [None]:
X = _subdata_nutrigrades_notNaN.values
X[:5]
print(X.shape)

In [None]:
components = _subdata_nutrigrades_notNaN.columns
components

### 8.5. PCA

In [None]:
#### Scaler instanciation
scaler = StandardScaler()

In [None]:
X[0, 1:7]

In [None]:
#### Scale & Transform
X_scaled = scaler.fit_transform(X[:, 1:7])
idx = ["mean", "std"]
pd.DataFrame(X_scaled).describe().round(2).loc[idx, :]

In [None]:
n_components = 6
pca = PCA(n_components=n_components)

In [None]:
pca.fit(X_scaled)

In [None]:
pca.explained_variance_ratio_

In [None]:
scree = (pca.explained_variance_ratio_*100).round(2)
scree

In [None]:
scree_cum = scree.cumsum().round()
scree_cum

In [None]:
x_list = range(1, n_components+1)
list(x_list)

In [None]:
plt.bar(x_list, scree)
plt.plot(x_list, scree_cum,c="red",marker='o')
plt.xlabel("rang de l'axe d'inertie")
plt.ylabel("pourcentage d'inertie")
plt.title("Eboulis des valeurs propres")
plt.show(block=False)

#### Doing PCA using only nutritive components 

In [None]:
XX = X[:, 2:7]

In [None]:
#### Scale & Transform
X_scaled = scaler.fit_transform(XX)
idx = ["mean", "std"]
pd.DataFrame(X_scaled).describe().round(2).loc[idx, :]

In [None]:
# taking first 6 components
n_components = 5
pca = PCA(n_components)

In [None]:
pca?

In [None]:
pca.fit(X_scaled)

In [None]:
pca.explained_variance_ratio_

In [None]:
scree = (pca.explained_variance_ratio_*100).round(2)
scree

In [None]:
scree_cum = scree.cumsum().round()
scree_cum

In [None]:
x_list = range(1, n_components+1)
list(x_list)

In [None]:
plt.bar(x_list, scree)
plt.plot(x_list, scree_cum,c="red",marker='o')
plt.xlabel("rang de l'axe d'inertie")
plt.ylabel("pourcentage d'inertie")
plt.title("Eboulis des valeurs propres")
plt.show(block=False)

In [None]:
x_y = (0,1)

In [None]:
# correlation of F1 and F2
correlation_graph(pca, x_y, components[2:8])

On peut deja voir que les trois composantes des points negatives sont fortement correlees a F1, F1 peut etre vu comme l'ensemble des elements defavorables du score nutrtionnel
tandis que F2 est l'ensemble des elements favorables du score nutritionnel auquel sont fortement correlees les proteines et les fibres.
On s'abstient pour ce cerle d'interpreter les petites fleches!(sels)

In [None]:
# correlation of F3 and F4
correlation_graph(pca, (2,3), components)

In [None]:
### Components

In [None]:
### 8.6 Projection
X_proj = pca.transform(X_scaled)
X_proj[:5]

___________________________________________________________________________________________

In [None]:
from urllib import request
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [None]:
# Function to compute median value per variable for the 5 grades of Nutriscore (a,b,c,d,e)
def save_centralTendencies(_nutriscore_df):
    # Calcul medianne par categorie de nutriscore et type de composant 
    dico_mediane = {_nutriscore_df.columns[var_composant]: _nutriscore_df.iloc[:, var_composant].median()
                     for var_composant in range(1, 12)}
    return dico_mediane

In [None]:
_subdata_nutrigrades_notNaN.isna().mean()

In [None]:
_subdata_nutrigrades_notNaN.describe()

In [None]:
# Projecting a variable on a table
effectifs = _subdata_nutrigrades_notNaN["sugars_100g"].value_counts()
modalites = effectifs.index # l'index de effectifs contient les modalités

tab = pd.DataFrame(modalites, columns = ["sugar_quantity"]) # création du tableau à partir des modalités
tab["n"] = effectifs.values
tab["f"] = tab["n"] / len(_subdata_nutrigrades_notNaN) # len(data) renvoie la taille de l'échantillon

display(tab)

In [None]:
tab[tab['sugar_quantity']>100]['n'].sum()

In [None]:
_subdata_nutrigrades_notNaN[_subdata_nutrigrades_notNaN.sugars_100g > 0]

In [None]:
# Diagramme de dispersion
_subdata_nutrigrades_notNaN = _subdata_nutrigrades_notNaN[_subdata_nutrigrades_notNaN.sugars_100g > 0]
plt.plot(_subdata_nutrigrades_notNaN["nutrition_score_fr_100g"],_subdata_nutrigrades_notNaN["sugars_100g"],'o')
plt.xlabel("Nutriscore (fr)")
plt.ylabel("Sugar quantity")
plt.ylim(0, 100)
plt.show()


We can notice the low amount of sugar in product with good nutrigrade (nutriscore =-15 >> -5, where sugar quatity is <20g. we see a concentration of sugar values >80g between nutriscore +10 >> 30, whilst it diminishes for nutriscores >30g. The reason for this can be the initial data collection (type of products collected contains more "average" nutriscore products initially) or a possible correlation between bad nutriscore and another component than sugar. We can actually distinguish two tendencies, one of an increasing linearity between sugar quantity and nutriscore, up until a "inflexion point" around nutriscore = 15 where the tendency is rather to a decreasing linear relation

In [None]:
import numpy as np

In [None]:
taille_classe = 5
groupes = [] # va recevoir les données agrégées à afficher
#for nutriVal in _subdata_nutrigrades_notNaN['nutrition_score_fr_100g'].unique():
    
# on calcule des tranches allant de -15 au nutriscore maximum 
tranches = np.arange(-15, 40, taille_classe)

tranches += int(taille_classe/2) # on décale les tranches d'une demi taille de classe
indices = np.digitize(_subdata_nutrigrades_notNaN["nutrition_score_fr_100g"], tranches) # associe chaque solde à son numéro de classe  

In [None]:
 # sélection des individus de la tranche ind
del nutriscore

In [None]:
#for nutriVal in _subdata_nutrigrades_notNaN['nutrition_score_fr_100g'].unique():
for ind, tr in enumerate(tranches): 
    print (ind, tr)
    sugar_qtity = _subdata_nutrigrades_notNaN.loc[indices==ind,"sugars_100g"]
    if len(sugar_qtity) > 0:
        g = {
            'valeurs':sugar_qtity,
            'centre_classe': tr-(taille_classe/2),
            'taille': len(sugar_qtity),
            'quartiles': [np.percentile(sugar_qtity,p) for p in [25,50,75]]
        }
        groupes.append(g)

In [None]:
plt.figure(figsize=(10,7))
        
# affichage des boxplots

plt.boxplot([g["valeurs"] for g in groupes],
            positions= [g["centre_classe"] for g in groupes], # abscisses des boxplots
            showfliers= False, # on ne prend pas en compte les outliers
            widths= taille_classe*0.7) # largeur graphique des boxplots

# affichage des effectifs de chaque classe
for g in groupes:
    plt.text(g["centre_classe"],+90,"(n={})".format(g["taille"]),horizontalalignment='center',verticalalignment='top')     
plt.show()

en affichant les effectifs, il parait en effet que les produits de nutriscore plus eleve (lower nutrigrade) sont plus presents dans notre echantillon que le reste.--> besoin de normaliser?

In [None]:
import scipy.stats as st

In [None]:
print(st.pearsonr(_subdata_nutrigrades_notNaN['nutrition_score_fr_100g'],_subdata_nutrigrades_notNaN['sugars_100g'])[0])
print(np.cov(_subdata_nutrigrades_notNaN['nutrition_score_fr_100g'],_subdata_nutrigrades_notNaN['sugars_100g'],ddof=0)[1,0])

Le coefficient de Pearson revele une bonne correlation entre la qutite de sucre et le nutriscore. Une relation lineaire d'ensemble semble se degager

In [None]:
print('Skewness of the nutriscore values: ',_subdata_nutrigrades_notNaN['nutrition_score_fr_100g'].skew())
print('Skewness of sugar quantity values: ', _subdata_nutrigrades_notNaN['sugars_100g'].skew())

In [None]:
_subdata_nutrigrades_notNaN['nutrition_score_fr_100g']

### Regression lineaire

In [None]:
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [None]:
setSize = int(len(_subdata_nutrigrades_notNaN['sugars_100g'])/2)

In [None]:
sugar_qtity_train = _subdata_nutrigrades_notNaN.loc[:setSize, ['sugars_100g']]
#sugar_qtity_train = np.array(sugar_qtity_train).reshape(-1, 1)
sugar_qtity_test = _subdata_nutrigrades_notNaN.loc[setSize:, ['sugars_100g']]

# Split the targets into training/testing sets
nutriscore_train = _subdata_nutrigrades_notNaN.loc[:setSize,['nutrition_score_fr_100g']]
nutriscore_test = _subdata_nutrigrades_notNaN.loc[setSize:, ['nutrition_score_fr_100g']]

# Create linear regression object
regr = LinearRegression()

In [None]:
np.newaxis?

In [None]:
# Train the model using the training sets
regr.fit(sugar_qtity_train, nutriscore_train)

# Make predictions using the testing set
nutriscore_pred = regr.predict(sugar_qtity_test)

In [None]:
# Plot outputs
plt.scatter(sugar_qtity_train, nutriscore_train, color="black", linewidth = 1)
plt.plot(sugar_qtity_test, nutriscore_pred, color="blue", linewidth=3)
plt.xlim (0, 100)
plt.ylim(-15, 40)
plt.show()

In [None]:
plt.scatter

In [None]:
# Train the model using the training sets
regr.fit(sugar_qtity_train, nutriscore_train)

# Make predictions using the testing set
nutriscore_pred = regr.predict(sugar_qtity_test)

# The coefficients
print("Coefficients: \n", regr.coef_)
# The mean squared error
#print("Mean squared error: %.2f" % reg.get_params (nutriscore_test, nutriscore_pred))
# The coefficient of determination: 1 is perfect prediction
#print("Coefficient of determination: %.2f" % (nutriscore_test, nutriscore_pred))

Need to drop the outlier and try again

In [None]:
x_train, x_test, y_train, y_test = train_test_split(_subdata_nutrigrades_notNaN.loc[:, ['sugars_100g']], _subdata_nutrigrades_notNaN.loc[:, ['nutrition_score_fr_100g']] )

In [None]:
Lr = LinearRegression()
Lr.fit(x_train.values.reshape(-1, 1), y_train.values)

In [None]:
plt.scatter(x_train, y_train, label = 'Training data', color = 'red', alpha=.7)
plt.scatter (x_test, y_test, label ='Testing data', color ='g', alpha=.7)
plt.legend()
plt.title('Test Train Split')
plt.show()

In [None]:
prediction = Lr.predict(x_test.values.reshape(-1, 1))
plt.plot(x_test, prediction, label ='Linear Regression', color ='b')
plt.scatter(x_test, y_test, label ='Test data', color ='g', alpha=.7)
plt.legend()
plt.show()

In [None]:
Lr.score(x_test.values.reshape(-1, 1), y_test.values)

In [None]:
Lr.get_params(deep=True)

We try to drop the outliers to see how and if score changes: slight increase in r2 from 0.27 to 0.28

In [None]:
#Sugar outlier 
iOutlier_sugar =_subdata_nutrigrades_notNaN[_subdata_nutrigrades_notNaN.sugars_100g>150]
iOutlier_sugar
_subdata_nutrigrades_notNaN.drop(iOutlier_sugar.index, axis=0, inplace=True)

#for iNeg in range(NegativeVal_sugar).unique():
i=0
NegativeVal_sugar =_subdata_openfood_quantitativeVal[_subdata_openfood_quantitativeVal.sugars_100g<0]

for iNeg in NegativeVal_sugar['nutrigrade_ABCDE'].unique():
    #i +=1
    NegativeVal_sugar.loc[NegativeVal_sugar['nutrigrade_ABCDE']==iNeg,
                      'sugars_100g'] =  grandDicodesMedianes[iNeg]['sugars_100g'] 
for index, value in zip(list(NegativeVal_sugar['sugars_100g'].index), list(NegativeVal_sugar['sugars_100g'].values)):
    _subdata_openfood_quantitativeVal.loc[index,'sugars_100g'] = value

In [None]:
x_train, x_test, y_train, y_test = train_test_split(_subdata_nutrigrades_notNaN.loc[:, ['sugars_100g']], _subdata_nutrigrades_notNaN.loc[:, ['nutrition_score_fr_100g']] )


In [None]:
# Diagramme en secteurs
_subdata_openfood_quantitativeVal["nutrigrade_ABCDE"].value_counts(normalize=True).plot(kind='pie')
#Cette ligne assure que le pie chart est un cercle plutôt qu'une éllipse
plt.axis('equal') 
plt.show() # Affiche le graphique

#plt.plot(_subdata_with_nutriscore_notNaN["nutrition_score_fr_100g"])
# Diagramme en tuyaux d'orgues
#_subdata_openfood_quantitativeVal["nutrigrade_ABCDE"].value_counts(normalize=True).plot(kind='bar')
#plt.show()

In [None]:
liste_composantes_191Nan = [data_openfood.sodium_100g, data_openfood.salt_100g, data_openfood.sugars_100g, data_openfood]

In [None]:
effectifs = data_openfood['nom_produit'].value_counts()
modalities = effectifs.index

#### On estime ainsi que le calcul du nutriscore est valide

In [None]:
count_nutrigrade = _subdata_openfood_quantitativeVal['nutrigrade_ABCDE'].value_counts()
ax = sns.barplot(x=count_additives.index, y = count_nutrigrade.values)
ax.bar_label(ax.containers[0])

In [None]:
plt.figure(figsize=(10,10))

count_additives = (_subdata_openfood_quantitativeVal['additives_n'].value_counts())
ax = sns.barplot(x=count_additives.index, y = count_additives.values)
#ax.bar_label(ax.contanuiners[0])

In [None]:
plt.figure(figsize=(10,5))
additives_qtity = _subdata_openfood_quantitativeVal['additives_n'].value_counts().index
#additives_qtity
sns.countplot(y='additives_n', order =additives_qtity , data=_subdata_openfood_quantitativeVal)

 After first filtering and sub-slicing, can't drop duplicated values anymore!!!

In [None]:
_subdata_openfood_quantitativeVal["nutrition_score_fr_100g"].hist(density=True)
plt.show()

In [None]:
i_categories= data_openfood.categories.value_counts

In [None]:
del splitted_categories

In [None]:
type(categories)

In [None]:
L = ['A', 'B', 'A,B']
L2 = []
for i in L:
    L2.extend(i.split(','))
L2

In [None]:
ListOfCategories = list(i_categories.index)
splitted_ListOfCategories = []
for categ in ListOfCategories:
    splitted_ListOfCategories.extend(categ.split(','))
#sorted(splitted_ListOfCategories)
sorted_ListOfCategories =sorted([ind.strip() for ind in splitted_ListOfCategories]) 
splitted_ListOfCategories = sorted(list(set(sorted_ListOfCategories)))
del categ
print (len(splitted_ListOfCategories))

In [None]:
splitted_ListOfCategories =sorted([ind.strip() for ind in splitted_ListOfCategories]) 

In [None]:
len(sorted(list(set(sorted_ListOfCategories))))

In [None]:
count_nutrigrade = _subdata_openfood_quantitativeVal['nutrigrade_ABCDE'].value_counts()
ax = sns.barplot(x=count_additives.index, y = count_nutrigrade.values)
ax.bar_label(ax.containers[0])

In [None]:
with sns.axes_style('white'):
    sns.jointplot(data=_subdata_openfood_quantitativeVal, x= _subdata_openfood_quantitativeVal['nutrigrade_ABCDE'].values_count(), y= _subdata_openfood_quantitativeVal['additives_n'], kind='reg')

In [None]:
_subdata_openfood_quantitativeVal.pivot_table('product_name', index='nutrigrade_ABCDE', columns = 'additives_n', aggfunc='sum')

In [None]:
sns.pairplot?

In [None]:
list(reversed(range(1,11)))

In [None]:
my_list = [2, 'apple', 3.5]
my_list

In [None]:
my_list[1] = 'orange'
my_list

In [None]:
y = "stuff;thing;junk"
z=y.split(';')
len(z)

In [None]:
a=np.array([1,2,3])
a

In [None]:
b = np.array([4,5,6])
c=a*b
c

In [None]:
d=np.dot(a,b)
d

Regression lineaire

In [None]:
# Pour selectionner des donnees aleatoires dans notre base de donnees en vue de creer le training set
data_size= len(df_nutrisc_sucre)
sample = np.random.randint(data_size, size=int(data_size*0.1) )
sampledData_for_train = df_nutrisc_sucre.iloc[sample]

In [None]:
sampledData_for_train['sugars_100g']
X = np.matrix([np.ones(sampledData_for_train['sugars_100g'].shape[0]), sampledData_for_train['sugars_100g']]).T
y = np.matrix(sampledData_for_train.nutrition_score_fr_100g).T
y[:10]

In [None]:
# On effectue le calcul exact du paramètre theta
theta = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y)

print(theta)

In [None]:
plt.xlabel('Sugar quantity')
plt.ylabel(' Nutriscore')

plt.plot( df_nutrisc_sucre['sugars_100g'], df_nutrisc_sucre['nutrition_score_fr_100g'],
         'ro', markersize=4)

# On affiche la droite entre 0 et 250 (en surface)
plt.plot([0,100], [theta.item(0), theta.item(0) + 20 * theta.item(1)], linestyle='--', c='#000000')

plt.show()

In [None]:
from sklearn import linear_model
x_train, x_test, y_train, y_test = train_test_split(df_nutriscore_components['sugars_100g'], df_nutriscore_components['nutrition_score_fr_100g'], train_size=0.8)        
regr = linear_model.LinearRegression()
regr.fit(x_train.values.reshape(-1, 1), y_train.values)
regr.predict(x_test.values.reshape(-1, 1))
regr.get_params(deep=True)

In [None]:
regr.intercept_

In [None]:
FirstReg_score = regr.score(x_test.values.reshape(-1, 1), y_test.values)
FirstReg_score