In [None]:
# Optional: install the project's dependencies into this kernel's environment.
# %pip install -r ../requirements.txt

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the preprocessed data points and preview the first two rows.
df = pd.read_csv(
    '../data/final_preprocessed_data_points.csv',
)
df.head(n=2)

In [None]:
# Show the column labels together with the frame's shape as a (columns, shape) tuple
df.columns, df.shape

In [None]:
# The `*_100g` columns whose pairwise correlations are examined in this notebook.
columns_of_interest = [
    'energy-kj_100g',
    'sugars_100g',
    'salt_100g',
    'saturated-fat_100g',
    'proteins_100g',
    'fiber_100g',
    'fruits-vegetables-nuts-estimate-from-ingredients_100g',
    'sodium_100g',
]

# Feature subset used below; the nutrition-score column is kept separately as `y`.
df1 = df[columns_of_interest]
y = df['nutrition-score-fr_100g']
df1.head(n=2)

In [None]:
# Pairwise correlations between the selected feature columns.
correlation_matrix = df1.corr()

# Render the matrix as an annotated heatmap on an explicitly created 5x5-inch axes.
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Heatmap of Nutritional Values')
plt.show()

In [None]:
def get_top_correlations_blog(df, threshold=0.3):
    """
    Rank the column pairs of ``df`` by the strength of their correlation.

    df: the dataframe to get correlations from
    threshold: the cut-off applied to the absolute correlation coefficient.
    For example, if this is 0.4, only pairs having a correlation coefficient
    greater than 0.4 or less than -0.4 will be included in the results.

    Returns a dataframe indexed by ('Variable 1', 'Variable 2') with a single
    'Correlation' column holding the signed coefficient. Rows are ordered from
    the strongest to the weakest absolute correlation; each unordered pair
    appears once and self-correlations are left out. When no pair exceeds the
    threshold, an empty dataframe with the same index names and column is
    returned.
    """
    orig_corr = df.corr()

    # Long format: one entry per ordered pair of labels, strongest first.
    # sort_values puts NaN last, so the first entry that fails the threshold
    # test means every remaining entry fails it as well.
    ranked = orig_corr.abs().unstack().sort_values(ascending=False)

    seen_pairs = set()
    records = []
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for (var_1, var_2), strength in ranked.items():
        if not strength > threshold:
            break
        pair = frozenset((var_1, var_2))
        # Skip self-correlations and the mirrored duplicate of a pair already kept.
        if var_1 == var_2 or pair in seen_pairs:
            continue
        seen_pairs.add(pair)
        records.append((var_1, var_2, orig_corr.loc[var_1, var_2]))

    # Build the frame once instead of enlarging it row by row; declaring the
    # columns up front keeps set_index valid even when `records` is empty.
    result = pd.DataFrame(records, columns=['Variable 1', 'Variable 2', 'Correlation'])
    return result.set_index(['Variable 1', 'Variable 2'])

In [None]:
# Preview the first rows of the selected feature columns
df1.head()

In [None]:
# Rank the feature pairs whose absolute correlation exceeds 0.3, then display the table.
top_corr = get_top_correlations_blog(df=df1, threshold=0.3)
top_corr

In [None]:
# View the per-variable distributions and the pairwise scatter plots of the selected features.
# `sns.pairplot` is a figure-level function that creates its own figure, so no
# separate `plt.figure(...)` call is made: one placed after it would not resize
# the pairplot and would only add a blank extra figure to the output.
sns.pairplot(df1)
plt.show()

Some observations:
* Salt and sodium are perfectly correlated (r ≈ 1.00), so either one can be omitted
* Energy and saturated fat are moderately correlated (r ≈ 0.50)
* Fiber and proteins are moderately correlated (r ≈ 0.50)
* Energy and proteins are moderately correlated (r ≈ 0.46)

Our decisions:
* Drop salt and keep sodium, since the two carry the same information
* Drop energy, because it is correlated with both proteins and saturated fat
* Maximise $\frac{1}{2}(\text{fiber} + \text{proteins})$, averaging the two because they are correlated with each other
* Assign fruits-vegetables-nuts a weight of 1, since it is only weakly correlated with all the other variables