In [2]:
# Step 1: Import necessary libraries
import pandas as pd
import re

In [3]:
# Step 2: Load the dataset
# Reads the menu CSV into `df`. The path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv('Mc.Donalds_menu.csv')

In [4]:
# Quick size check: (number of rows, number of columns).
df.shape

(260, 24)

In [5]:
# Preview the first five rows to eyeball the raw values.
df.head(5)

Unnamed: 0,Category,Item,Serving Size,Calories,Calories from Fat,Total Fat,Total Fat (% Daily Value),Saturated Fat,Saturated Fat (% Daily Value),Trans Fat,...,Carbohydrates,Carbohydrates (% Daily Value),Dietary Fiber,Dietary Fiber (% Daily Value),Sugars,Protein,Vitamin A (% Daily Value),Vitamin C (% Daily Value),Calcium (% Daily Value),Iron (% Daily Value)
0,Breakfast,Egg McMuffin,4.8 oz (136 g),300,120,13.0,20,5.0,25,0.0,...,31,10,4,17,3,17,10,0,25,15
1,Breakfast,Egg White Delight,4.8 oz (135 g),250,70,8.0,12,3.0,15,0.0,...,30,10,4,17,3,18,6,0,25,8
2,Breakfast,Sausage McMuffin,3.9 oz (111 g),370,200,23.0,35,8.0,42,0.0,...,29,10,4,17,2,14,8,0,25,10
3,Breakfast,Sausage McMuffin with Egg,5.7 oz (161 g),450,250,28.0,43,10.0,52,0.0,...,30,10,4,17,2,21,15,0,30,15
4,Breakfast,Sausage McMuffin with Egg Whites,5.7 oz (161 g),400,210,23.0,35,8.0,42,0.0,...,30,10,4,17,2,21,6,0,25,10


In [6]:
# List the column labels available for analysis.
df.columns

Index(['Category', 'Item', 'Serving Size', 'Calories', 'Calories from Fat',
       'Total Fat', 'Total Fat (% Daily Value)', 'Saturated Fat',
       'Saturated Fat (% Daily Value)', 'Trans Fat', 'Cholesterol',
       'Cholesterol (% Daily Value)', 'Sodium', 'Sodium (% Daily Value)',
       'Carbohydrates', 'Carbohydrates (% Daily Value)', 'Dietary Fiber',
       'Dietary Fiber (% Daily Value)', 'Sugars', 'Protein',
       'Vitamin A (% Daily Value)', 'Vitamin C (% Daily Value)',
       'Calcium (% Daily Value)', 'Iron (% Daily Value)'],
      dtype='object')

In [7]:
# Approximate millilitres in one US fluid ounce. Volumes are converted to
# grams under the assumption that the product is water-like (1 ml ~ 1 g).
_ML_PER_FL_OZ = 29.57

# Each pattern captures a whole or decimal quantity followed by its unit.
# The gram pattern ends on a word boundary so that words which merely start
# with "g" (e.g. "gallons") are not mistaken for grams.
_GRAMS_RE = re.compile(r'(\d+(?:\.\d+)?)\s*g(?:rams?)?\b', re.IGNORECASE)
_FL_OZ_RE = re.compile(r'(\d+(?:\.\d+)?)\s*fl\.?\s*oz', re.IGNORECASE)
_ML_RE = re.compile(r'(\d+(?:\.\d+)?)\s*ml', re.IGNORECASE)


def extract_grams_extended(serving):
    """Parse a free-text serving size and return its weight in whole grams.

    Units are tried in priority order, and the first one found wins:

    1. grams        -- e.g. ``"4.8 oz (136 g)"``    -> ``136``
    2. fluid ounces -- e.g. ``"16 fl oz cup"``      -> ``473`` (x 29.57)
    3. millilitres  -- e.g. ``"1 carton (236 ml)"`` -> ``236`` (1 ml ~ 1 g)

    Matching is case-insensitive and accepts decimal quantities. Fractional
    results are truncated toward zero so the return value stays an ``int``.

    Parameters
    ----------
    serving : object
        The raw serving-size value. It is converted with ``str()`` first, so
        non-string inputs (``None``, ``NaN``) are accepted and yield ``None``
        because they contain no recognisable quantity.

    Returns
    -------
    int or None
        Serving size in grams, or ``None`` when no supported unit is found.
    """
    text = str(serving)

    # An explicit gram figure is the most reliable, so prefer it.
    match_g = _GRAMS_RE.search(text)
    if match_g:
        return int(float(match_g.group(1)))

    # Drinks are usually given by volume; convert fl oz -> ml -> grams.
    match_oz = _FL_OZ_RE.search(text)
    if match_oz:
        return int(float(match_oz.group(1)) * _ML_PER_FL_OZ)

    # Millilitres map 1:1 to grams under the water-like assumption.
    match_ml = _ML_RE.search(text)
    if match_ml:
        return int(float(match_ml.group(1)))

    return None


In [8]:
# Derive a numeric serving-size column from the free-text 'Serving Size' field.
serving_grams = df['Serving Size'].apply(extract_grams_extended)
df['Serving Size (g)'] = serving_grams

# Show the raw text beside the parsed value, then count rows that failed to parse.
preview_columns = ['Serving Size', 'Serving Size (g)']
print(df[preview_columns].head())
missing_count = df['Serving Size (g)'].isnull().sum()
print("Missing values in 'Serving Size (g)':", missing_count)

     Serving Size  Serving Size (g)
0  4.8 oz (136 g)               136
1  4.8 oz (135 g)               135
2  3.9 oz (111 g)               111
3  5.7 oz (161 g)               161
4  5.7 oz (161 g)               161
Missing values in 'Serving Size (g)': 0


In [9]:
# Per-column non-null counts and dtypes, to check for missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260 entries, 0 to 259
Data columns (total 25 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Category                       260 non-null    object 
 1   Item                           260 non-null    object 
 2   Serving Size                   260 non-null    object 
 3   Calories                       260 non-null    int64  
 4   Calories from Fat              260 non-null    int64  
 5   Total Fat                      260 non-null    float64
 6   Total Fat (% Daily Value)      260 non-null    int64  
 7   Saturated Fat                  260 non-null    float64
 8   Saturated Fat (% Daily Value)  260 non-null    int64  
 9   Trans Fat                      260 non-null    float64
 10  Cholesterol                    260 non-null    int64  
 11  Cholesterol (% Daily Value)    260 non-null    int64  
 12  Sodium                         260 non-null    int

In [10]:
# Report the dtype of every column under a heading.
# sep="\n" puts the dtype listing on its own line, which leaves exactly one
# blank line between the heading and the listing.
print("Column Data Types:\n", df.dtypes, sep="\n")

Column Data Types:

Category                          object
Item                              object
Serving Size                      object
Calories                           int64
Calories from Fat                  int64
Total Fat                        float64
Total Fat (% Daily Value)          int64
Saturated Fat                    float64
Saturated Fat (% Daily Value)      int64
Trans Fat                        float64
Cholesterol                        int64
Cholesterol (% Daily Value)        int64
Sodium                             int64
Sodium (% Daily Value)             int64
Carbohydrates                      int64
Carbohydrates (% Daily Value)      int64
Dietary Fiber                      int64
Dietary Fiber (% Daily Value)      int64
Sugars                             int64
Protein                            int64
Vitamin A (% Daily Value)          int64
Vitamin C (% Daily Value)          int64
Calcium (% Daily Value)            int64
Iron (% Daily Value)               in

In [11]:
# Correlation of every numeric column with 'Calories'; the target's own
# (trivial) self-correlation is removed from the result.
correlation_matrix = df.corr(numeric_only=True)
correlations = correlation_matrix['Calories'].drop('Calories')

# Rank by strength of relationship (absolute value) while keeping the sign.
correlations_sorted = correlations.sort_values(
    key=lambda values: values.abs(),
    ascending=False,
)

# Print results
print("Feature Importance Based on Correlation with 'Calories':\n")
for column_name, coefficient in correlations_sorted.items():
    print(f"{column_name:<30} | Correlation: {coefficient:.4f}")

Feature Importance Based on Correlation with 'Calories':

Calories from Fat              | Correlation: 0.9046
Total Fat                      | Correlation: 0.9044
Total Fat (% Daily Value)      | Correlation: 0.9041
Saturated Fat (% Daily Value)  | Correlation: 0.8476
Saturated Fat                  | Correlation: 0.8456
Protein                        | Correlation: 0.7878
Carbohydrates                  | Correlation: 0.7815
Carbohydrates (% Daily Value)  | Correlation: 0.7812
Sodium (% Daily Value)         | Correlation: 0.7134
Sodium                         | Correlation: 0.7123
Iron (% Daily Value)           | Correlation: 0.6436
Cholesterol                    | Correlation: 0.5964
Cholesterol (% Daily Value)    | Correlation: 0.5952
Dietary Fiber (% Daily Value)  | Correlation: 0.5400
Dietary Fiber                  | Correlation: 0.5389
Trans Fat                      | Correlation: 0.5224
Calcium (% Daily Value)        | Correlation: 0.4284
Sugars                         | Correlat