In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
# Load the dataset
url = "https://raw.githubusercontent.com/salemprakash/EDA/main/Data/indian_food.csv"
df = pd.read_csv(url)

# Load an additional dataset to merge (example: another CSV with additional food details)
url_additional = "https://raw.githubusercontent.com/salemprakash/EDA/main/Data/indian_food.csv"
df_additional = pd.read_csv(url_additional)

# Display first few rows
print("Original Data:")
print(df.head())

Original Data:
             name                                        ingredients  \
0      Balu shahi                    Maida flour, yogurt, oil, sugar   
1          Boondi                            Gram flour, ghee, sugar   
2  Gajar ka halwa       Carrots, milk, sugar, ghee, cashews, raisins   
3          Ghevar  Flour, ghee, kewra, milk, clarified butter, su...   
4     Gulab jamun  Milk powder, plain flour, baking powder, ghee,...   

         diet  prep_time  cook_time flavor_profile   course        state  \
0  vegetarian         45         25          sweet  dessert  West Bengal   
1  vegetarian         80         30          sweet  dessert    Rajasthan   
2  vegetarian         15         60          sweet  dessert       Punjab   
3  vegetarian         15         30          sweet  dessert    Rajasthan   
4  vegetarian         15         40          sweet  dessert  West Bengal   

  region  
0   East  
1   West  
2  North  
3   West  
4   East  


In [None]:
# Drop duplicate rows
df = df.drop_duplicates()

# Fill missing values with appropriate values
df.fillna({"state": "Unknown", "diet": "Unknown"}, inplace=True)

# Convert columns to lowercase
df.columns = df.columns.str.lower()
df_additional.columns = df_additional.columns.str.lower()

# Remove leading/trailing spaces in string columns
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
df_additional = df_additional.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# Convert preparation_time and cook_time to integers (replace 'NA' with 0)
df['prep_time'] = pd.to_numeric(df['prep_time'], errors='coerce').fillna(0).astype(int)
df['cook_time'] = pd.to_numeric(df['cook_time'], errors='coerce').fillna(0).astype(int)

# Add a new column for total cooking time
df['total_time'] = df['prep_time'] + df['cook_time']

# Standardize diet column (e.g., Veg / Non-Veg)
df['diet'] = df['diet'].str.capitalize()

# Concatenate 'name' and 'state' columns into a new column 'dish_location'
df['dish_location'] = df['name'] + " - " + df['course']

# Save transformed data to CSV
df.to_csv("transformed_indian_food.csv", index=False)

# Display transformed data
print("\nTransformed Data:")
print(df.head())


Transformed Data:
             name                                        ingredients  \
0      Balu shahi                    Maida flour, yogurt, oil, sugar   
1          Boondi                            Gram flour, ghee, sugar   
2  Gajar ka halwa       Carrots, milk, sugar, ghee, cashews, raisins   
3          Ghevar  Flour, ghee, kewra, milk, clarified butter, su...   
4     Gulab jamun  Milk powder, plain flour, baking powder, ghee,...   

         diet  prep_time  cook_time flavor_profile   course        state  \
0  Vegetarian         45         25          sweet  dessert  West Bengal   
1  Vegetarian         80         30          sweet  dessert    Rajasthan   
2  Vegetarian         15         60          sweet  dessert       Punjab   
3  Vegetarian         15         30          sweet  dessert    Rajasthan   
4  Vegetarian         15         40          sweet  dessert  West Bengal   

  region  total_time             dish_location  
0   East          70      Balu shahi - des

In [None]:
# Merge with additional dataset on 'name' column
df_merged = pd.merge(df, df_additional, on='name', how='left')

# Save transformed data to CSV
df_merged.to_csv("transformed_indian_food.csv", index=False)

# Display transformed data
print("\nTransformed Data:")
print(df_merged.head())


Transformed Data:
             name                                      ingredients_x  \
0      Balu shahi                    Maida flour, yogurt, oil, sugar   
1          Boondi                            Gram flour, ghee, sugar   
2  Gajar ka halwa       Carrots, milk, sugar, ghee, cashews, raisins   
3          Ghevar  Flour, ghee, kewra, milk, clarified butter, su...   
4     Gulab jamun  Milk powder, plain flour, baking powder, ghee,...   

       diet_x  prep_time_x  cook_time_x flavor_profile_x course_x  \
0  Vegetarian           45           25            sweet  dessert   
1  Vegetarian           80           30            sweet  dessert   
2  Vegetarian           15           60            sweet  dessert   
3  Vegetarian           15           30            sweet  dessert   
4  Vegetarian           15           40            sweet  dessert   

       state_x region_x  total_time             dish_location  \
0  West Bengal     East          70      Balu shahi - dessert   
1  

In [None]:
df['state'].value_counts()

Unnamed: 0_level_0,count
state,Unnamed: 1_level_1
Gujarat,35
Punjab,32
Maharashtra,30
West Bengal,24
-1,24
Assam,21
Tamil Nadu,20
Andhra Pradesh,10
Uttar Pradesh,9
Kerala,8


In [None]:
df.describe()

Unnamed: 0,prep_time,cook_time,total_time
count,255.0,255.0,255.0
mean,31.105882,34.529412,65.635294
std,72.554409,48.26565,92.752636
min,-1.0,-1.0,-2.0
25%,10.0,20.0,30.0
50%,10.0,30.0,50.0
75%,20.0,40.0,60.0
max,500.0,720.0,730.0


In [None]:
# Selecting relevant columns
df_subset = df[['name', 'prep_time', 'cook_time', 'region']]

# Creating a hierarchical index with 'region' as the first level and 'name' as the second
df_subset.set_index(['region', 'name'], inplace=True)

# Display the reshaped DataFrame
print(df_subset.head())

                       prep_time  cook_time
region name                                
East   Balu shahi             45         25
West   Boondi                 80         30
North  Gajar ka halwa         15         60
West   Ghevar                 15         30
East   Gulab jamun            15         40


In [None]:
stacked = df_subset.stack()
stacked

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0
region,name,Unnamed: 2_level_1,Unnamed: 3_level_1
East,Balu shahi,prep_time,45
East,Balu shahi,cook_time,25
West,Boondi,prep_time,80
West,Boondi,cook_time,30
North,Gajar ka halwa,prep_time,15
North,...,...,...
North,Shufta,cook_time,-1
Central,Mawa Bati,prep_time,20
Central,Mawa Bati,cook_time,45
West,Pinaca,prep_time,-1


In [None]:
stacked.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,prep_time,cook_time
region,name,Unnamed: 2_level_1,Unnamed: 3_level_1
,Panjeeri,10,25
,Tandoori Chicken,240,30
-1,Brown Rice,15,25
-1,Kaju katli,10,20
-1,Kheer,10,40
...,...,...,...
West,Turiya Patra Vatana sabji,35,40
West,Undhiyu,25,60
West,Veg Kolhapuri,20,30
West,Vindaloo,10,40


In [None]:
# Creating two Series: one for prep_time and one for cook_time
series_prep = df.set_index('name')['prep_time']
series_cook = df.set_index('name')['cook_time']

# Concatenating them with hierarchical indexing
frame_combined = pd.concat([series_prep, series_cook], keys=['Prep Time', 'Cook Time'])

# Unstacking the DataFrame
result_unstacked = frame_combined.unstack()

# Display the transformed DataFrame
print(result_unstacked.head())


name       Balu shahi  Boondi  Gajar ka halwa  Ghevar  Gulab jamun  Imarti  \
Prep Time          45      80              15      15           15      10   
Cook Time          25      30              60      30           40      50   

name       Jalebi  Kaju katli  Kalakand  Kheer  ...  Pani Pitha  Payokh  \
Prep Time      10          10        20     10  ...          10      -1   
Cook Time      50          20        30     40  ...          20      -1   

name       Prawn malai curry  Red Rice  Shukto  Til Pitha  Bebinca  Shufta  \
Prep Time                 15        -1      10          5       20      -1   
Cook Time                 50        -1      20         30       60      -1   

name       Mawa Bati  Pinaca  
Prep Time         20      -1  
Cook Time         45      -1  

[2 rows x 255 columns]


In [None]:
# Creating a sample DataFrame similar to the example but using the dataset
frame3 = df[['name', 'prep_time']].copy()

# Introducing some duplicate rows for demonstration
frame3 = pd.concat([frame3, frame3.iloc[:3]], ignore_index=True)

print("Before Deduplication:")
print(frame3.head(10))

# Removing duplicates
frame3_deduplicated = frame3.drop_duplicates()

print("\nAfter Deduplication:")
print(frame3_deduplicated.head(10))

Before Deduplication:
             name  prep_time
0      Balu shahi         45
1          Boondi         80
2  Gajar ka halwa         15
3          Ghevar         15
4     Gulab jamun         15
5          Imarti         10
6          Jalebi         10
7      Kaju katli         10
8        Kalakand         20
9           Kheer         10

After Deduplication:
             name  prep_time
0      Balu shahi         45
1          Boondi         80
2  Gajar ka halwa         15
3          Ghevar         15
4     Gulab jamun         15
5          Imarti         10
6          Jalebi         10
7      Kaju katli         10
8        Kalakand         20
9           Kheer         10


In [None]:
frame3_deduplicated.duplicated()

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
...,...
250,False
251,False
252,False
253,False


In [None]:
bins = [118, 125, 135, 160, 200]
category = pd.cut(df['cook_time'], bins)
category

Unnamed: 0,cook_time
0,
1,
2,
3,
4,
...,...
250,
251,
252,
253,


In [None]:
pd.value_counts(category)

  pd.value_counts(category)


Unnamed: 0_level_0,count
cook_time,Unnamed: 1_level_1
"(118, 125]",3
"(125, 135]",0
"(135, 160]",0
"(160, 200]",0
