In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Preprocessing our Data

In [2]:
# Read the recipe dataset from a CSV file located in the working directory.
df = pd.read_csv(filepath_or_buffer='food_recipes.csv')

In [3]:
# Preview the first rows to check the column names and value formats.
df.head()


Unnamed: 0,recipe_title,url,record_health,vote_count,rating,description,cuisine,course,diet,prep_time,cook_time,ingredients,instructions,author,tags,category
0,Roasted Peppers And Mushroom Tortilla Pizza Re...,https://www.archanaskitchen.com/roasted-pepper...,good,434,4.958525,is a quicker version pizza to satisfy your cr...,Mexican,Dinner,Vegetarian,15 M,15 M,Tortillas|Extra Virgin Olive Oil|Garlic|Mozzar...,To begin making the Roasted Peppers And Mushro...,Divya Shivaraman,Party Food Recipes|Tea Party Recipes|Mushroom ...,Pizza Recipes
1,Thakkali Gotsu Recipe | Thakkali Curry | Spicy...,https://www.archanaskitchen.com/tomato-gotsu-r...,good,3423,4.932223,also known as the is a quick and easy to ma...,South Indian Recipes,Lunch,Vegetarian,10 M,20 M,Sesame (Gingelly) Oil|Mustard seeds (Rai/ Kadu...,To begin making Tomato Gotsu Recipe/ Thakkali ...,Archana Doshi,Vegetarian Recipes|Tomato Recipes|South Indian...,Indian Curry Recipes
2,Spicy Grilled Pineapple Salsa Recipe,https://www.archanaskitchen.com/spicy-grilled-...,good,2091,4.945959,Spicy Grilled Pineapple Salsa is a simple reci...,Mexican,Side Dish,Vegetarian,10 M,0 M,Extra Virgin Olive Oil|Pineapple|White onion|R...,To begin making the Spicy Grilled Pineapple Sa...,Archana's Kitchen,Party Starter & Appetizer Recipes|Pineapple Re...,Mexican Recipes
3,Karwar Style Dali Thoy Recipe - Toor dal Curry,https://www.archanaskitchen.com/dali-thoy-reci...,good,990,4.888889,The is a quintessential of Konkani dish whic...,Coastal Karnataka,Side Dish,High Protein Vegetarian,5 M,20 M,Arhar dal (Split Toor Dal)|Turmeric powder (Ha...,To prepare Karwar Style Dali Thoy Recipe (Toor...,Jyothi Rajesh,Side Dish Recipes|South Indian Recipes|Indian ...,Indian Curry Recipes
4,Rajma Kofta In Milk And Poppy Seed Gravy Recipe,https://www.archanaskitchen.com/rajma-kofta-in...,good,345,4.828986,Koftas are traditional Indian recipes mostly w...,North Indian Recipes,Side Dish,High Protein Vegetarian,20 M,30 M,Rajma (Large Kidney Beans)|Cashew nuts|Sultana...,To begin making Rajma Kofta In Milk And Poppy ...,RUBY PATHAK,Side Dish Recipes|Indian Lunch Recipes|Office ...,Kofta Recipes


In [4]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8009 entries, 0 to 8008
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   recipe_title   8009 non-null   object 
 1   url            8009 non-null   object 
 2   record_health  8009 non-null   object 
 3   vote_count     8009 non-null   int64  
 4   rating         8009 non-null   float64
 5   description    7994 non-null   object 
 6   cuisine        7943 non-null   object 
 7   course         7854 non-null   object 
 8   diet           7858 non-null   object 
 9   prep_time      7979 non-null   object 
 10  cook_time      7979 non-null   object 
 11  ingredients    7997 non-null   object 
 12  instructions   8009 non-null   object 
 13  author         8009 non-null   object 
 14  tags           7930 non-null   object 
 15  category       8009 non-null   object 
dtypes: float64(1), int64(1), object(14)
memory usage: 1001.3+ KB


In [5]:
# Summary statistics; with default arguments only the numeric columns are described.
df.describe()

Unnamed: 0,vote_count,rating
count,8009.0,8009.0
mean,2268.004495,4.888621
std,3683.15657,0.077467
min,15.0,3.175705
25%,494.0,4.865031
50%,1050.0,4.900553
75%,2487.0,4.93
max,80628.0,5.0


In [6]:
# Count the missing values in every column.
print(df.isna().sum())

recipe_title       0
url                0
record_health      0
vote_count         0
rating             0
description       15
cuisine           66
course           155
diet             151
prep_time         30
cook_time         30
ingredients       12
instructions       0
author             0
tags              79
category           0
dtype: int64


In [7]:
# Discard columns that are mostly empty: a column is kept only when its share
# of missing values is strictly below `threshold`.
threshold = 0.5
df = df.loc[:, df.isna().mean().lt(threshold)]


In [8]:
# Show the per-column missing counts as they stand before any rows are removed.
print(df.isnull().sum())

# Drop every row that lacks a value in any of the columns listed below.
# The result is reassigned instead of using inplace=True, so the cell does not
# mutate a frame object that earlier cells produced.
# NOTE: because these rows are removed here, the fillna steps in the following
# cells can only affect values that become NaN afterwards (for example time
# strings that fail to parse).
df = df.dropna(
    subset=[
        'description',
        'cuisine',
        'course',
        'diet',
        'prep_time',
        'cook_time',
        'ingredients',
        'tags',
    ]
)

recipe_title       0
url                0
record_health      0
vote_count         0
rating             0
description       15
cuisine           66
course           155
diet             151
prep_time         30
cook_time         30
ingredients       12
instructions       0
author             0
tags              79
category           0
dtype: int64


In [9]:
# Assuming df is the DataFrame already loaded with the recipe data

# Step 1: Clean the 'prep_time' and 'cook_time' columns
def convert_to_numeric(value):
    """Convert a time string such as ``'15 M'`` to an integer.

    The ``'M'`` marker is removed, surrounding whitespace is stripped and the
    remainder is parsed with ``int``.

    Parameters
    ----------
    value : str or number or None
        Raw cell value. Strings are parsed; values that are already numeric
        are returned unchanged.

    Returns
    -------
    int or float
        The parsed integer for strings, the input itself for numeric input,
        or ``np.nan`` when the value cannot be parsed (non-numeric text,
        ``None``, or any other object without a ``replace`` method).
    """
    # Pass numbers straight through so that applying this function a second
    # time (e.g. re-running the cell on an already converted column) does not
    # turn every value into NaN. bool is excluded: it is not a time value.
    if isinstance(value, (int, float, np.integer, np.floating)) and not isinstance(value, bool):
        return value
    try:
        return int(value.replace('M', '').strip())
    except (ValueError, AttributeError):
        # ValueError: leftover text is not an integer literal.
        # AttributeError: value has no .replace (e.g. None).
        return np.nan

# Parse the raw time strings; entries that cannot be parsed become NaN.
df['prep_time'] = df['prep_time'].apply(convert_to_numeric)
df['cook_time'] = df['cook_time'].apply(convert_to_numeric)

# Step 2: Fill the remaining gaps in a single pass.
# A chained `df[col].fillna(..., inplace=True)` operates on a temporary Series:
# it is deprecated in pandas 2.x and has no effect on `df` under Copy-on-Write,
# so the filled frame is reassigned instead.
df = df.fillna({
    # Numeric time columns: column mean.
    'prep_time': df['prep_time'].mean(),
    'cook_time': df['cook_time'].mean(),
    # Categorical text column: most frequent value.
    'tags': df['tags'].mode()[0],
    # Free-text columns: empty string.
    'description': "",
    'ingredients': "",
})


In [10]:
# Fill whatever is still missing, column by column, in one pass.
# The result is reassigned because a chained `df[col].fillna(..., inplace=True)`
# works on a temporary Series: it is deprecated in pandas 2.x and leaves `df`
# unchanged under Copy-on-Write.
categorical_columns = ['cuisine', 'course', 'diet', 'tags']
numeric_columns = ['prep_time', 'cook_time']
text_columns = ['description', 'ingredients']

fill_values = {}
# Categorical columns: most frequent value (mode).
fill_values.update({col: df[col].mode()[0] for col in categorical_columns})
# Numerical columns: column mean.
fill_values.update({col: df[col].mean() for col in numeric_columns})
# Free-text columns: empty string.
fill_values.update({col: '' for col in text_columns})

df = df.fillna(fill_values)


In [11]:
# Confirm that no column has missing values left after cleaning.
print(df.isna().sum())


recipe_title     0
url              0
record_health    0
vote_count       0
rating           0
description      0
cuisine          0
course           0
diet             0
prep_time        0
cook_time        0
ingredients      0
instructions     0
author           0
tags             0
category         0
dtype: int64
