<a href="https://colab.research.google.com/github/rishimae/ml_kusinaiready/blob/main/ml_kusinaiready.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prep

In [1]:
import pandas as pd


# Location of the raw dishes CSV (a raw.githubusercontent.com URL)
file_url = 'https://raw.githubusercontent.com/rishimae/ml_kusinaiready/refs/heads/main/dishes_dataset.csv'
df = pd.read_csv(file_url)


# Normalize headers: trim whitespace, lowercase, and use underscores for spaces
df.columns = [header.strip().lower().replace(' ', '_') for header in df.columns]


# Report the number of missing entries in each column
print("Missing values:\n", df.isna().sum())


# Re-type the identifier, name, and prep-time columns in one pass:
#   dishid    -> numeric (unparseable entries become missing), stored as nullable Int64
#   dishname  -> surrounding whitespace removed, then title-cased
#   prep_time -> numeric (unparseable entries become missing)
df = df.assign(
    dishid=pd.to_numeric(df['dishid'], errors='coerce').astype('Int64'),
    dishname=df['dishname'].str.strip().str.title(),
    prep_time=pd.to_numeric(df['prep_time'], errors='coerce'),
)


# Clean 'ingre_list': split on commas, trim each ingredient, drop empty
# entries (e.g. from a trailing or doubled comma), sort alphabetically, and
# re-join with ', '. Missing cells are passed through unchanged instead of
# raising AttributeError when `.split` is called on a non-string.
def _normalize_ingredients(raw):
    """Return the comma-separated ingredient string `raw`, trimmed and sorted.

    Non-string input (such as NaN for a missing cell) is returned unchanged.
    """
    if not isinstance(raw, str):
        return raw
    ingredients = (part.strip() for part in raw.split(','))
    return ', '.join(sorted(name for name in ingredients if name))


df['ingre_list'] = df['ingre_list'].apply(_normalize_ingredients)


# Clean 'num_servings': extract lower and upper bounds.
# Accepted forms are "min-max" and a single number; a single number (or a
# column pandas parsed as plain integers) is treated as min == max instead of
# failing on the missing upper bound. The pattern has two capture groups, so
# the result always has two columns (0 = lower, 1 = upper) even when no row
# contains a '-'.
servings_bounds = (
    df['num_servings']
    .astype(str)
    .str.extract(r'^\s*(\d+)\s*(?:-\s*(\d+))?\s*$')
)
# Entries that match neither form are left as NaN by the extract and raise
# ValueError at the integer conversion below, rather than being guessed at.
df['num_servings_min'] = servings_bounds[0].astype(int)
df['num_servings_max'] = servings_bounds[1].fillna(servings_bounds[0]).astype(int)


# Tidy the remaining free-text columns:
#   nutri_guide   -> surrounding whitespace removed
#   skills_needed -> surrounding whitespace removed, then title-cased
df = df.assign(
    nutri_guide=df['nutri_guide'].str.strip(),
    skills_needed=df['skills_needed'].str.strip().str.title(),
)


# Age groups looked for in the 'age_range' column
unique_age_groups = ['Kids', 'Teens', 'Adults', 'Elders']


# One 0/1 indicator column per age group (age_kids, age_teens, ...).
# A row gets 1 when the group name occurs in its 'age_range' text
# (case-sensitive, literal substring match). Rows with a missing
# 'age_range' get 0 instead of raising TypeError.
for age_group in unique_age_groups:
    df[f'age_{age_group.lower()}'] = (
        df['age_range']
        .str.contains(age_group, regex=False, na=False)
        .astype('int64')
    )


# Meal types looked for in the 'meal_type' column
all_meal_types = ['Appetizer', 'Soup', 'Vegetable Dishes', 'Vegetable with Seafood', 'Vegetable with Meat']


# One 0/1 indicator column per meal type; the column name is 'meal_' plus the
# lowercased type with spaces replaced by underscores (e.g. meal_vegetable_dishes).
# A row gets 1 when the type occurs in its 'meal_type' text (case-sensitive,
# literal substring match). Rows with a missing 'meal_type' get 0 instead of
# raising TypeError.
for meal in all_meal_types:
    df[f'meal_{meal.lower().replace(" ", "_")}'] = (
        df['meal_type']
        .str.contains(meal, regex=False, na=False)
        .astype('int64')
    )


# The source columns have been expanded into derived columns above
# (meal_* flags, age_* flags, num_servings_min/max), so remove them.
df.drop(columns=['meal_type', 'age_range', 'num_servings'], inplace=True)


# Show the cleaned dataset
print("\nCleaned dataset:\n", df)


# Persist the cleaned dataset as CSV, without the index column
cleaned_file_path = 'cleaned_dishes.csv'
df.to_csv(cleaned_file_path, index=False)





Missing values:
 dishid           0
dishname         0
prep_time        0
ingre_list       0
num_servings     0
nutri_guide      0
skills_needed    0
age_range        0
meal_type        0
dtype: int64

Cleaned dataset:
     dishid                  dishname  prep_time  \
0      101     Grilled Chicken Salad         30   
1      102       Spaghetti Bolognese         45   
2      103       Vegetarian Stir Fry         20   
3      104                Beef Tacos         25   
4      105         Pancake Breakfast         15   
5      106     Chicken Alfredo Pasta         40   
6      107              Quinoa Salad         20   
7      108  Bbq Pulled Pork Sandwich         35   
8      109         Vegetable Lasagna         60   
9      110                  Omelette         15   
10     111      Fruit Yogurt Parfait         10   
11     112          Mushroom Risotto         45   
12     113   Grilled Cheese Sandwich         10   
13     114         Tomato Basil Soup         25   
14     115     