In [ ]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Note: The following do not work with Python 3.12
import shap
from ydata_profiling import ProfileReport
import sweetviz as sv

Reproducibility:

In [ ]:
# Fixed seed so that the randomized steps below can be repeated between runs
seed = 2024

# pandas, statsmodels, matplotlib and ydata_profiling rely on numpy's random generator, and thus, we need to set the seed in numpy
np.random.seed(seed)

In [ ]:
# Read the three raw tables in a fixed order. low_memory=False turns off
# pandas' chunked parsing so that column types are inferred from whole columns.
diet, requests, reviews = (
    pd.read_csv(csv_name, low_memory=False)
    for csv_name in ('diet.csv', 'requests.csv', 'reviews.csv')
)

# Show which distinct rating values occur in the reviews
reviews['Rating'].unique()

Convert the object-typed columns to categorical data types

In [ ]:
# Store the diet label and the two request flags with pandas' categorical dtype
diet['Diet'] = diet['Diet'].astype('category')
for flag_column in ('HighProtein', 'LowSugar'):
    requests[flag_column] = requests[flag_column].astype('category')

Joining the data on common attributes

In [ ]:
# Left join: every row of the diet table is kept and the requests that share
# its author id are attached to it
author_ID = 'AuthorId'
merged_diet_requests = diet.merge(requests, how='left', on=author_ID)
merged_diet_requests

In [ ]:
# Persist the merged table. index=False leaves out the DataFrame's row index;
# with the default, to_csv writes it as an unnamed first column, which a plain
# pd.read_csv then loads back as an extra 'Unnamed: 0' column.
# NOTE(review): in notebook order this cell runs before the imputation cell, so
# the file still holds the un-imputed rows — confirm that is intended.
merged_diet_requests.to_csv('merged_diet_requests.csv', index=False)

Impute the missing values

In [ ]:
# Missing diet labels are filled with 'Vegetarian'
merged_diet_requests['Diet'] = merged_diet_requests['Diet'].fillna('Vegetarian')

# Keep only the rows that actually carry a recipe id
has_recipe_id = merged_diet_requests['RecipeId'].notna()
merged_diet_requests_cleaned = merged_diet_requests[has_recipe_id]
merged_diet_requests_cleaned

Merge the cleaned data with the reviews table to include the 'likes' column

In [ ]:
# Inner join: only authors present in both the cleaned diet/request rows and
# the reviews are kept.
# NOTE(review): the join key is AuthorId alone — if reviews also identifies the
# recipe, every request of an author is paired with every review of that
# author; confirm this is the intended granularity.
merged_data = pd.merge(merged_diet_requests_cleaned, reviews, on='AuthorId', how='inner')
merged_data

Data Visualization

Diet

In [ ]:
# load the data
# NOTE(review): this re-reads diet.csv into a separate frame `df`, so the
# category conversion applied to `diet` earlier in the notebook does not carry
# over to it
file_path = "diet.csv"
df = pd.read_csv(file_path)

In [ ]:
# have a look at the data and its attributes
# (prints 10 randomly drawn rows; presumably repeatable because of the numpy
# seed set at the top of the notebook — TODO confirm)
print(df.sample(10))

In [ ]:
# get a general overview over data, check for missing values, etc.
# DataFrame.info() prints its summary itself and returns None, so it is not
# wrapped in print() (that would add a stray "None" line to the output)
df.info()

In [ ]:
# have a look at common statistics of the dataset
print(df.describe())
# box plot of the same frame; the trailing semicolon keeps the notebook from
# echoing the returned plot object
sns.boxplot(df);

In [ ]:
# check the balancing of classes/labels
# (prints the number of rows for each distinct "Diet" value)
print(df.groupby("Diet").size())

In [ ]:
# have a look at the feature distributions with a pairplot,
# as it gives you a good overview over possible outliers
# and a good overview over the data in general

# pairplot for the full data: points are colored by "Diet", and the diagonal
# histograms stack the per-class bars ("multiple": "stack")
sns.pairplot(df, hue="Diet", diag_kind="hist", diag_kws={"multiple" : "stack"});

In [ ]:
# One pairplot per diet class, each drawn in its own color
df_grouped_by_class = df.groupby(by="Diet")

# Pull out the sub-frame of every class (same lookup order as the plots below)
df_omnivore, df_vegetarian, df_vegan = (
    df_grouped_by_class.get_group(diet_name)
    for diet_name in ("Omnivore", "Vegetarian", "Vegan")
)

# Plot color and data for each class label
class_labels = {
    "Omnivore": {"color": "blue", "data": df_omnivore},
    "Vegetarian": {"color": "green", "data": df_vegetarian},
    "Vegan": {"color": "red", "data": df_vegan},
}

for class_i, class_spec in class_labels.items():
    class_color = class_spec["color"]
    class_df = class_spec["data"]
    p = sns.pairplot(
        class_df,
        diag_kind="hist",
        diag_kws={"color": class_color},
        plot_kws={"color": class_color, "label": class_i},
    )
    # Use the class name as the title of the whole figure
    p.fig.suptitle(class_i, y=1.0, size=15)

In [ ]:
# We can also leverage the sweetviz package to get a nice summary report
report = sv.analyze(df)
report.show_notebook()

# We can also leverage the ydata_profiling package to get a nice summary report
profile = ProfileReport(df, title="Diet Data - Summary Report")
profile

Random Forest Classifier

In [ ]:
# Step 1: Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [ ]:
# Step 2: Load cleaned data
def load_cleaned_data(path='merged_diet_requests.csv', target='target_column'):
    """Load a CSV file and split it into features and labels.

    Args:
        path: CSV file to read. Defaults to 'merged_diet_requests.csv'.
        target: Name of the label column. The default 'target_column' is an
            example name; pass the real label column of the file.

    Returns:
        A tuple ``(X, y)`` where ``X`` is the loaded frame without the label
        column and ``y`` is the label column.

    Raises:
        KeyError: If ``target`` is not a column of the loaded file.
    """
    data = pd.read_csv(path)

    # Fail with a message that names the file and the columns it does have,
    # rather than with the bare lookup error from drop()
    if target not in data.columns:
        raise KeyError(
            f"label column {target!r} not found in {path!r}; "
            f"available columns: {list(data.columns)}"
        )

    X = data.drop(columns=target)
    y = data[target]
    return X, y

# Run the loader and keep the feature frame and the labels at notebook level.
# NOTE(review): the loader splits off a column called 'target_column', which
# looks like a placeholder name — confirm the real label column before running.
X, y = load_cleaned_data()
