In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Note: The following do not work with Python 3.12
import shap
# from ydata_profiling import ProfileReport
import sweetviz as sv

Reproducibility:

In [ ]:
# One fixed seed for the whole notebook so that randomised steps can be repeated.
seed = 2024

# Seed numpy's global random generator. Per the original author's note, pandas,
# statsmodels, matplotlib and ydata-profiling draw their randomness from it.
np.random.seed(seed)

Data Understanding

In [ ]:
# Load the diet table and store its 'Diet' column as a pandas categorical.
diet = pd.read_csv('diet.csv', low_memory=False)
diet = diet.astype({'Diet': 'category'})
diet

In [ ]:
# Box plot of Age per Diet: compares how ages are distributed within each diet group.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=diet, x='Diet', y='Age', palette='pastel', ax=ax)
ax.set_title('Box Plot of Ages by Diet')
ax.set_xlabel('Diet')
ax.set_ylabel('Age')
plt.show()

In [ ]:
# Load the requests table; the 'HighProtein' and 'LowSugar' columns are stored
# as pandas categoricals.
requests = pd.read_csv('requests.csv', low_memory=False)
requests = requests.astype({'HighProtein': 'category', 'LowSugar': 'category'})
requests

In [ ]:
# Correlation heatmap for the request table.
# Only these four columns are correlated; the categorical columns are left out.
request_numeric_names = ['Time', 'HighCalories', 'LowFat', 'HighFiber']
numeric_columns = requests[request_numeric_names]
correlation_matrix = numeric_columns.corr()

# Draw the annotated heatmap on its own figure.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=.5, ax=ax)
ax.set_title('Correlation Heatmap for Request Variables')
plt.show()

In [ ]:
# Load the reviews table (it supplies the 'Rating' and 'Like' columns used below).
reviews = pd.read_csv(
    'reviews.csv',
    low_memory=False,
)
reviews

In [ ]:
# Scatter plot of Rating against Like, to inspect how the two review columns relate.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(reviews['Rating'], reviews['Like'], alpha=0.5, color='green')
ax.set_title('Scatter Plot of Rating vs Like')
ax.set_xlabel('Rating')
ax.set_ylabel('Like')
plt.show()

In [ ]:
# Load the recipes table and rename its 'Name' column to 'RecipeName'.
recipes = pd.read_csv('recipes.csv', low_memory=False)
recipes = recipes.rename(columns={'Name': 'RecipeName'})
recipes

In [ ]:
# Histogram of recipe calories.
# Bin edges run from 0 to 5000 in steps of 200; adjust the range/step as needed.
custom_bins = np.arange(0, 5100, 200)

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(recipes['Calories'], bins=custom_bins, color='skyblue', edgecolor='black')
ax.set_title('Distribution of Calories in Recipes')
ax.set_xlabel('Calories')
ax.set_ylabel('Frequency')

# Put one x tick on every bin edge.
ax.set_xticks(custom_bins)

plt.show()

In [ ]:
# Box plot of CookTime per RecipeCategory: shows how cook times are distributed
# within each recipe category.
fig, ax = plt.subplots(figsize=(14, 8))
sns.boxplot(data=recipes, x='RecipeCategory', y='CookTime', palette='Set2', ax=ax)
ax.set_title('Box Plot of Cook Time by Recipe Category')
ax.set_xlabel('Recipe Category')
ax.set_ylabel('Cook Time')
# Rotate the category labels so they do not overlap.
plt.xticks(rotation=45, ha='right')
plt.show()

In [ ]:
# Join reviews with requests on the (AuthorId, RecipeId) pair; pandas' default
# inner join keeps only pairs present in both tables.
review_request_keys = ['AuthorId', 'RecipeId']
merged_request_review = reviews.merge(requests, on=review_request_keys)
merged_request_review

In [ ]:
# Heatmap of the correlations between selected review and request columns of
# the merged table.
merged_numeric_names = ['Rating', 'Like', 'Time', 'HighCalories', 'LowFat', 'HighFiber']
correlation_matrix = merged_request_review[merged_numeric_names].corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=.5, ax=ax)
ax.set_title('Correlation Heatmap for Merged Data')
plt.show()

Data Joining using common attributes

In [ ]:
# Attach recipe details. The right join keeps every recipe, so recipes without
# a matching review/request row appear with missing review/request values.
merged_recipes_req_review = merged_request_review.merge(
    recipes,
    on=['RecipeId'],
    how='right',
)
merged_recipes_req_review

In [ ]:
# Join the diet table onto the combined review/request/recipe table by author
# (default inner join on the shared 'AuthorId' column).
author_ID = 'AuthorId'
merged_diet_all = diet.merge(merged_recipes_req_review, on=author_ID)
merged_diet_all

Impute the missing values

In [ ]:
# The author found one missing 'Diet' entry and fills it with 'Vegetarian',
# which they identified as the most frequent diet.
diet_filled = merged_diet_all['Diet'].fillna('Vegetarian')
merged_diet_all['Diet'] = diet_filled

In [ ]:
# Impute missing RecipeId values with the most frequent RecipeId among rows
# that share the same (Age, Diet) combination.


def _most_common(values):
    """Return the first mode of *values*, or NaN when there is no non-missing value."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# One row per (Age, Diet) combination that actually occurs; observed=True stops
# pandas from also building rows for unused categories of the categorical 'Diet'.
helper_df = (
    merged_diet_all
    .groupby(['Age', 'Diet'], observed=True)['RecipeId']
    .agg(_most_common)
    .reset_index()
)
helper_df.columns = ['Age', 'Diet', 'Most Common Recipe']


def impute_recipe(row):
    """Return the row's RecipeId, or the most common one for its (Age, Diet) if missing.

    Returns NaN when the RecipeId is missing and `helper_df` has no entry for
    the row's (Age, Diet) combination, instead of raising IndexError.
    """
    if not pd.isnull(row['RecipeId']):
        return row['RecipeId']
    same_group = (helper_df['Age'] == row['Age']) & (helper_df['Diet'] == row['Diet'])
    candidates = helper_df.loc[same_group, 'Most Common Recipe']
    return candidates.iloc[0] if not candidates.empty else np.nan


merged_diet_all['RecipeId'] = merged_diet_all.apply(impute_recipe, axis=1)
merged_diet_all

In [ ]:
# Fill the remaining missing request attributes in `merged_diet_all` by looking
# them up in `requests`, keyed by RecipeId.
#
# Only entries that are actually missing are filled. Request values belong to
# an (AuthorId, RecipeId) pair (that is the key used when reviews and requests
# were merged), so overwriting every row with a per-RecipeId lookup would
# replace a row's own request values with those of another author.

# Per-recipe lookup dicts; when a RecipeId occurs several times in `requests`,
# to_dict() keeps the last occurrence.
requests_by_recipe = requests.set_index('RecipeId')
map_time = requests_by_recipe['Time'].to_dict()
map_calories = requests_by_recipe['HighCalories'].to_dict()
map_protein = requests_by_recipe['HighProtein'].to_dict()
map_fat = requests_by_recipe['LowFat'].to_dict()
map_sugar = requests_by_recipe['LowSugar'].to_dict()
map_fiber = requests_by_recipe['HighFiber'].to_dict()

request_lookups = {
    'Time': map_time,
    'HighCalories': map_calories,
    'HighProtein': map_protein,
    'LowFat': map_fat,
    'LowSugar': map_sugar,
    'HighFiber': map_fiber,
}

# Back-fill each request column only where its value is missing.
for column, lookup in request_lookups.items():
    missing = merged_diet_all[column].isna()
    if missing.any():
        merged_diet_all.loc[missing, column] = (
            merged_diet_all.loc[missing, 'RecipeId'].map(lookup)
        )
merged_diet_all

In [ ]:
# Re-derive every recipe attribute in `merged_diet_all` from `recipes`, keyed by
# RecipeId, so rows whose RecipeId was imputed carry that recipe's details.
#
# Each column is looked up under its own name. In particular 'FatContent' is
# taken from the recipes 'FatContent' column (it was previously filled from the
# requests 'LowFat' lookup by mistake).
recipes_by_id = recipes.set_index('RecipeId')

recipe_columns = [
    'RecipeName',
    'CookTime',
    'PrepTime',
    'RecipeCategory',
    'RecipeIngredientQuantities',
    'RecipeIngredientParts',
    'Calories',
    'FatContent',
    'SaturatedFatContent',
    'CholesterolContent',
    'SodiumContent',
    'CarbohydrateContent',
    'FiberContent',
    'SugarContent',
    'ProteinContent',
    'RecipeServings',
    'RecipeYield',
]

# Apply the per-recipe lookups to `merged_diet_all`. to_dict() keeps the last
# entry if a RecipeId appears more than once in `recipes`.
recipe_ids = merged_diet_all['RecipeId']
for column in recipe_columns:
    merged_diet_all[column] = recipe_ids.map(recipes_by_id[column].to_dict())

In [ ]:
# Rows without a rating get 0 as a placeholder value.
# (A former self-assignment of 'TestSetId' had no effect and was removed.)
merged_diet_all['Rating'] = merged_diet_all['Rating'].fillna(0)
merged_diet_all

Balance the proportion of True and False values in the `Like` column

In [ ]:
import pandas as pd
import numpy as np

# Build `df_balanced`: a frame with equally many Like == True and Like == False
# rows, obtained by randomly down-sampling the larger class. Rows whose 'Like'
# is missing belong to neither class and are not carried over.
df = merged_diet_all

liked_mask = df['Like'].eq(True)
disliked_mask = df['Like'].eq(False)
n_true = int(liked_mask.sum())
n_false = int(disliked_mask.sum())

# Share of True / False among the labelled rows (informational only).
proportion_true = df['Like'].mean()
proportion_false = 1 - proportion_true

# Index labels of each class.
true_indices = df[liked_mask].index
false_indices = df[disliked_mask].index

# Both classes are reduced to the size of the smaller one. The count is taken
# directly instead of via int(n_true / proportion_true) - n_false, which is the
# same number computed through float division and could truncate to one less.
num_false_to_sample = min(n_true, n_false)

if n_false >= n_true:
    # Usual case: keep every True row and a random subset of the False rows.
    indices_to_sample = np.random.choice(false_indices, size=num_false_to_sample, replace=False)
    df_balanced = pd.concat([df[liked_mask], df.loc[indices_to_sample]])
else:
    # True is the majority: sample it down instead of failing on an
    # impossible sample size.
    indices_to_sample = np.random.choice(true_indices, size=num_false_to_sample, replace=False)
    df_balanced = pd.concat([df.loc[indices_to_sample], df[disliked_mask]])

# Shuffle the rows so the two classes are interleaved.
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Report the class proportions of the balanced frame.
print(df_balanced['Like'].value_counts(normalize=True))

Train a Gradient Boosting Classifier

In [ ]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import balanced_accuracy_score

# Train a gradient boosting classifier on the balanced data and report the
# balanced accuracy on an 80/20 train/test split.
# Author's scratch note from an earlier run: "72.7 WITH 2000 ADD".
df = df_balanced

# Feature columns and target. Each feature is listed once: 'RecipeCategory'
# used to appear twice, which duplicated its one-hot columns.
features = [
    'Diet', 'Age', 'Rating', 'Time',
    'HighCalories', 'HighProtein', 'LowFat', 'LowSugar', 'HighFiber',
    'CookTime', 'PrepTime', 'RecipeCategory',
    'Calories', 'FatContent', 'SaturatedFatContent', 'CholesterolContent',
    'SodiumContent', 'CarbohydrateContent', 'FiberContent', 'SugarContent',
    'ProteinContent',
]
target = 'Like'

# Keep only labelled rows; work on a copy so `df_balanced` is not modified.
train_data = df.dropna(subset=[target]).copy()

# Make sure the target is boolean.
train_data['Like'] = train_data['Like'].astype(bool)

# Feature matrix and target vector.
X = train_data[features]
y = train_data['Like']

# One-hot encode the categorical features.
X = pd.get_dummies(X)

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Gradient boosting model; every hyperparameter below can be adjusted.
model = GradientBoostingClassifier(
    n_estimators=100,
    random_state=42,
    learning_rate=0.1,
    max_depth=3,
    min_samples_split=2,
    min_samples_leaf=1,
    subsample=1.0,
)

# Fit on the training split.
model.fit(X_train, y_train)

# Balanced accuracy on the training split.
y_train_pred = model.predict(X_train)
balanced_accuracy_train = balanced_accuracy_score(y_train, y_train_pred)
print(f"Balanced Accuracy on Training Set: {balanced_accuracy_train:.4f}")

# Balanced accuracy on the held-out split.
y_test_pred = model.predict(X_test)
balanced_accuracy_test = balanced_accuracy_score(y_test, y_test_pred)
print(f"Balanced Accuracy on Testing Set: {balanced_accuracy_test:.4f}")

Training a Random Forest Classifier

In [ ]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from itertools import combinations

# Train the random forest classifier (`model_final_rfc`) on the balanced data
# and report its balanced accuracy on an 80/20 train/test split.
df = df_balanced

# Feature columns and target.
features = ['Diet','Age', 'Rating', 'Time', 'CookTime', 'FiberContent', 'ProteinContent', 'SugarContent',  'SodiumContent', 'CholesterolContent', 'SaturatedFatContent', 'FatContent', 'Calories', 'HighFiber', 'HighCalories', 'RecipeCategory', 'HighProtein']
target = 'Like'

# Keep only labelled rows; work on a copy so `df_balanced` is not modified.
train_data = df.dropna(subset=[target]).copy()

# Make sure the target is boolean.
train_data['Like'] = train_data['Like'].astype(bool)

# Not used by this cell; these names are kept only so that any code that still
# refers to them keeps running.
best_combination = None
best_accuracy = 0.0

# Feature matrix and target vector.
X = train_data[features]
y = train_data['Like']

# One-hot encode the categorical features.
X = pd.get_dummies(X)

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random forest with fixed, hand-chosen hyperparameters.
model_final_rfc = RandomForestClassifier(
    n_estimators=100,
    random_state=47,
    max_depth=10,
    min_samples_split=6,
    min_samples_leaf=5,
    max_features='sqrt'
)

# Fit on the training split.
model_final_rfc.fit(X_train, y_train)

# Balanced accuracy on the training split, scored with THIS model's own
# predictions (not predictions left over from an earlier cell).
y_train_pred = model_final_rfc.predict(X_train)
balanced_accuracy_train = balanced_accuracy_score(y_train, y_train_pred)
print(f"Balanced Accuracy on Training Set: {balanced_accuracy_train:.4f}")

# Balanced accuracy on the held-out split.
y_test_pred = model_final_rfc.predict(X_test)
balanced_accuracy_test = balanced_accuracy_score(y_test, y_test_pred)
print(f"Balanced Accuracy on Testing Set: {balanced_accuracy_test:.4f}")

Deploy the model

In [ ]:
# Predict 'Like' for the unlabelled rows with the trained random forest and
# write the predictions to 'predictions_skilled_shark.csv'.
features = ['Diet','Age', 'Rating', 'Time', 'CookTime', 'FiberContent', 'ProteinContent', 'SugarContent',  'SodiumContent', 'CholesterolContent', 'SaturatedFatContent', 'FatContent', 'Calories', 'HighFiber', 'HighCalories', 'RecipeCategory', 'HighProtein','TestSetId','Like']
df = merged_diet_all[features].copy()

# Rows to predict are those without a 'Like' value, ordered by TestSetId.
predict_data = df[df['Like'].isna()]
predict_data = predict_data.sort_values(by='TestSetId', ascending=True)

# 1. One-hot encode the model inputs. 'TestSetId' and 'Like' are bookkeeping
#    columns, not model features, so they are left out before encoding.
model_features = [name for name in features if name not in ('TestSetId', 'Like')]
predict_features = predict_data[model_features]
predict_features_encoded = pd.get_dummies(predict_features)

# 2. Align with the training matrix: same columns in the same order as
#    X_train; dummy columns absent from the prediction data are filled with 0.
predict_features_encoded = predict_features_encoded.reindex(
    columns=X_train.columns, fill_value=0
)

# 3. Predict with the trained model.
predicted_likes = model_final_rfc.predict(predict_features_encoded)

# Pair each prediction with its TestSetId and store both as integers
# (prediction: True -> 1, False -> 0).
results_df = pd.DataFrame({
    'id': predict_data['TestSetId'],
    'prediction': predicted_likes
})
results_df['id'] = results_df['id'].astype(int)
results_df['prediction'] = results_df['prediction'].astype(int)

# Show the first few rows and save the result.
print(results_df.head())
results_df.to_csv('predictions_skilled_shark.csv',index=False)
results_df