# Data Prediction on the food receipes and ratings reviews
Prediction rating on the time of the months 

**Name(s)**: Su Aye 

**Website Link**: https://suaye07.github.io/Prediction-of-recipes-and-ratings/

## Code

In [143]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
pd.options.plotting.backend = 'plotly'
from tabulate import tabulate
import plotly.io as pio
pio.renderers.default = 'iframe'

### Cleaning

In [144]:
raw_interactions = pd.read_csv("RAW_interactions.csv")
raw_receipes = pd.read_csv("RAW_recipes.csv")

In [145]:
raw_interactions.shape

(731927, 5)

In [146]:
raw_receipes.shape

(83782, 12)

In [151]:
#combine two datasets raw_interactions and raw_receipes 
combined = pd.merge(raw_interactions, raw_receipes, left_on="recipe_id", right_on="id").drop('id', axis=1)
combined['rating'] = combined['rating'].apply(lambda x: np.nan if x == "0" else x)
combined.dropna(subset=['rating'], inplace=True)

In [152]:
combined_cleaned = combined.dropna(subset=['rating'])
average_rating_per_recipe = combined_cleaned.groupby(['recipe_id', 'name'])['rating'].mean()

In [153]:
cleaned_df = pd.merge(combined, average_rating_per_recipe, left_on="recipe_id", 
                      right_on="recipe_id")
cleaned_df = cleaned_df.drop('rating_x', axis=1)
cleaned_df = cleaned_df.rename(columns={'rating_y': 'mean_rating'})
cleaned_df = cleaned_df[~cleaned_df['mean_rating'].apply(lambda x: pd.isnull(x))]
cleaned_df = cleaned_df.drop_duplicates(subset='user_id')

In [154]:
cleaned_df = cleaned_df.dropna(subset=None, how='any')
cleaned_df['binary_rating'] = cleaned_df['mean_rating'].apply(lambda x: 0 if x <= 3 else 1)

### Baseline Model

In [130]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle

In [103]:
cleaned_df = cleaned_df.drop(columns=['user_id','recipe_id','date','review','name',
                                      'contributor_id','submitted','nutrition',
                                      'steps','description','ingredients',
                                      'mean_rating'])
cleaned_df.dropna(subset=['minutes'], inplace=True)
cleaned_df.dropna(subset=['tags'], inplace=True)
cleaned_df.dropna(subset=['n_ingredients'], inplace=True)
cleaned_df.dropna(subset=['binary_rating'], inplace=True)
cleaned_df = cleaned_df.reset_index(drop=True)
cleaned_df

Unnamed: 0,minutes,tags,n_steps,n_ingredients,binary_rating
0,40,"['60-minutes-or-less', 'time-to-make', 'course...",4,8,1
1,30,"['30-minutes-or-less', 'time-to-make', 'course...",9,10,1
2,22,"['30-minutes-or-less', 'time-to-make', 'course...",14,14,1
3,22,"['30-minutes-or-less', 'time-to-make', 'course...",14,14,1
4,40,"['60-minutes-or-less', 'time-to-make', 'course...",7,12,1
...,...,...,...,...,...
67176,280,"['main-ingredient', 'preparation', 'for-1-or-2...",15,27,0
67177,280,"['main-ingredient', 'preparation', 'for-1-or-2...",15,27,0
67178,5,"['15-minutes-or-less', 'time-to-make', 'course...",4,6,1
67179,5,"['15-minutes-or-less', 'time-to-make', 'course...",4,6,1


In [13]:
X = cleaned_df.drop(columns=['binary_rating'])
y = cleaned_df['binary_rating']

X_train, X_test, y_train, y_test = train_test_split(
    X,y, test_size=0.2, random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', StandardScaler(), ['n_steps','minutes']),
    ])

pipeline = Pipeline(steps=[('preprocessor', preprocessor), 
                           ('rfc', RandomForestClassifier())
])

In [14]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric', StandardScaler(),
                                                  ['n_steps', 'minutes'])])),
                ('rfc', RandomForestClassifier())])

In [15]:
y_pred = pipeline.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8613529805760214


### Final Model

In [110]:
X = cleaned_df.drop(columns=['binary_rating'])
y = cleaned_df['binary_rating']

X_train, X_test, y_train, y_test = train_test_split(
    X,y, test_size=0.2, random_state=42)

In [120]:
preproc = ColumnTransformer([
    ("cap_gains_pass", QuantileTransformer(), ['n_steps','minutes','n_ingredients']),
    ("one-hot", OneHotEncoder(handle_unknown = 'ignore'), ['tags'])
])
pipeline = Pipeline([
    ('preprocesser', preproc),
    ('rfc', RandomForestClassifier())
])

In [121]:
params= {
    'rfc__n_estimators': [10, 20],  
    'rfc__max_depth': [None, 5],
    'rfc__min_samples_split': [2, 5]  
}
searcher = GridSearchCV(pl, params, cv=5, n_jobs=-1)
searcher.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocesser',
                                        ColumnTransformer(transformers=[('cap_gains_pass',
                                                                         QuantileTransformer(),
                                                                         ['n_steps',
                                                                          'minutes',
                                                                          'n_ingredients']),
                                                                        ('one-hot',
                                                                         OneHotEncoder(handle_unknown='ignore'),
                                                                         ['tags'])])),
                                       ('rfc', RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'rfc__max_depth': [None, 5],
                         'rfc__min_sa

In [122]:
pipeline.fit(X_train, y_train)
final_accuracy = pipeline.score(X_test, y_test)
print(f"Accuracy: {final_accuracy}")

Accuracy: 0.930043908610553


In [114]:
searcher.best_params_

{'rfc__max_depth': None, 'rfc__min_samples_split': 2, 'rfc__n_estimators': 20}

### Fairness Analysis

In [124]:
binary_count = cleaned_df['binary_rating'].value_counts()
df = pd.DataFrame(binary_count)
binary_df = df.rename(columns={'binary_rating': 'Count'})
binary_df

Unnamed: 0,Count
1,57495
0,9686


In [142]:
fig = go.Figure(data=[go.Pie(labels=binary_df.index, values=binary_df['Count'])])
fig.update_layout(title='Count of Binary Ratings<br>(1 being rating{4 and 5},0 being rating{0,1,2,3})')
fig.show()
fig.write_html('count_binary_rating.html')

For my model, I will be using permutation test analysis to run observed values and p-value to test the model.

Null Hypothesis: Model is fair! There is no significant association between the features 'minutes', 'n_steps', and 'n_ingredients' and the binary rating 0s and 1s. 

Alternative Hypothesis: There is a significant association between the features 'minutes', 'n_steps', and 'n_ingredients' and the binary rating 0s and 1s. 

In [133]:
X = cleaned_df[['minutes', 'n_steps', 'n_ingredients']]
y = cleaned_df['binary_rating']

model = LogisticRegression()
model.fit(X, y)
observed_score = model.score(X, y)

num_permutations = 1000
permutation_scores = []

for _ in range(num_permutations):
    shuffled_labels = shuffle(y)
    model.fit(X, shuffled_labels)
    permutation_score = model.score(X, shuffled_labels)
    permutation_scores.append(permutation_score)

p_value = (np.sum(permutation_scores >= observed_score) + 1) / (num_permutations + 1)

print(f"Observed value: {observed_score}")
print(f"P-value: {p_value}")

Observed value: 0.8558818713624388
P-value: 0.000999000999000999


### Extra Code for website

In [104]:
print(cleaned_df.head().to_markdown(index=True))

|    |   minutes | tags                                                                                                                                                                                                                                                                                                                                                                                                    |   n_steps |   n_ingredients |   binary_rating |
|---:|----------:|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------:|----------------:|----------------:|
|  0 |        40 | ['60-minutes-or-less', 'time-to-make', 'course', 'main-ingr

In [126]:
print(binary_df.head().to_markdown(index=True))

|    |   Count |
|---:|--------:|
|  1 |   57495 |
|  0 |    9686 |
