In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, LinearRegression, LassoLarsCV, BayesianRidge, SGDClassifier, SGDRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, accuracy_score,mean_squared_error,r2_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
import warnings
from sklearn.metrics import r2_score, mean_squared_error 
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint,uniform

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV

ModuleNotFoundError: No module named 'xgboost'

In [None]:
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    accuracy = accuracy_score(y_test, y_pred)
    print(f"RMSE:{rmse}")
    return accuracy

In [None]:
train_path = '/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/train.csv'
test_path = '/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/test.csv'

# **Loading the Data**

In [None]:
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

# **Exploratory Data Analysis**

> ## Let's have a look at the data

In [None]:
print("Train Data:", df_train.shape)
print("Test Data:", df_test.shape)
print()
print(df_train.info())
print(df_test.info())
print()
df_train.head()

## Key Statistics

In [None]:
df_train.describe().transpose()

## **Observations**
* The Data has 14 Features and the Target variable is 'Rating'
* There are 5 categorical features and 9 numerical features
* 'CreationTimestamp' will be converted into datetime object and will be used to extract further info such as 'Hour'
* Rating has labels 0-5

### **Checking for Missing Values**

In [None]:
nan_mask = df_train.isna()

# Print the positions of NaN values
for column in nan_mask.columns:
    nan_positions = nan_mask[column][nan_mask[column]].index
    for position in nan_positions:
        print(f"NaN value found at row {position} in column {column}")

In [None]:
nan_mask_test = df_test.isna()

# Print the positions of NaN values
for column in nan_mask_test.columns:
    nan_positions = nan_mask_test[column][nan_mask_test[column]].index
    for position in nan_positions:
        print(f"NaN value found at row {position} in column {column}")

In [None]:
df_train['Rating'].value_counts()

## **Data Visualization**

In [None]:
df_train.hist(figsize=(28,22))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

cols = [col for col in df_train.columns if col not in ['Rating', 'ReplyCount'] and df_train[col].dtype in ['int64', 'float64']]
num_rows = len(cols) // 2 + len(cols) % 2
num_cols = min(2, len(cols))  # Ensure not to create more subplots than the number of columns

fig, axes = plt.subplots(num_rows, num_cols, figsize=(12, 4 * num_rows))

axes = axes.flatten()

for i, col_name in enumerate(cols):
    sns.boxplot(data=df_train, x='Rating', y=col_name, ax=axes[i], flierprops=dict(marker='o', markersize=4))
    axes[i].set_title(f'{col_name} distribution for each Rating')
    axes[i].set_xlabel('Rating')
    axes[i].set_ylabel(col_name)

# Remove empty subplots
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt

df = df_train.copy()
df['Hour'] = pd.to_datetime(df['CreationTimestamp'], unit='s').dt.hour + 1  # Add 1 for 1-based indexing

night_hours = range(0, 5)
morning_hours = range(4, 12)
afternoon_hours = range(11, 17)
evening_hours = range(16, 24)


time_periods = {
    'Morning': morning_hours,
    'Afternoon': afternoon_hours,
    'Evening': evening_hours,
    'Night': night_hours
}
cols = ['Rating']

for col in cols:
    plt.figure(figsize=(12, 8))
    for period_name, hours in time_periods.items():
        period_data = df[df['Hour'].isin(hours)]
        avg_data = period_data.groupby('Hour')[col].mean()
        plt.plot(avg_data.index, avg_data.values, label=period_name)

    plt.title(f'Time Series Plot: Average {col} by Time Period')
    plt.xlabel('Hour of Day')
    plt.ylabel(col)
    plt.grid(True)
    plt.legend()
    plt.xticks(rotation=45)  # Optional: Rotate x-axis labels for readability
    plt.show()


In [None]:
numerical_cols = [col for col in df_train.columns if col != 'Rating' and df_train[col].dtype in ['int64', 'float64']]

# Create a color-coded heatmap with annotations
plt.figure(figsize=(12, 8))
sns.heatmap(df_train[numerical_cols].corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix (Numerical Features)', fontsize=16)
plt.xlabel('Features', fontsize=14)
plt.ylabel('Features', fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.yticks(rotation=0, fontsize=12)
plt.tight_layout()
plt.show()

# CHECKING FOR ANY CORRELATION BETWEEN THE NUMERICAL FEATURES

## **Observations**
* No significant correlation between categorical variables
* Huge spike in Rating during Morning
* The majority of data points appear to cluster around Ratings 0 and 5.
* There's a severe class imbalance in the Target Variable 


# Data Preprocessing

> ## Feature Engineering/Selection

In [None]:
duplicate_rows = df_train[df_train.duplicated('Recipe_Review')]
duplicate_rows.head()

In [None]:
# DROPPING DUPLICATE VALUES FROM THE TRAIN SET BASED ON RECIPE_REVIEW
df_train.drop_duplicates('Recipe_Review', inplace=True)
df_train.reset_index(drop=True, inplace=True)

In [None]:
# DROPPING CATEGORICAL AND UNECESSARY COLUMNS, ROWS WITH NAN VALUES
coldrop = ['UserID', 'CommentID', 'RecipeName', 'UserName']
df_train.dropna(inplace=True)
df_train1 = df_train.drop(coldrop, axis=1)
df_test1 = df_test.drop(coldrop, axis=1)

In [None]:
# CONVERTING CREATIONTIMESTAMP TO HOUR OF THE DAY
df_train1['Hour'] = pd.to_datetime(df_train1['CreationTimestamp'], unit='s').dt.hour + 1
df_test1['Hour'] = pd.to_datetime(df_test1['CreationTimestamp'], unit='s').dt.hour + 1

> ### **Feature Engineering on Recipe_Review Column to Extract Info**

In [None]:
emoticons = [':)', ':D', ':-)', ':-D', '(:', 'D:', ':(', '):', ':/', ':\\']
emoticons_pattern = '|'.join(map(re.escape, emoticons))
df_train1['Emoticons_Count'] = df_train1['Recipe_Review'].str.count(emoticons_pattern)
df_test1['Emoticons_Count'] = df_test1['Recipe_Review'].str.count(emoticons_pattern)

df_train1['Exclamation_Count'] = df_train1['Recipe_Review'].str.count('!')
df_test1['Exclamation_Count'] = df_test1['Recipe_Review'].str.count('!')

df_train1['Exclamation_Count'] = df_train1['Recipe_Review'].str.count('\?')
df_test1['Exclamation_Count'] = df_test1['Recipe_Review'].str.count('\?')

df_train1['Capital_Letters_Count'] = df_train1['Recipe_Review'].apply(lambda x: sum(1 for c in str(x) if c.isupper()))
df_test1['Capital_Letters_Count'] = df_test1['Recipe_Review'].apply(lambda x: sum(1 for c in str(x) if c.isupper()))

df_train1['Lower_Case_Count'] = df_train1['Recipe_Review'].apply(lambda x: sum(1 for c in str(x) if c.islower()))
df_test1['Lower_Case_Count'] = df_test1['Recipe_Review'].apply(lambda x: sum(1 for c in str(x) if c.islower()))


In [None]:
df_train2 = df_train1.copy()
df_test2 = df_test1.copy()

In [None]:
def preprocess_recipe_review(text):
    # Remove HTML tags, URLs, and email addresses
    text = re.sub('<.*?>|http\S+|www\S+|\S*@\S*\s?', '', str(text))

    # Convert to lowercase
    text = text.lower()

    # Remove non-alphabetic characters, extra spaces, and leading/trailing spaces
    text = re.sub('[^a-zA-Z\']', ' ', text)
    text = re.sub('\s+', ' ', text).strip()

    return text

# Apply preprocessing to the 'Recipe_Review' column in all datasets
datasets = [df_train1, df_test1]

for dataset in datasets:
    dataset['Recipe_Review'] = dataset['Recipe_Review'].astype(str).apply(preprocess_recipe_review)

# Fill missing or empty 'Recipe_Review' values based on the most frequent values within each 'Rating' group
def fill_missing_reviews(df, most_frequent_values):
    return df.apply(lambda row: most_frequent_values[row['Rating']] 
                    if row['Recipe_Review'] in ['', 'nan'] 
                    else row['Recipe_Review'], axis=1)

most_frequent_values = df_train2.groupby('Rating')['Recipe_Review'].agg(lambda x: x.mode().iloc[0])

df_train2['Recipe_Review'] = fill_missing_reviews(df_train2, most_frequent_values)

## **Feature Engineering on Remaining Features**

In [None]:
# ADDING 1 FOR FURTHER DIVISION LATER WHILE FEATURE ENGINEERING
columns_to_increment = ['ThumbsUpCount', 'ThumbsDownCount', 'UserReputation', 'BestScore', 'ReplyCount']
df_train2[columns_to_increment] = df_train2[columns_to_increment].apply(lambda x: x + 1)
df_test2[columns_to_increment] = df_test2[columns_to_increment].apply(lambda x: x + 1)

In [None]:
df_train2['NormalizedRecipeNumber'] = 1 - (df_train2['RecipeNumber'] - df_train2['RecipeNumber'].min()) / (df_train2['RecipeNumber'].max() - df_train2['RecipeNumber'].min())
df_test2['NormalizedRecipeNumber'] = 1 - (df_test2['RecipeNumber'] - df_test2['RecipeNumber'].min()) / (df_test2['RecipeNumber'].max() - df_test2['RecipeNumber'].min())


In [None]:
def create_time_of_day_influence(df):
    conditions = [
        (df['Hour'] >= 17) & (df['Hour'] < 24),
        (df['Hour'] >= 0) & (df['Hour'] < 5),
        (df['Hour'] >= 12) & (df['Hour'] < 17),
        (df['Hour'] >= 5) & (df['Hour'] < 12)
    ]

    values = [0.8, 1, 1, 0.4]

    df['TimeOfDay_Influence'] = np.select(conditions, values, default=0)

create_time_of_day_influence(df_train2)
create_time_of_day_influence(df_test2)

In [None]:
Average_UserReputation = df_train2['UserReputation'].mean()
Average_UserReputation_test = df_test2['UserReputation'].mean()
df_train2['UserReputation_Normalized'] = df_train2['UserReputation'] / Average_UserReputation
df_test2['UserReputation_Normalized'] = df_test2['UserReputation'] / Average_UserReputation_test


In [None]:
df_train2['Weighted_SocialInfluence'] = 0.4 * df_train2['NormalizedRecipeNumber'] + 0.6 * df_train2['TimeOfDay_Influence']
df_test2['Weighted_SocialInfluence'] = 0.4 * df_test2['NormalizedRecipeNumber'] + 0.6 * df_test2['TimeOfDay_Influence']

df_train2['PositiveInteractionRatio'] = ((df_train2['TimeOfDay_Influence']) * (df_train2['ThumbsUpCount'])) / (df_train2['ThumbsUpCount'] + df_train2['ThumbsDownCount'])
df_test2['PositiveInteractionRatio'] = ((df_test2['TimeOfDay_Influence']) * (df_test2['ThumbsUpCount'])) / (df_test2['ThumbsUpCount'] + df_test2['ThumbsDownCount'])

df_train2['AdjustedSentimentPolarity'] = ((df_train2['ThumbsUpCount'] - df_train2['ThumbsDownCount']) / df_train2['ReplyCount']).apply(lambda x: min(max(x, -20), 45))
df_test2['AdjustedSentimentPolarity'] = ((df_test2['ThumbsUpCount'] - df_test2['ThumbsDownCount']) / df_test2['ReplyCount']).apply(lambda x: min(max(x, -20), 45))


In [None]:
recipe_code_count = df_train2.groupby('RecipeCode').size().reset_index(name='RecipeCode_Count')
df_train2 = pd.merge(df_train2, recipe_code_count, on='RecipeCode', how='left')
df_test2 = pd.merge(df_test2, recipe_code_count, on='RecipeCode', how='left')

In [None]:
coldrop2 = ['Hour','RecipeNumber', 'ReplyCount', 'ThumbsDownCount', 'ThumbsUpCount', 'UserReputation', 'BestScore']
df_train2.drop(coldrop2, axis=1, inplace=True)
df_test2.drop(coldrop2, axis=1, inplace=True)

## Splitting the Data into Train and Validation Set

In [None]:
train, val = train_test_split(df_train2, test_size=0.01, random_state=42)

In [None]:
vect = CountVectorizer(stop_words='english', max_features=250)

A1 = vect.fit_transform(train['Recipe_Review'])
feature_name = vect.get_feature_names_out()
train_count = pd.DataFrame(A1.toarray(), columns=feature_name)
train_count.index = train.index
train = pd.concat([train, train_count], axis=1)

A2 = vect.transform(val['Recipe_Review'])
feature_name_v = vect.get_feature_names_out()
val_count = pd.DataFrame(A2.toarray(), columns=feature_name_v)
val_count.index = val.index
val = pd.concat([val, val_count], axis=1)

A3 = vect.transform(df_test2['Recipe_Review'])
feature_name_te = vect.get_feature_names_out()
test_count = pd.DataFrame(A3.toarray(), columns=feature_name_te)
test_count.index = df_test2.index
df_test2 = pd.concat([df_test2, test_count], axis=1)

In [None]:
X_train_model = train.drop(['Rating', 'Recipe_Review'], axis=1)
y_train_model = train['Rating']

X_val_model = val.drop(['Rating', 'Recipe_Review'], axis=1)
y_val_model = val['Rating']

In [None]:
X_val_model

In [None]:
numeric_features = ['RecipeCode_Count', 'Capital_Letters_Count', 'Lower_Case_Count']       

preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', StandardScaler(), numeric_features),
    ],
    remainder='passthrough'
)

> ## **Model 1**

In [None]:
from sklearn.ensemble import RandomForestClassifier

random_forest_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))  # You can adjust hyperparameters here
])

random_forest_pipeline.fit(X_train_model, y_train_model)

random_forest_predictions = random_forest_pipeline.predict(X_val_model)

random_forest_accuracy = accuracy_score(y_val_model, random_forest_predictions)
print("Accuracy for RandomForestClassifier:", random_forest_accuracy)

> ## Hyperparameter Tuning for Model 1

In [None]:
param_dist = {
    'classifier__n_estimators': randint(50, 300),
    'classifier__max_depth': [None, 5, 10, 15, 20, 30],
    'classifier__min_samples_split': [2, 5, 10, 15],
    'classifier__min_samples_leaf': [1, 2, 4, 8],
    'classifier__max_features': [1, 0.5, 'sqrt', 'log2', None],
    'classifier__bootstrap': [True, False],
}
random_search = RandomizedSearchCV(random_forest_pipeline, param_distributions=param_dist, n_iter=5, cv=5, scoring='accuracy', random_state=42)

random_search.fit(X_train_model, y_train_model)

print("Best Hyperparameters:", random_search.best_params_)

tuned_random_forest_predictions = random_search.best_estimator_.predict(X_val_model)

tuned_random_forest_accuracy = accuracy_score(y_val_model, tuned_random_forest_predictions)
print("Accuracy for Tuned RandomForestClassifier:", tuned_random_forest_accuracy)

Best Hyperparameters: {'classifier__bootstrap': False, 'classifier__max_depth': 20, 'classifier__max_features': 0.5, 'classifier__min_samples_leaf': 8, 'classifier__min_samples_split': 15, 'classifier__n_estimators': 207}

Accuracy for Tuned RandomForestClassifier: 0.775243081525804

> ## **Model 2**

In [None]:
knn_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier())
])

knn_pipeline.fit(X_train_model, y_train_model)

knn_predictions = knn_pipeline.predict(X_val_model)

knn_accuracy = accuracy_score(y_val_model, knn_predictions)
print("Accuracy for KNN:", knn_accuracy)

## Hyperparameter Tuning for Model 2

In [None]:
knn_param_dist = {
    'classifier__n_neighbors': [3, 5, 8, 10, 15, 20],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__p': [1, 2],
    'classifier__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'classifier__leaf_size': [10, 20, 30, 40, 50],
    'classifier__metric': ['euclidean', 'manhattan', 'minkowski'],
}

knn_random_search = RandomizedSearchCV(knn_pipeline, knn_param_dist, n_iter=5, cv=5, scoring='accuracy', n_jobs=-1, random_state=42)
knn_random_search.fit(X_train_model, y_train_model)

best_knn_model = knn_random_search.best_estimator_

knn_predictions = best_knn_model.predict(X_val_model)

knn_accuracy = accuracy_score(y_val_model, knn_predictions)
print("Accuracy for KNN:", knn_accuracy)

print("Best Hyperparameters for KNN:", knn_random_search.best_params_)


Best Hyperparameters for KNN: {'classifier__weights': 'uniform', 'classifier__p': 2, 'classifier__n_neighbors': 20, 'classifier__metric': 'manhattan', 'classifier__leaf_size': 10, 'classifier__algorithm': 'brute'}

Accuracy for KNN: 0.7611940298507462

> ## **Model 3**

In [None]:
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier())
])

# Fit the pipeline on training data
xgb_pipeline.fit(X_train_model, y_train_model)

# Make predictions on validation data
xgb_predictions = xgb_pipeline.predict(X_val_model)

# Calculate and print accuracy for XGBoost
xgb_accuracy = accuracy_score(y_val_model, xgb_predictions)
print("Accuracy for XGBoost:", xgb_accuracy)

## Hyperparameter Tuning for Model 3

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

xgb_param_dist = {
    'classifier__objective': ['multi:softmax'],
    'classifier__num_class': [6],
    'classifier__learning_rate': uniform(0.01, 0.2 - 0.01),
    'classifier__max_depth': randint(3, 7),
    'classifier__n_estimators': randint(50, 201),
    'classifier__gamma': uniform(0, 0.2),
    'classifier__subsample': uniform(0.8, 1.0 - 0.8),
    'classifier__colsample_bytree': uniform(0.8, 1.0 - 0.8),
    'classifier__min_child_weight': randint(1, 4),
}

xgb_random_search = RandomizedSearchCV(xgb_pipeline, xgb_param_dist, n_iter=10, cv=5, scoring='accuracy', n_jobs=-1, random_state=42)

xgb_random_search.fit(X_train_model, y_train_model)

best_xgb_model_random = xgb_random_search.best_estimator_

xgb_predictions_random = best_xgb_model_random.predict(X_val_model)

xgb_accuracy_random = accuracy_score(y_val_model, xgb_predictions_random)

print("Accuracy for XGBoost with RandomizedSearchCV:", xgb_accuracy_random)
print("Best Hyperparameters for XGBoost (RandomizedSearchCV):", xgb_random_search.best_params_)

Accuracy for XGBoost with RandomizedSearchCV: 0.774869109947644
Best Hyperparameters for XGBoost (RandomizedSearchCV): {'classifier__colsample_bytree': 0.8749080237694725, 'classifier__gamma': 0.19014286128198324, 'classifier__learning_rate': 0.14907884894416698, 'classifier__max_depth': 3, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 152, 'classifier__num_class': 6, 'classifier__objective': 'multi:softmax', 'classifier__subsample': 0.8891665505707183}

# Model 4 (Final Model)

## Preprocessing for the Final Model

In [None]:
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [None]:
# train_df.drop_duplicates('Recipe_Review', inplace=True)
train_df = train_df.drop_duplicates(['Recipe_Review'])

train_df.reset_index(drop=True, inplace=True)

In [None]:
positive_emoticons = [':)', ':D', ':-)', ':-D', '(:', 'D:']
negative_emoticons = [':(', '):', ':/', ':\\']
emoticons = positive_emoticons + negative_emoticons
emoticons_pattern = '|'.join(map(re.escape, emoticons))
train_df['Emoticons_Count'] = train_df['Recipe_Review'].str.count(emoticons_pattern)
train_df['Exclamation_Count'] = train_df['Recipe_Review'].str.count('!')
train_df['Question_Count'] = train_df['Recipe_Review'].str.count('\?')
train_df['Capital_Letters_Count'] = train_df['Recipe_Review'].apply(lambda x: sum(1 for c in str(x) if c.isupper()))
train_df['Lower_Case_Count'] = train_df['Recipe_Review'].apply(lambda x: sum(1 for c in str(x) if c.islower()))

In [None]:
test_df['Emoticons_Count'] = test_df['Recipe_Review'].str.count(emoticons_pattern)
test_df['Exclamation_Count'] = test_df['Recipe_Review'].str.count('!')
test_df['Question_Count'] = test_df['Recipe_Review'].str.count('\?')
test_df['Capital_Letters_Count'] = test_df['Recipe_Review'].apply(lambda x: sum(1 for c in str(x) if c.isupper()))
test_df['Lower_Case_Count'] = test_df['Recipe_Review'].apply(lambda x: sum(1 for c in str(x) if c.islower()))

In [None]:
train_df['Hour'] = pd.to_datetime(train_df['CreationTimestamp'], unit='s').dt.hour + 1  # Add 1 for 1-based indexing
test_df['Hour'] = pd.to_datetime(test_df['CreationTimestamp'], unit='s').dt.hour + 1  # Add 1 for 1-based indexing

In [None]:
col_drop = ['RecipeName', 'UserID', 'CommentID', 'UserName']
train_df = train_df.drop(columns = col_drop)
test_df = test_df.drop(columns = col_drop)

In [None]:
X =  train_df
y = train_df['Rating']

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
X_train['ThumbsUpCount'] += 1
X_train['ThumbsDownCount'] += 1
X_train['UserReputation'] += 1
X_train['BestScore'] += 1
X_train['ReplyCount'] += 1

X_val['ThumbsUpCount'] += 1
X_val['ThumbsDownCount'] += 1
X_val['UserReputation'] += 1
X_val['BestScore'] += 1
X_val['ReplyCount'] += 1

test_df['ThumbsUpCount'] += 1
test_df['ThumbsDownCount'] += 1
test_df['UserReputation'] += 1
test_df['BestScore'] += 1
test_df['ReplyCount'] += 1


In [None]:
avg_UserReputation = X_train['UserReputation'].mean()

X_train['RecipeNumber_Influence'] = 1 - (X_train['RecipeNumber'] - X_train['RecipeNumber'].min()) / (X_train['RecipeNumber'].max() - X_train['RecipeNumber'].min())
X_train['Hour_Influence'] = 0

X_train.loc[(X_train['Hour'] >= 17) & (X_train['Hour'] < 24), 'Hour_Influence'] = 0.8  # Evening
X_train.loc[(X_train['Hour'] >= 0) & (X_train['Hour'] < 5), 'Hour_Influence'] = 1  # Night
X_train.loc[(X_train['Hour'] >= 12) & (X_train['Hour'] < 17), 'Hour_Influence'] = 1  # Afternoon
X_train.loc[(X_train['Hour'] >= 5) & (X_train['Hour'] < 12), 'Hour_Influence'] = 0.4  # Morning

avg_UserReputation = X_train['UserReputation'].mean()
X_train['User_Influence'] = X_train['UserReputation'] / avg_UserReputation
X_train['Comment_Score'] =  0.4 * X_train['RecipeNumber_Influence'] + 0.6 * X_train['Hour_Influence']
X_train['Positive_Interaction'] = ((X_train['Hour']) * (X_train['ThumbsUpCount'])) / (X_train['ThumbsUpCount'] + X_train['ThumbsDownCount']) 
X_train['Polarity_Sentiment'] = ((X_train['ThumbsUpCount'] - X_train['ThumbsDownCount']) / X_train['ReplyCount']).apply(lambda x: min(max(x, -20), 45))


In [None]:
avg_UserReputation_val = X_val['UserReputation'].mean()

X_val['RecipeNumber_Influence'] = 1 - (X_val['RecipeNumber'] - X_val['RecipeNumber'].min()) / (X_val['RecipeNumber'].max() - X_val['RecipeNumber'].min())
X_val['Hour_Influence'] = 0

X_val.loc[(X_val['Hour'] >= 17) & (X_val['Hour'] < 24), 'Hour_Influence'] = 0.8  # Evening
X_val.loc[(X_val['Hour'] >= 0) & (X_val['Hour'] < 5), 'Hour_Influence'] = 1  # Night
X_val.loc[(X_val['Hour'] >= 12) & (X_val['Hour'] < 17), 'Hour_Influence'] = 1  # Afternoon
X_val.loc[(X_val['Hour'] >= 5) & (X_val['Hour'] < 12), 'Hour_Influence'] = 0.4  # Morning

X_val['User_Influence'] = X_val['UserReputation'] / avg_UserReputation_val
X_val['Comment_Score'] =  0.4 * X_val['RecipeNumber_Influence'] + 0.6 * X_val['Hour_Influence']
X_val['Positive_Interaction'] = ((X_val['Hour']) * (X_val['ThumbsUpCount'])) / (X_val['ThumbsUpCount'] + X_val['ThumbsDownCount']) 
X_val['Polarity_Sentiment'] = ((X_val['ThumbsUpCount'] - X_val['ThumbsDownCount']) / X_val['ReplyCount']).apply(lambda x: min(max(x, -20), 45))

In [None]:
avg_UserReputation_test = test_df['UserReputation'].mean()

test_df['RecipeNumber_Influence'] = 1 - (test_df['RecipeNumber'] - test_df['RecipeNumber'].min()) / (test_df['RecipeNumber'].max() - test_df['RecipeNumber'].min())
test_df['Hour_Influence'] = 0

test_df.loc[(test_df['Hour'] >= 17) & (test_df['Hour'] < 24), 'Hour_Influence'] = 0.8  # Evening
test_df.loc[(test_df['Hour'] >= 0) & (test_df['Hour'] < 5), 'Hour_Influence'] = 1  # Night
test_df.loc[(test_df['Hour'] >= 12) & (test_df['Hour'] < 17), 'Hour_Influence'] = 1  # Afternoon
test_df.loc[(test_df['Hour'] >= 5) & (test_df['Hour'] < 12), 'Hour_Influence'] = 0.4  # Morning

test_df['User_Influence'] = test_df['UserReputation'] / avg_UserReputation_test
test_df['Comment_Score'] =  0.4 * test_df['RecipeNumber_Influence'] + 0.6 * test_df['Hour_Influence']
test_df['Positive_Interaction'] = ((test_df['Hour']) * (test_df['ThumbsUpCount'])) / (test_df['ThumbsUpCount'] + test_df['ThumbsDownCount']) 
test_df['Polarity_Sentiment'] = ((test_df['ThumbsUpCount'] - test_df['ThumbsDownCount']) / test_df['ReplyCount']).apply(lambda x: min(max(x, -20), 45))


In [None]:
recipecode_frequency = X.groupby('RecipeCode').size().reset_index(name='RecipeCode_Frequency')
X_train = pd.merge(X_train, recipecode_frequency, on='RecipeCode', how='left')
X_val = pd.merge(X_val, recipecode_frequency, on='RecipeCode', how='left')
test_df = pd.merge(test_df, recipecode_frequency, on='RecipeCode', how='left')

### Text Preprocessing for Recipe_Review Column

In [None]:
def preprocess_recipe_review(text):
    # Remove HTML tags
    clean = re.compile('<.*?>')
    text = re.sub(clean, '', text)
    
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www\S+', '', text)
    
    # Remove email addresses
    text = re.sub(r'\S*@\S*\s?', '', text)
    
    # Convert to string type
    text = str(text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\']', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\s+$', '', text)  # Remove trailing spaces
    text = re.sub(r'^\s+', '', text)  # Remove leading spaces
    
    return text

In [None]:
X_train['Recipe_Review'] = X_train['Recipe_Review'].values.astype('U')
X_val['Recipe_Review'] = X_val['Recipe_Review'].values.astype('U')
test_df['Recipe_Review'] = test_df['Recipe_Review'].values.astype('U')
X_train['Recipe_Review'] = X_train['Recipe_Review'].apply(preprocess_recipe_review)
X_val['Recipe_Review'] = X_val['Recipe_Review'].apply(preprocess_recipe_review)
test_df['Recipe_Review'] = test_df['Recipe_Review'].apply(preprocess_recipe_review)
most_frequent_values_t = X_train.groupby('Rating')['Recipe_Review'].agg(lambda x: x.mode().iloc[0])
X_train['Recipe_Review'] = X_train.apply(lambda row: most_frequent_values_t[row['Rating']] if row['Recipe_Review']=='' or row['Recipe_Review']=='nan' else row['Recipe_Review'], axis=1)
most_frequent_values_v = X_val.groupby('Rating')['Recipe_Review'].agg(lambda x: x.mode().iloc[0])
X_val['Recipe_Review'] = X_val.apply(lambda row: most_frequent_values_v[row['Rating']] if row['Recipe_Review']=='' or row['Recipe_Review']=='nan' else row['Recipe_Review'], axis=1)

In [None]:
vectorizer = CountVectorizer(stop_words='english')
X1 = vectorizer.fit_transform(X_train['Recipe_Review'])
feature_names = vectorizer.get_feature_names_out()

count_train = pd.DataFrame(X1.toarray(), columns=feature_names)
count_train.index = X_train.index
combined_train = pd.concat([X_train, count_train], axis=1)

In [None]:
X2 = vectorizer.transform(X_val['Recipe_Review'])

feature_names_v = vectorizer.get_feature_names_out()

count_val = pd.DataFrame(X2.toarray(), columns=feature_names_v)
count_val.index = X_val.index
combined_val = pd.concat([X_val, count_val], axis=1)

In [None]:
X3 = vectorizer.transform(test_df['Recipe_Review'])
feature_names_te = vectorizer.get_feature_names_out()

count_test = pd.DataFrame(X3.toarray(), columns=feature_names_te)
count_test.index = test_df.index
combined_test = pd.concat([test_df, count_test], axis=1)

In [None]:
col = ['Hour','RecipeNumber','Recipe_Review', 'ReplyCount', 'ThumbsDownCount', 'ThumbsUpCount', 'UserReputation', 'BestScore']
train = combined_train.drop(col, axis=1)
val = combined_val.drop(col, axis=1)
test = combined_test.drop(col, axis=1)

In [None]:
X_train1 = train.drop('Rating', axis=1)
y_train1 = train['Rating']
X_val1 = val.drop('Rating', axis=1)
y_val1 = val['Rating']

In [None]:
lgbm = LGBMClassifier(verbosity=-1)
# lgbm = LGBMClassifier()

lgbm.fit(X_train1, y_train1)
y_pred = lgbm.predict(X_val1)
accuracy = accuracy_score(y_val1, y_pred)
accuracy

## Hyperparameter Tuning for the Final Model

In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from scipy.stats import uniform, randint
# Define the LightGBM model
lgbm = LGBMClassifier()

param_grid = {
    'objective': ['multiclass'],
    'num_class': [6],
    'is_unbalance': [True],
    'boosting_type': ['gbdt'],
    'learning_rate': uniform(0.01, 0.1),  
    'num_leaves': randint(50, 150), 
    'max_depth': randint(10, 30),  
    'min_child_samples': randint(20, 50), 
    'subsample': uniform(0.5, 1.0),
    'colsample_bytree': uniform(0.5, 1.0),  
    'n_estimators': randint(100, 1000),  
    'feature_fraction': uniform(0.0, 0.9), 
    'bagging_fraction' : uniform(0.0, 0.95),
    'min_child_weight': uniform(1.0, 10.0),  
    'reg_alpha': uniform(0.0, 1.0),  
    'reg_lambda': uniform(0.0, 1.0), 
    'random_state': [42],
    'verbosity': [-1]
}

# Create a RandomizedSearchCV object
random_search = RandomizedSearchCV(
    lgbm, param_distributions=param_grid, n_iter=10, scoring='accuracy', cv=5, random_state=42, verbose=0, n_jobs=-1
)

# Fit the RandomizedSearchCV object to your training data
random_search.fit(X_train1, y_train1)

# Get the best parameters and model from the RandomizedSearchCV
best_params = random_search.best_params_
best_model = random_search.best_estimator_

# Predict on the validation set using the best model
y_pred = best_model.predict(X_val1)

# Calculate accuracy
accuracy = accuracy_score(y_val1, y_pred)
print("Best Parameters:", best_params)

Best Parameters: {'bagging_fraction': 0.7759991820225434, 'boosting_type': 'gbdt', 'colsample_bytree': 0.7962735057040824, 'feature_fraction': 0.26526693906300247, 'is_unbalance': True, 'learning_rate': 0.011563640674119394, 'max_depth': 18, 'min_child_samples': 43, 'min_child_weight': 4.948815181755697, 'n_estimators': 747, 'num_class': 6, 'num_leaves': 137, 'objective': 'multiclass', 'random_state': 42, 'reg_alpha': 0.014079822715084456, 'reg_lambda': 0.19884240408880516, 'subsample': 1.21134195274865, 'verbosity': -1}

In [None]:
best = {'objective': 'multiclass','num_class': 6, 'is_unbalance':True,'boosting_type': 'gbdt','learning_rate': 0.018759635417952426, 'num_leaves': 123, 'max_depth': 23, 'min_child_samples': 38, 'subsample': 0.6098784490710742, 'colsample_bytree': 0.6063993010775044, 'n_estimators': 582, 'feature_fraction': 0.5093704858195883, 'bagging_fraction': 0.9312328232910088, 'min_child_weight': 6.250417984097327, 'reg_alpha': 0.07960897021051773, 'reg_lambda': 0.605631294303396, 'random_state':42, 'verbosity':-1}

In [None]:
lgbm = LGBMClassifier(**best)

lgbm.fit(X_train1, y_train1)
y_pred = lgbm.predict(X_val1)
accuracy = accuracy_score(y_val1, y_pred)
accuracy

In [None]:
prediction = lgbm.predict(test)
print(prediction)

In [None]:
predictions_df = pd.DataFrame({'ID': range(1, len(prediction) + 1), 'Rating': prediction})
predictions_df.to_csv('submission.csv', index=False)

> # **Model Comparison**
- LGBM outperformed other models, likely due to its effective handling of imbalanced datasets and ability to capture complex relationships.
- RandomForest, though a strong ensemble method, may not adapt as well to imbalances compared to boosting algorithms like LGBM and XGBoost.
- XGBoost, while a powerful boosting algorithm, performed slightly below LGBM, suggesting that the specific characteristics of the dataset favored LGBM.
- KNN, being a non-parametric method relying on distances, struggled in the presence of imbalances and high-dimensional data, leading to its lower performance.
- Ensemble methods (LGBM, RandomForest, XGBoost) generally outperformed the individual model (KNN), highlighting the importance of model complexity and adaptability in handling imbalanced datasets.
- Efficient default settings and computational efficiency played a role, with LGBM and XGBoost being more forgiving and faster in processing large datasets compared to RandomForest and KNN.