In [None]:
#Task2

import pandas as pd

#load dataset
# Location of the recipe dataset, relative to the notebook's working directory.
file_path = "recipes.csv"
# Parse the CSV into the DataFrame used by the rest of this cell.
df = pd.read_csv(filepath_or_buffer=file_path)

#data processing
#------------------- identifies missing values -------------------
# Testing showed that the only missing values in this dataset are single-space strings (" ").
# For scalability, the empty string "" is also checked, to catch new records that use "" instead.
def show_records_with_missing_values(df):
    """Print every row of ``df`` that has a blank placeholder in any column.

    A cell counts as missing when it is exactly the single-space string ``" "``
    or the empty string ``""``. Nothing is returned and ``df`` is not modified.
    """
    # Cell-wise flags: True wherever the value is one of the two blank markers.
    is_blank = df.eq(" ") | df.eq("")
    # Keep the rows in which at least one column was flagged.
    rows_with_blank = is_blank.any(axis=1)
    missing_records = df.loc[rows_with_blank]
    print("Records with missing values:\n", missing_records)

#number of missing values per attribute
def check_missing_values(frame=None):
    """Print, for each column, how many cells hold a blank placeholder.

    A cell counts as missing when it is exactly ``" "`` or ``""``.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        The frame to inspect. When omitted, the notebook-level ``df`` is used,
        which is what a call with no arguments has always done.

    Returns
    -------
    None
        The per-column counts are printed, not returned.
    """
    # Accept an explicit frame like the sibling helpers do, but keep the
    # zero-argument call working by falling back to the global DataFrame.
    target = df if frame is None else frame
    missing_values = ((target == " ") | (target == "")).sum()
    print("Missing values per column (empty strings and whitespace):\n", missing_values)

#treats missing values
def handle_missing_values(df):
    """Overwrite blank placeholders in ``df`` with the string ``"N/A"``.

    Both the empty string ``""`` and the single-space string ``" "`` are
    replaced. The replacement is done in place, so the caller's DataFrame is
    modified; the same object is also returned for convenience.
    """
    # Same order as before: empty string first, then the single space.
    for blank in ("", " "):
        df.replace(blank, "N/A", inplace=True)
    return df

# Summary statistics of the dataset as loaded.
summary_stats = df.describe()
print(summary_stats)

In [None]:
#Fahds Work

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# 1. Load the data
df = pd.read_csv('recipes.csv')

# Identify and treat missing values: drop every row that contains a NaN, then
# renumber the remaining rows 0..n-1. Without the renumbering the index keeps
# gaps where rows were removed, so index labels stop matching row positions;
# the cosine-similarity matrix built later in this cell is addressed by row
# position, so the two must stay aligned.
# NOTE(review): the earlier cell reports that blanks in this file appear as " "
# strings, which dropna() does not treat as missing — confirm whether those
# rows need handling here as well.
df = df.dropna().reset_index(drop=True)

# Show summary statistics
print(df.describe())

# Show the 10 highest-rated recipes (sorted by rating_avg, best first)
highest_rated = (
    df
    .sort_values(by='rating_avg', ascending=False)
    .head(10)
)
print(highest_rated.loc[:, ['title', 'rating_avg']])

# 2. Calculate the average ratings for each recipe
# Rows sharing a title are collapsed into one mean rating_avg per title.
average_ratings = (
    df
    .groupby('title')['rating_avg']
    .mean()
)
highest_average_ratings = (
    average_ratings
    .sort_values(ascending=False)
    .head(10)
)
print(highest_average_ratings)

# Bootstrapping 95% confidence interval for the mean of rating_avg
N_BOOTSTRAP = 1000     # number of bootstrap resamples
BOOTSTRAP_SEED = 42    # fixed seed so a fresh Run All reproduces the same interval

rating_values = df['rating_avg'].to_numpy()
bootstrap_rng = np.random.default_rng(BOOTSTRAP_SEED)

# Each resample draws as many ratings as the dataset holds (with replacement).
# Resampling at the original sample size is what makes the percentile interval
# describe the uncertainty of the mean of *this* dataset; a fixed smaller size
# would describe the mean of a smaller sample instead.
bootstrap_samples = [
    bootstrap_rng.choice(rating_values, size=rating_values.size, replace=True).mean()
    for _ in range(N_BOOTSTRAP)
]

# Percentile method: the middle 95% of the resampled means.
ci_lower = np.percentile(bootstrap_samples, 2.5)
ci_upper = np.percentile(bootstrap_samples, 97.5)
print(f"95% Confidence Interval: ({ci_lower}, {ci_upper})")

# 3. Visualizing the relationship between rating_avg and rating_val
# rating_val goes on the x-axis, rating_avg on the y-axis; semi-transparent
# markers keep overlapping points distinguishable.
fig, ax = plt.subplots()
ax.scatter(df['rating_val'], df['rating_avg'], alpha=0.5)
ax.set_xlabel('Number of Ratings')
ax.set_ylabel('Average Rating')
ax.set_title('Ratings vs. Number of Ratings')
plt.show()

# Suggesting a threshold for significance
# Inspect the distribution of rating_val before choosing a cut-off.
rating_count_summary = df['rating_val'].describe()
print(rating_count_summary)
# Example threshold: Only consider recipes with at least 10 ratings

# 4a. Combine selected features into a single string
# Every listed column is cast to text and the pieces are joined with spaces,
# giving one string per recipe.
feature_columns = [
    'title', 'rating_avg', 'rating_val', 'total_time',
    'category', 'cuisine', 'ingredients',
]
features_as_text = df[feature_columns].astype(str)
df['combine_features'] = features_as_text.agg(' '.join, axis=1)

# 4b. Compute cosine similarity
# Token counts per recipe, then pairwise cosine similarity of those count
# vectors; row/column i of cosine_sim corresponds to the i-th row of df.
vectorizer = CountVectorizer()
feature_matrix = vectorizer.fit_transform(df['combine_features'])
cosine_sim = cosine_similarity(feature_matrix, feature_matrix)

# 4c. Get recommendations for 'Chicken and coconut curry'
def get_recommendations(recipe_title, top_n=10):
    """Return the titles of the recipes most similar to ``recipe_title``.

    Similarity is read from the notebook-level ``cosine_sim`` matrix, whose
    rows and columns follow the row order of the notebook-level ``df``.

    Parameters
    ----------
    recipe_title : str
        Exact title to look up in ``df['title']``. If several rows share the
        title, the first one is used.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        A single-column (``title``) frame with up to ``top_n`` rows, ordered
        from most to least similar. The queried recipe itself is never included.

    Raises
    ------
    IndexError
        If no row of ``df`` has the given title.
    """
    # cosine_sim is addressed by row *position*, so look the recipe up by
    # position rather than by index label; the two differ whenever df's index
    # is not a clean 0..n-1 range (e.g. after rows have been dropped).
    title_matches = np.flatnonzero((df['title'] == recipe_title).to_numpy())
    if title_matches.size == 0:
        raise IndexError(f"No recipe titled {recipe_title!r} found in df")
    position = title_matches[0]

    scores = np.asarray(cosine_sim[position])
    # Descending by score; the stable sort keeps ties in row order.
    ranked_positions = np.argsort(-scores, kind="stable")
    # Drop the query row explicitly instead of assuming it sorts first: an
    # identical recipe can tie with it and take the first slot.
    recommended_positions = ranked_positions[ranked_positions != position][:top_n]
    return df.iloc[recommended_positions][['title']]

# Example query: recipes most similar to 'Chicken and coconut curry'.
curry_recommendations = get_recommendations('Chicken and coconut curry')
print(curry_recommendations)