In [91]:
from sklearn.model_selection import train_test_split
import json
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [12]:
def load_json_lines(path):
    """Read a JSON-Lines file (one JSON document per line) into a list.

    Args:
        path: Location of the file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.
    """
    # JSON text is UTF-8; be explicit so decoding does not depend on the
    # platform's default encoding.
    with open(path, "r", encoding="utf-8") as file:
        # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
        return [json.loads(line) for line in file if line.strip()]


business = load_json_lines("../dataset/business_all.json")
checkin = load_json_lines("../dataset/checkin_all.json")
train_review = load_json_lines("../dataset/review_train.json")
test_review = load_json_lines("../dataset/review_test.json")
user = load_json_lines("../dataset/user_all.json")

# Every review from both files, training reviews first.
total_review = train_review + test_review

In [6]:
# Fraction of all reviews (train file + test file) that belong to the training file.
len(train_review)/(len(train_review)+len(test_review)) #8:2 split

0.7999973902491008

In [18]:
# Hold out 20% of the training reviews as a validation set; the fixed
# random_state makes the split identical on every run.
train_rev, val_rev = train_test_split(
    train_review,
    test_size=0.2,
    random_state=42,
)

In [19]:
# Display the first validation review to see which fields a review record carries.
val_rev[0]

{'votes': {'funny': 0, 'useful': 0, 'cool': 0},
 'user_id': 'CcdJ_VhU_zqe2fL7G3eXug',
 'review_id': 'aUsn69Kwsnlar7xloj7vCA',
 'stars': 4,
 'date': '2011-06-02',
 'text': "Postino's has the best brushetta around. Fresh ingredients and a marvelous wine list to go with it. It is always busy but if you go a little early for lunch it is easier to get in.  The parking can be rough but plan ahead and it is worth it.\nThis is one place with food and atmosphere to rival one of my favorite places in Florence Italy.  Don't rush, be patient and above all be open minded.  You may discover a dish you would have never considered and love it.",
 'type': 'review',
 'business_id': 'SDwYQ6eSu1htn8vHWv128g'}

In [248]:
def MSE(predictions, labels):
    """Return the mean squared error between predictions and labels.

    Args:
        predictions: Iterable of predicted numeric values.
        labels: Iterable of true numeric values, aligned with ``predictions``.

    Returns:
        The average of ``(prediction - label) ** 2`` over all pairs.

    Raises:
        ValueError: If the two inputs do not have the same length.
        ZeroDivisionError: If both inputs are empty.
    """
    # Materialise first so one-shot iterables work and lengths can be compared.
    predictions = list(predictions)
    labels = list(labels)

    # zip() would silently drop the unmatched tail and report a wrong score.
    if len(predictions) != len(labels):
        raise ValueError(
            "predictions and labels must have the same length, got "
            f"{len(predictions)} and {len(labels)}"
        )

    differences = [(x - y) ** 2 for x, y in zip(predictions, labels)]
    return sum(differences) / len(differences)

In [247]:
from collections import defaultdict
import numpy as np

# Index the training reviews in both directions so later cells can look up
# per-user and per-restaurant statistics.
UserPerRestaurant = defaultdict(list)  # business_id -> [user_id, ...]
RestaurantPerUser = defaultdict(list)  # user_id -> [business_id, ...]
ratingsPerUser = defaultdict(list)     # user_id -> [(business_id, stars), ...]
ratingsPerItem = defaultdict(list)     # business_id -> [(user_id, stars), ...]

for review in train_rev:
    # Use dedicated local names: a loop variable called `user` would overwrite
    # the list of user records loaded from user_all.json.
    user_id = review['user_id']
    business_id = review['business_id']
    stars = review['stars']

    UserPerRestaurant[business_id].append(user_id)
    RestaurantPerUser[user_id].append(business_id)

    ratingsPerUser[user_id].append((business_id, stars))
    ratingsPerItem[business_id].append((user_id, stars))

In [249]:
# Count how many reviews each restaurant has across train + test reviews;
# totalEat is the overall number of reviews and is used to normalise the counts.
# NOTE(review): total_review is train_review + test_review, so these counts
# include test-set rows — confirm that is acceptable for the popularity feature.
restaurantCount = defaultdict(int)

for review in total_review:
    # Only the business id is needed here; the user id and star rating are not read.
    restaurantCount[review['business_id']] += 1

# One review per iteration, so the total is simply the number of reviews.
totalEat = len(total_review)

In [364]:
# Build the training design matrix: one row per training review holding
# [mean restaurant rating, restaurant review count, normalised popularity],
# with the review's star rating as the target.
# User-level statistics are not part of the active feature vector.

# Mean training rating per restaurant, computed once up front instead of being
# recomputed for every review of the same restaurant.
# NOTE(review): ratingsPerItem is built from train_rev, so each training row's
# mean includes that row's own rating — consider a leave-one-out mean.
avg_rating_by_restaurant = {
    business_id: np.mean([stars for _, stars in ratings])
    for business_id, ratings in ratingsPerItem.items()
    if ratings
}

X_train = []
y_train = []

for review in train_rev:
    restaurant = review['business_id']
    rate = review['stars']  # This is the rating (1, 2, 3, 4, or 5)

    # .get() keeps the lookups read-only: a missing key falls back to the
    # default without being inserted into the index.
    avg_restaurant_rating = avg_rating_by_restaurant.get(restaurant, 0)
    restaurant_review_count = len(UserPerRestaurant.get(restaurant, ()))
    popularity_score = restaurantCount[restaurant] / totalEat if restaurant in restaurantCount else 0

    # Kept as a named loop variable so the last feature row stays available
    # for inspection after the loop.
    features = np.array([avg_restaurant_rating, restaurant_review_count, popularity_score])
    X_train.append(features)
    y_train.append(rate)


In [365]:
# Build the validation design matrix with the same three features, in the same
# order, as the training rows:
# [mean restaurant rating, restaurant review count, normalised popularity].

# Mean training rating per restaurant, computed once. Empty rating lists are
# skipped so np.mean is never called on an empty list.
val_avg_rating_by_restaurant = {
    business_id: np.mean([stars for _, stars in ratings])
    for business_id, ratings in ratingsPerItem.items()
    if ratings
}

X_val = []
y_val = []

for val_review in val_rev:
    val_restaurant = val_review['business_id']
    val_rate = val_review['stars']

    # Restaurant average rating over the training reviews.
    # .get() instead of indexing: indexing a defaultdict with a restaurant that
    # only appears in the validation split would insert an empty entry and
    # silently grow the training indexes.
    # NOTE(review): unseen restaurants fall back to 0, which is outside the
    # 1-5 rating range — a global mean may be a better default.
    avg_restaurant_rating = val_avg_rating_by_restaurant.get(val_restaurant, 0)

    # Restaurant review count (how many training reviews this restaurant has).
    restaurant_review_count = len(UserPerRestaurant.get(val_restaurant, ()))

    # Normalised popularity.
    popularity_score = restaurantCount[val_restaurant] / totalEat if val_restaurant in restaurantCount else 0

    val_features = np.array([avg_restaurant_rating, restaurant_review_count, popularity_score])
    X_val.append(val_features)
    y_val.append(val_rate)



# total_pred = 0
# correct_pred = 0
# for true, pred in zip(y_val, predicted_classes):
#     if true == pred:
#         correct_pred +=1
#     total_pred += 1

# accuracy = correct_pred/float(total_pred) 


In [366]:
# Display `features` as left behind by the last iteration of the loop that built X_train.
features

array([3.60714286e+00, 2.80000000e+01, 1.65284224e-04])

### Linear Regression 

In [367]:
# Fit ordinary least squares: theta minimises ||X_train @ theta - y_train||^2.
# rcond=None is passed explicitly; NumPy 1.x emits a FutureWarning when it is
# omitted, and None selects the machine-precision-based singular-value cutoff.
# NOTE(review): the feature rows carry no constant column, so the model has no
# intercept term — confirm this is intended.
theta, residuals, rank, s = np.linalg.lstsq(X_train, y_train, rcond=None)

# theta holds one coefficient per feature column, e.g.:
#theta0, theta1, theta2= theta

  theta, residuals, rank, s = np.linalg.lstsq(X_train, y_train)


In [368]:
# Predict validation ratings as the linear combination of features and weights.
y_pred = np.matmul(X_val, theta)

# Ratings are 1-5, so clamp out-of-range predictions to the nearest bound
# (vectorised equivalent of checking each element against 1 and 5).
y_pred = np.clip(y_pred, 1, 5)

# Mean squared error on the validation set; y_val is a plain list, so convert
# it explicitly before the element-wise subtraction.
mse = np.mean((np.asarray(y_val) - y_pred) ** 2)

In [369]:
# Display the validation mean squared error of the clipped predictions.
mse

1.4064451825596227

In [357]:
# Display the validation predictions.
# NOTE(review): this cell's execution count (357) is lower than that of the
# cell that computes y_pred (368), so the saved output may be stale — re-run to confirm.
y_pred

array([4.50546448, 4.2       , 3.96969697, ..., 3.83333333, 3.73195876,
       3.52272727])