In [49]:
from sklearn.model_selection import train_test_split
import json
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [63]:
business = []
with open("../dataset/business_all.json", "r") as file:
    for line in file:
        business.append(json.loads(line))

checkin = []
with open("../dataset/checkin_all.json", "r") as file:
    for line in file:
        checkin.append(json.loads(line))

train_review = []
with open("../dataset/review_train.json", "r") as file:
    for line in file:
        train_review.append(json.loads(line))

test_review = []
with open("../dataset/review_test.json", "r") as file:
    for line in file:
        test_review.append(json.loads(line))

users = []
with open("../dataset/user_all.json", "r") as file:
    for line in file:
        users.append(json.loads(line))
        
total_review = train_review + test_review

In [51]:
len(train_review)/(len(train_review)+len(test_review)) #8:2 split

0.7999973902491008

In [52]:
train_rev, val_rev = train_test_split(train_review, test_size=0.2, random_state=42)

In [53]:
val_rev[0]

{'votes': {'funny': 0, 'useful': 0, 'cool': 0},
 'user_id': 'CcdJ_VhU_zqe2fL7G3eXug',
 'review_id': 'aUsn69Kwsnlar7xloj7vCA',
 'stars': 4,
 'date': '2011-06-02',
 'text': "Postino's has the best brushetta around. Fresh ingredients and a marvelous wine list to go with it. It is always busy but if you go a little early for lunch it is easier to get in.  The parking can be rough but plan ahead and it is worth it.\nThis is one place with food and atmosphere to rival one of my favorite places in Florence Italy.  Don't rush, be patient and above all be open minded.  You may discover a dish you would have never considered and love it.",
 'type': 'review',
 'business_id': 'SDwYQ6eSu1htn8vHWv128g'}

In [54]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [55]:
from collections import defaultdict
import numpy as np

UserPerRestaurant = defaultdict(list)
RestaurantPerUser = defaultdict(list)
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
for review in train_rev:
    u = review['user_id']
    restaurant = review['business_id']
    rate = review['stars'] 
    
    UserPerRestaurant[restaurant].append(u)
    RestaurantPerUser[u].append(restaurant)
    
    ratingsPerUser[u].append((restaurant,rate))
    ratingsPerItem[restaurant].append((u,rate))

In [56]:
restaurantCount = defaultdict(int)
totalEat = 0

for review in total_review:
    u = review['user_id']
    restaurant = review['business_id']
    rate = review['stars'] 
    restaurantCount[restaurant] +=1
    totalEat+=1

In [57]:
null_train_reviews_id = [
    review["review_id"] for review in train_rev 
    if review['text'] is None or review['text'].strip() == ''
]
print(f"Number of train null/empty reviews: {len(null_train_reviews_id)}")

null_val_reviews_id = [
    review["review_id"] for review in val_rev 
    if review['text'] is None or review['text'].strip() == ''
]
print(f"Number of val null/empty reviews: {len(null_val_reviews_id)}")

null_test_reviews_id = [
    review["review_id"] for review in test_review 
    if review['text'] is None or review['text'].strip() == ''
]
print(f"Number of test null/empty reviews: {len(null_test_reviews_id)}")

null_train_reviews_id,null_val_reviews_id,null_test_reviews_id

Number of train null/empty reviews: 4
Number of val null/empty reviews: 1
Number of test null/empty reviews: 1


(['0g3vPgL4oW7sBxZA6R2ZXQ',
  'NOhzie7YJMprviyeaEh_AQ',
  'K7fQAOToFP_pXiE7f5OLUg',
  'bY6XggJUzpO6SNUOaS3zaw'],
 ['w8LVmdgVqdSYsuPR0RoZZA'],
 ['PY-Eir6BUMMfw1u0_OipiA'])

In [155]:
users_dict = {user['user_id']: user for user in users}

X_train = []
y_train = []

total_length = 0
for review in train_rev:
    if review["review_id"] == null_train_reviews_id: continue
    total_length += sum(1 for char in review['text'] if char.isalnum())
avg_review_length = total_length / (len(train_rev) - len(null_train_reviews_id)) if train_rev else 0

for review in train_rev:
    if review["review_id"] == null_train_reviews_id: continue
    user_id = review['user_id']
    restaurant = review['business_id']
    rate = review['stars']
    #1
    user_data = users_dict.get(user_id)
    if user_data:
        review_count = user_data['review_count']
        if review_count < 50:
            avg_rating_category = 'Low'
        elif review_count < 100:
            avg_rating_category = 'Medium'
        else:
            avg_rating_category = 'High'
        avg_rating_category_numeric = {'Low': 0, 'Medium': 1, 'High': 2}[avg_rating_category]
#2
    avg_user_rating = np.mean([rate for _, rate in ratingsPerUser[user_id]]) if ratingsPerUser[user_id] else 0
    avg_restaurant_rating = np.mean([rate for _, rate in ratingsPerItem[restaurant]]) if ratingsPerItem[restaurant] else 0

    avg_review_length = sum(1 for char in review['text'] if char.isalnum())

    features = np.array([ avg_review_length, avg_restaurant_rating,avg_rating_category_numeric ])
    X_train.append(features)
    y_train.append(rate)


In [156]:
users_dict = {user['user_id']: user for user in users}

X_val = []
y_val = []

total_length = 0
for review in val_rev:
    if review["review_id"] == null_val_reviews_id: continue
    total_length += sum(1 for char in review['text'] if char.isalnum())
avg_review_length = total_length / (len(val_rev) - len(null_val_reviews_id)) if val_rev else 0

for val_review in val_rev:
    if val_review["review_id"] == null_val_reviews_id: continue
    val_user_id = val_review['user_id']
    val_restaurant = val_review['business_id']
    val_rate = val_review['stars']

    user_data = users_dict.get(val_user_id)
    if user_data:
        review_count = user_data['review_count']

        if review_count < 50:
            avg_rating_category = 'Low'
        elif review_count < 100:
            avg_rating_category = 'Medium'
        else:
            avg_rating_category = 'High'
        
        avg_rating_category_numeric = {'Low': 0, 'Medium': 1, 'High': 2}[avg_rating_category]
        avg_user_rating = np.mean([rate for _, rate in ratingsPerUser[val_user_id]]) if ratingsPerUser[val_user_id] else 0
        avg_restaurant_rating = np.mean([rate for _, rate in ratingsPerItem[val_restaurant]]) if ratingsPerItem[val_restaurant] else 0

        avg_review_length = sum(1 for char in val_review['text'] if char.isalnum())
    
        val_features = np.array([avg_review_length, avg_restaurant_rating, avg_rating_category_numeric])
        X_val.append(val_features)
        y_val.append(val_rate)


### Linear Regression 

In [157]:
theta, residuals, rank, s = np.linalg.lstsq(X_train, y_train)

  theta, residuals, rank, s = np.linalg.lstsq(X_train, y_train)


In [158]:
theta

array([-2.79453557e-04,  1.03603500e+00,  1.66433132e-02])

In [159]:
y_pred = np.matmul(X_val, theta)

for i in range(len(y_pred)):
    if y_pred[i]  <1:
        y_pred[i] = 1
    if y_pred[i] > 5:
        y_pred[i] = 5
        
mse = np.mean((y_val-y_pred)**2)

In [160]:
print("validation MSE: ", mse)

validation MSE:  1.3904127025277067


In [166]:
X_test = []
y_test = []

total_length = 0
for review in test_review:
    if review["review_id"] == null_test_reviews_id: continue
    total_length += sum(1 for char in review['text'] if char.isalnum())
avg_review_length = total_length / (len(test_review) - len(null_test_reviews_id)) if test_review else 0

for review in test_review:
    if review["review_id"] == null_test_reviews_id: continue
    test_user_id = review['user_id']
    test_restaurant = review['business_id']
    test_rate = review['stars']

    user_data = users_dict.get(test_user_id)
    if user_data:
        review_count = user_data['review_count']

        if review_count < 50:
            avg_rating_category = 'Low'
        elif review_count < 100:
            avg_rating_category = 'Medium'
        else:
            avg_rating_category = 'High'
        
        avg_rating_category_numeric = {'Low': 0, 'Medium': 1, 'High': 2}[avg_rating_category]
        avg_user_rating = np.mean([rate for _, rate in ratingsPerUser[test_user_id]]) if ratingsPerUser[test_user_id] else 0
        avg_restaurant_rating = np.mean([rate for _, rate in ratingsPerItem[test_restaurant]]) if ratingsPerItem[test_restaurant] else 0

        avg_review_length = sum(1 for char in review['text'] if char.isalnum())
    
        test_features = np.array([avg_review_length, avg_restaurant_rating, avg_rating_category_numeric])
        X_test.append(test_features)
        y_test.append(test_rate)


In [167]:
y_pred = np.matmul(X_test, theta)

for i in range(len(y_pred)):
    if y_pred[i]  <1:
        y_pred[i] = 1
    if y_pred[i] > 5:
        y_pred[i] = 5
        
mse = np.mean((y_test-y_pred)**2)
print("test MSE: ", mse)

test MSE:  1.3933411621857406
