In [3]:
import json
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import numpy as np
from datetime import datetime

In [4]:
business = []
with open("../dataset/business_all.json", "r") as file:
    for line in file:
        business.append(json.loads(line))

checkin = []
with open("../dataset/checkin_all.json", "r") as file:
    for line in file:
        checkin.append(json.loads(line))

train_review = []
with open("../dataset/review_train.json", "r") as file:
    for line in file:
        train_review.append(json.loads(line))

test_review = []
with open("../dataset/review_test.json", "r") as file:
    for line in file:
        test_review.append(json.loads(line))

user = []
with open("../dataset/user_all.json", "r") as file:
    for line in file:
        user.append(json.loads(line))
        

In [5]:
### baseline
train_average_stars = sum(d["stars"] for d in train_review) / len(train_review)
mse = sum((d["stars"] - train_average_stars) ** 2 for d in test_review) / len(test_review)
print(mse)

1.490833578439748


In [6]:
def prepare_data(business_data, user_data, train_review_data, test_review_data):
    # Create dictionaries for fast lookup
    business_dict = {b["business_id"]: b for b in business_data}
    user_dict = {u["user_id"]: u for u in user_data}

    # Initialize lists for features (X) and target variable (y)
    X_train = []  # Features for training data
    y_train = []  # Target variable for training data
    X_test = []   # Features for testing data
    y_test = []   # Target variable for testing data

    # Extract all unique categories for encoding
    all_categories = set(
        category for business in business_data for category in business["categories"]
    )
    print("AAA")
    print(len(all_categories))
    mlb = MultiLabelBinarizer(classes=list(all_categories))  # MultiLabelBinarizer for categories

    # Preprocess business categories into a one-hot encoded matrix
    category_features = mlb.fit_transform([business["categories"] for business in business_data])

    # Extract all unique states for encoding
    all_states = list(set(business["state"] for business in business_data))
    state_encoder = OneHotEncoder(categories=[all_states], sparse=False)

    # Preprocess business states into a one-hot encoded matrix
    state_features = state_encoder.fit_transform([[business["state"]] for business in business_data])

    
    # Helper function to extract features for reviews
    def extract_features_and_target(review_data, category_features, state_features):
        X = []
        y = []
        
        for review in review_data:
            business_id = review["business_id"]
            user_id = review["user_id"]
            review_date = datetime.strptime(review["date"], "%Y-%m-%d").timestamp()  # Convert date to timestamp
            review_stars = review["stars"]

            # Find corresponding business data
            business = business_dict.get(business_id, None)

            # If business data is not found, use default values
            if not business:
                business_review_count = 0
                business_latitude = 0.0
                business_longitude = 0.0
                business_category_vector = np.zeros(category_features.shape[1])  # Empty category vector
                business_state_vector = np.zeros(state_features.shape[1])  # Empty state vector
            else:
                business_review_count = business["review_count"]
                business_latitude = business["latitude"]
                business_longitude = business["longitude"]

                # Find the index of the business to fetch category and state features
                business_index = list(business_dict.keys()).index(business_id)

                # Get the category and state one-hot encoded vectors
                business_category_vector = category_features[business_index]
                business_state_vector = state_features[business_index]

            # Find corresponding user data
            user = user_dict.get(user_id, None)  # Fast lookup in the user dictionary
            if user:
                user_avg_stars = user["average_stars"]
                user_review_count = user["review_count"]
            else:
                user_avg_stars = 3.766723066283323  # Default value if user not found
                user_review_count = 0  # Default value if user not found

            # Combine all features
            feature_vector = [
                business_review_count, 
                business_latitude, 
                business_longitude, 
                review_date, 
                user_avg_stars, 
                user_review_count,
            ] + list(business_category_vector) + list(business_state_vector)  # Append category and state features

            X.append(feature_vector)
            y.append(review_stars)  # Target variable (stars)

        return np.array(X), np.array(y)

    # Process training data
    X_train, y_train = extract_features_and_target(train_review_data, category_features, state_features)

    # Process test data
    X_test, y_test = extract_features_and_target(test_review_data, category_features, state_features)

    return X_train, y_train, X_test, y_test

In [7]:
# Step 1: Prepare the training and testing data
X_train, y_train, X_test, y_test = prepare_data(business, user, train_review, test_review)

AAA
508


In [8]:
# Step 2: Train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Step 3: Make predictions on the test set
y_pred = model.predict(X_test)

# Step 4: Evaluate the model
mse = mean_squared_error(y_test, y_pred)

# Print the evaluation results
print("Mean Squared Error (MSE):", mse)

# Step 5: Inspect the model's coefficients
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

Mean Squared Error (MSE): 1.1214726693301182
Coefficients: [ 9.25482877e-04  5.93643485e-02  7.69291666e-02  2.31613534e-10
  9.28223523e-01  2.15478258e-06  4.77177457e-01  2.70092324e-01
 -1.23555256e-01 -8.60364177e-02 -1.34025630e-02  5.65927325e-02
 -4.51155872e-01  4.73409745e-01  3.77672474e-01 -2.82978359e-01
 -2.26009165e-01  5.60965315e-01  1.11248745e-02 -5.99443212e-02
  8.06845812e-01  2.11550443e-01 -3.06523263e-01 -1.89457703e-01
 -3.65623343e-01  2.53741556e-01  1.59013194e-01  1.20744354e-01
  2.08794646e-01 -1.81004096e-01  1.87081182e-01  1.09313017e-01
 -2.43021880e-01 -2.36787547e-01 -3.74486422e-02  2.58628653e-01
  1.66631438e-01 -2.88739068e-01 -1.99371951e-02  2.03821885e-01
 -4.69956387e-02 -9.70272194e-04  2.65840623e-01  2.14477520e-01
  1.10395600e-02  3.13087481e-01  8.29362063e-01  9.33428615e-01
  5.47506630e-02 -5.23026818e-01  5.50553297e-01 -7.37967786e-01
 -4.55857870e-01  3.02645189e-01  5.50976872e-02  3.42905775e-01
 -1.26800735e-02  1.85524949e-0