In [17]:
# Step 1: Load training and testing data
def load_data_withoutcsv(training_path, testing_path):
    linestraining = [line.rstrip('\n') for line in open(training_path)]
    train_data = []
    for line in linestraining: 
        words = line.split(',')
        userid = int(words[0])
        itemid = int(words[1])
        rating = float(words[2])
        train_data.append([userid, itemid, rating])
    
    test_data = []
    linestesting = [line.rstrip('\n') for line in open(testing_path)]
    for line in linestesting:
        words = line.split(',')
        userid = int(words[0])
        itemid = int(words[1])
        test_data.append([userid, itemid])

    random.shuffle(train_data)
    split_index = int(0.8 * len(train_data))
    train_data, val_data = train_data[:split_index], train_data[split_index:]

    return train_data, val_data, test_data

In [18]:
# Step 2: Build the User-Item Rating Matrix
def build_user_item_matrix(train_data):
    users = sorted(set([d[0] for d in train_data]))
    items = sorted(set([d[1] for d in train_data]))

    user_map = {u: i for i, u in enumerate(users)}
    item_map = {i: j for j, i in enumerate(items)}

    matrix = np.zeros((len(users), len(items)))
    for user, item, rating in train_data:
        matrix[user_map[user], item_map[item]] = rating
    
    # Compute the User Average Ratings: 
    user_avg_ratings = np.zeros(len(users))
    for user in users:
        user_avg_ratings[user_map[user]] = np.mean(matrix[user_map[user], :])

    return matrix, user_map, item_map, user_avg_ratings

In [19]:
# Step 3: Compute the Cosine Similarity Matrix
import numpy as np
def adjusted_cosine_similarity(matrix, user_avg_ratings):
    # Compute the adjusted cosine similarity matrix
    adjusted_matrix = matrix - user_avg_ratings[:, None]
    similarity_matrix = adjusted_matrix.dot(adjusted_matrix.T)
    norms = np.linalg.norm(adjusted_matrix, axis=1)
    similarity_matrix = similarity_matrix / (np.linalg.norm(adjusted_matrix, axis=1)[:, None] @ np.linalg.norm(adjusted_matrix, axis=1)[:, None].T)    
    return similarity_matrix

def cosine_similarity(matrix):
    # Compute the cosine similarity matrix
    similarity_matrix = matrix.dot(matrix.T)
    similarity_matrix = similarity_matrix / (np.linalg.norm(matrix, axis=1)[:, None] @ np.linalg.norm(matrix, axis=1)[:, None].T)    
    return similarity_matrix


In [20]:
# Step 4: Predict Ratings
def predict_ratings(userid, itemid, matrix, similarity, user_map, item_map, k=10, similarity_threshold=0.2):
    if userid not in user_map or itemid not in item_map:
        return 3.0

    user_index = user_map[userid]
    item_index = item_map[itemid]

    if item_index >= similarity.shape[0]:  # Prevent index out-of-bounds error
        return 3.0
    
    # Get similarities for the target item
    item_similarity_scores = similarity[item_index]

    # Find items the user has rated
    rated_items = np.where(matrix[user_index, :] > 0)[0]

    if len(rated_items) == 0:
        return 3.0  # Default rating
    
    # Ensure indices are within valid range
    valid_rated_items = rated_items[rated_items < similarity.shape[0]]

    if len(valid_rated_items) == 0:
        return 3.0

    # Sort items by similarity and select top-k
    sorted_items = valid_rated_items[np.argsort(item_similarity_scores[valid_rated_items])[::-1]]
    top_k_items = [item for item in sorted_items if item_similarity_scores[item] > similarity_threshold][:k]

    # Compute weighted sum of ratings
    numerator = sum([similarity[item_index, item] * matrix[user_index, item] for item in top_k_items])
    denominator = sum([similarity[item_index, item] for item in top_k_items])

    return round(numerator / denominator) if denominator != 0 else 3.0


In [21]:
# Step 5: Generate Predictions for the Test Data
def gen_preds(test_dataset, matrix, similarity, user_map, item_map, output):
    preds = []
    for user, item in test_dataset:
        pred = predict_ratings(user, item, matrix, similarity, user_map, item_map)
        pred = min(max(pred, 0.5), 5.0)
        preds.append([user, item, round(pred, 3)])

    with open(output, 'w') as file:
        writer = csv.writer(file)
        writer.writerow(['userid', 'itemid', 'predicted_rating'])
        writer.writerows(preds) 

In [22]:
# Helper Functions for Evaluation: 
def MAE(y_pred, y_true):
    return np.mean(np.abs(np.array(y_pred) - np.array(y_true)))

def RMSE(y_pred, y_true):
    return np.sqrt(np.mean((np.array(y_pred) - np.array(y_true))**2))


In [23]:
import csv
import numpy as np
import random
train_dataset =  'C:/Users/semel/Downloads/socialcomp/train_100k_withratings.csv'
test_dataset =  'C:/Users/semel/Downloads/socialcomp/test_100k_withoutratings.csv'
# Create the output file:
output = 'output.csv'

# def case_amp(similarity_matrix, alpha= 2.5):
#   return np.sign(similarity_matrix) * np.abs(similarity_matrix) ** alpha


train_data, validation_data, test_data = load_data_withoutcsv(train_dataset, test_dataset)
print ('Datasets found, data loaded')

def variance_weight(similarity_matrix ,matrix):
    item_variance = np.var(matrix, axis=0)
    weight_matrix = np.sqrt(item_variance[:, None] @ item_variance[None, :])
    return similarity_matrix * weight_matrix

matrix, user_map, item_map, user_averages = build_user_item_matrix(train_data)
print ('User-item matrix created')

similarity = adjusted_cosine_similarity(matrix, user_averages)
print ('Similarity computed')
print ('Similarity adjusted')





val_true = []
val_pred = []

from itertools import product

# Define hyperparameter ranges
# Number of neighbors (starts from k=5 and is multiples of 5)
k_values = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 150, 200, 250, 300, 350, 400, 450,500]
print (len(k_values))
# Similarity threshold (starts from k = 0.005 and is multiples of 0.005)
similarity_thresholds = [0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.035, 0.04, 0.045, 0.05, 0.055, 0.06, 0.065, 0.07, 0.075, 0.08, 0.085, 0.09, 0.095, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
print (len(similarity_thresholds))
# Store results
best_params = None
best_mae = float('inf')
results = []

# Perform grid search
for k, sim_threshold in product(k_values, similarity_thresholds):
    val_true = []
    val_pred = []

    # Evaluate on validation set
    for user, item, rating in validation_data:
        predicted_rating = predict_ratings(user, item, matrix, similarity, user_map, item_map, k=k, similarity_threshold=sim_threshold)
        val_true.append(rating)
        val_pred.append(predicted_rating)

    # Compute evaluation metrics
    mae = MAE(val_pred, val_true)
    rmse = RMSE(val_pred, val_true)

    print (f"k={k}, similarity_threshold={sim_threshold}, MAE={mae:.3f}, RMSE={rmse:.3f}")

    # Store result
    results.append((k, sim_threshold, mae, rmse))

    # Check for best parameters
    if mae < best_mae:
        best_mae = mae
        best_params = (k, sim_threshold)

# Display results
import pandas as pd

# Print best hyperparameters
print(f"Best hyperparameters: k={best_params[0]}, similarity_threshold={best_params[1]}")
print(f"Best MAE: {best_mae:.3f}")


Datasets found, data loaded
User-item matrix created
Similarity computed
Similarity adjusted
28
28
k=5, similarity_threshold=0.005, MAE=0.879, RMSE=1.174
k=5, similarity_threshold=0.01, MAE=0.879, RMSE=1.174
k=5, similarity_threshold=0.015, MAE=0.879, RMSE=1.174
k=5, similarity_threshold=0.02, MAE=0.879, RMSE=1.174
k=5, similarity_threshold=0.025, MAE=0.879, RMSE=1.174
k=5, similarity_threshold=0.03, MAE=0.879, RMSE=1.174
k=5, similarity_threshold=0.035, MAE=0.879, RMSE=1.174
k=5, similarity_threshold=0.04, MAE=0.879, RMSE=1.174
k=5, similarity_threshold=0.045, MAE=0.879, RMSE=1.174
k=5, similarity_threshold=0.05, MAE=0.879, RMSE=1.174
k=5, similarity_threshold=0.055, MAE=0.879, RMSE=1.174


KeyboardInterrupt: 

In [None]:
# Step 7: Once the model is validated, train on the entire dataset and generate predictions
# matrix, user_map, item_map = build_user_item_matrix(train_data + validation_data)
# similarity = pearson_coefficient(matrix)
# gen_preds(test_data, matrix, similarity, user_map, item_map, output)
# print (f"Predictions generated and saved to: {output}") '''