In [1]:

import pandas as pd
from surprise import Dataset, NormalPredictor, Reader, SVD, accuracy
from surprise.model_selection import cross_validate
import numpy as np
from sklearn.linear_model import LogisticRegression


from train_valid_test_loader import load_train_valid_test_datasets

# Load the train/validation/test splits the same way the main problem does.
(train_tuple, valid_tuple, test_tuple,
 n_users, n_items) = load_train_valid_test_datasets()

# Side-information tables read from the MovieLens 100k data directory.
users_df = pd.read_csv('../data_movie_lens_100k/user_info.csv')
movies_df = pd.read_csv('../data_movie_lens_100k/movie_info.csv')

In [2]:
def tuple_to_surprise_dataset(tupl):
    """
    Convert a data subset given in tuple form into a `surprise` dataset.

    Args:
        tupl: indexable whose first three entries are used as the user ids,
            the item ids and the ratings, in that order.

    Returns:
        The object produced by `Dataset.load_from_df` for those three columns,
        read with a rating scale of (1, 5).
    """
    column_order = ["userID", "itemID", "rating"]

    # One DataFrame column per tuple entry; `load_from_df` expects the columns
    # in the order user id, item id, rating.
    frame = pd.DataFrame(dict(zip(column_order, (tupl[0], tupl[1], tupl[2]))))

    # A Reader is still required, but only its rating_scale is needed here.
    scale_reader = Reader(rating_scale=(1, 5))

    return Dataset.load_from_df(frame[column_order], scale_reader)

In [3]:
# Build the SVD model on the training split.
# The seed is pinned so the learned latent factors (and every feature and
# prediction derived from them) are the same after a kernel restart; without
# it SVD initialises its factors randomly on each run.
trainset = tuple_to_surprise_dataset(train_tuple).build_full_trainset()
algo = SVD(random_state=42)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x119c7bf10>

In [4]:
# Shapes of the fitted model's pu and qi factor matrices, one per line.
print(algo.pu.shape, algo.qi.shape, sep="\n")

(943, 100)
(1626, 100)


In [5]:
# "pseudocode", courtesy of Liping
def run_test(pair):
    """
    Build the feature vector for one (user id, item id) pair.

    The vector is the concatenation of, in this order:
      * the user's latent factors from `algo.pu`,
      * the item's latent factors from `algo.qi`,
      * the user's `age` and `is_male` values from `users_df`,
      * the item's `release_year` value from `movies_df`.

    A user or item id that `trainset` cannot map to an inner id (the lookup
    raises ValueError) gets an all-zero latent vector instead.

    Reads the notebook-level names `algo`, `trainset`, `users_df` and
    `movies_df`.

    Args:
        pair: two-element sequence `[user_id, item_id]`.

    Returns:
        1-D numpy array of length
        `algo.pu.shape[1] + algo.qi.shape[1] + 3`.

    Raises:
        IndexError: if the user id has no row in `users_df` or the item id has
            no row in `movies_df` (`.iloc[0]` on an empty selection).
    """
    user_id, item_id = pair

    # Fallback width comes from the fitted factor matrices instead of a
    # hard-coded 100, so the vector length stays right if n_factors changes.
    try:
        u_features = algo.pu[trainset.to_inner_uid(user_id)]
    except ValueError:
        u_features = np.zeros(algo.pu.shape[1])

    try:
        i_features = algo.qi[trainset.to_inner_iid(item_id)]
    except ValueError:
        i_features = np.zeros(algo.qi.shape[1])

    user_data = users_df.loc[users_df['user_id'] == user_id]
    user_metadata = np.array([user_data['age'].iloc[0], user_data['is_male'].iloc[0]])
    item_metadata = np.array([movies_df.loc[movies_df['item_id'] == item_id, 'release_year'].iloc[0]])

    return np.concatenate([u_features, i_features, user_metadata, item_metadata])

In [6]:
train_df = pd.read_csv('../data_movie_lens_100k/ratings_all_development_set.csv')
# Convert DataFrame to list of tuples
train_pairs = list(train_df.itertuples(index=False, name=None))

# Split every row into its [user_id, item_id] pair and its rating.
# NOTE(review): assumes the first three CSV columns are user id, item id and
# rating, in that order - confirm against the file header.
new_train_pairs = [[row[0], row[1]] for row in train_pairs]
ratings = [row[2] for row in train_pairs]

# One feature row per pair. The matrix is sized from the data itself rather
# than a hard-coded (89992, 203): a different row count or feature width can
# then no longer raise IndexError or leave uninitialised rows behind.
total_df = np.vstack([run_test(pair) for pair in new_train_pairs])

# Binary target: True when the rating is greater than 4.
y_vals = [r > 4 for r in ratings]

In [7]:
# clf = LogisticRegression(max_iter=1000)
# clf.fit(total_df, y_vals)

In [8]:
# from sklearn.ensemble import RandomForestClassifier

# # Assuming total_df is your feature matrix and y_vals are your labels
# clf = RandomForestClassifier(n_estimators=100, random_state=42)
# clf.fit(total_df, y_vals)

In [9]:
from sklearn.ensemble import GradientBoostingClassifier

# total_df is the feature matrix and y_vals holds the binary labels
# (True when the rating is greater than 4, False otherwise).

# Set up the gradient boosting classifier with a fixed seed.
clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)

# .53 accuracy
# NOTE(review): the figure above is the original author's note; which
# configuration and data split it refers to is not shown here.

# Train the classifier on the feature matrix and labels.
clf.fit(total_df, y_vals)



In [10]:
# Read the masked leaderboard set using pandas. The DataFrame gets its own
# name so that `test_df` only ever refers to the feature matrix built below.
leaderboard_df = pd.read_csv('../data_movie_lens_100k/ratings_masked_leaderboard_set.csv')
# Convert DataFrame to list of tuples
test_pairs = list(leaderboard_df.itertuples(index=False, name=None))

# Keep only the [user_id, item_id] part of every row.
# NOTE(review): assumes the first two CSV columns are user id and item id -
# confirm against the file header.
new_test_pairs = [[row[0], row[1]] for row in test_pairs]

# One feature row per pair, sized from the data instead of a hard-coded
# (10000, 203) so a different row count or feature width cannot raise
# IndexError or leave uninitialised rows in the matrix.
test_df = np.vstack([run_test(pair) for pair in new_test_pairs])

In [11]:
# Run the classifier `clf` on the feature matrix `test_df`; `total` holds the
# predictions that are written to disk afterwards.
total = clf.predict(test_df)

In [12]:
# Write the predictions in `total` to a text file in the working directory,
# formatting each value as an integer ('%d').
np.savetxt('predicted_ratings_leaderboard.txt', total, fmt='%d', delimiter=',')