In [5]:
import os
os.environ["MKL_NUM_THREADS"] = "1"

from threadpoolctl import threadpool_limits
threadpool_limits(1, "blas")  # Limits BLAS to a single thread

<threadpoolctl.threadpool_limits at 0x26da486acc0>

In [6]:
def reduce_memory(df):
    for col in df.columns:
        if df[col].dtype == 'float64':
            df[col] = df[col].astype('float32')
        if df[col].dtype == 'int64':
            df[col] = df[col].astype('int32')
    return df

In [7]:
import pandas as pd
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
import json

#Step 1: Read csv files
#WIN
games_df = reduce_memory(pd.read_csv(r'C:\Users\richa\Documents\UCI\CS 271P\games.csv'))
rec_df = reduce_memory(pd.read_csv(r'C:\Users\richa\Documents\UCI\CS 271P\recommendations.csv'))
with open(r'C:\Users\richa\Documents\UCI\CS 271P\games_metadata.json', encoding="utf-8") as file:
     metadata = [json.loads(line) for line in file]
metadata_df = pd.DataFrame(metadata)

games_df = games_df.head(10000)
rec_df = rec_df[rec_df['app_id'].isin(games_df['app_id'])]

games_df = games_df[games_df['app_id'].isin(rec_df['app_id'])] #to ensure all games exist in rec_df
metadata_df = metadata_df[metadata_df['app_id'].isin(rec_df['app_id'])]

#MACOS
# games_df = pd.read_csv('/Users/richardlw/Documents/UCI/CS 271P/games-rec-system/test_dataset/games_df_test.csv')
# rec_df = pd.read_csv('/Users/richardlw/Documents/UCI/CS 271P/games-rec-system/test_dataset/rec_df_test.csv')
# metadata_df = pd.read_csv('/Users/richardlw/Documents/UCI/CS 271P/games-rec-system/test_dataset/metadata_df_test.csv')



In [8]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Step 2: Filter Active Users and Games
# Keep users with >100 reviews
user_activity = rec_df['user_id'].value_counts()

filtered_users = user_activity[user_activity > 100].index

filtered_rec_df = rec_df[
    rec_df['user_id'].isin(filtered_users)
]

filtered_rec_df = filtered_rec_df.copy()
filtered_rec_df['hours'] = np.log1p(filtered_rec_df['hours'])

# Split the data into train and test sets
train_fraction = 0.8
train_df = filtered_rec_df.sample(frac=train_fraction, random_state=42)
test_df = filtered_rec_df.drop(train_df.index)

test_df = test_df[test_df['app_id'].isin(train_df['app_id'])]

# Step 3: Create the User-Item Interaction Matrix
# Use hours played as confidence scores
interaction_sparse = csr_matrix(
    (
        train_df['hours'],  # Confidence score
        (
            train_df['user_id'].astype('category').cat.codes,
            train_df['app_id'].astype('category').cat.codes
        )
    )
)

# Create mappings for user and game IDs
user_mapping = dict(enumerate(train_df['user_id'].astype('category').cat.categories))
game_mapping = dict(enumerate(train_df['app_id'].astype('category').cat.categories))
reverse_user_mapping = {v: k for k, v in user_mapping.items()}
reverse_game_mapping = {v: k for k, v in game_mapping.items()}


In [9]:
# Step 4: Train the ALS Model
# Initialize ALS model with hyperparameters
als_model = AlternatingLeastSquares(factors=100, regularization=0.05, iterations=20, use_gpu=False)

# Fit the model to the interaction data
als_model.fit(interaction_sparse)

  0%|          | 0/20 [00:00<?, ?it/s]

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Step 5: Compute Metadata Similarity
# Convert tags and description to strings and combine
#metadata_df = metadata_df[metadata_df['app_id'].isin(train_df['app_id'])]

metadata_df['tags'] = metadata_df['tags'].fillna('')
metadata_df['description'] = metadata_df['description'].fillna('')

metadata_df['tags'] = metadata_df['tags'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))
metadata_df['description'] = metadata_df['description'].fillna('').astype(str)
metadata_df['combined_metadata'] = metadata_df['tags'] + ' ' + metadata_df['description']

# Use TF-IDF to compute metadata similarities
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf.fit_transform(metadata_df['combined_metadata'])

# Compute cosine similarity between items
metadata_similarity = cosine_similarity(tfidf_matrix)

# Normalize similarity matrix
scaler = MinMaxScaler()
metadata_similarity_normalized = scaler.fit_transform(metadata_similarity)

In [11]:
# Step 6: Create function for hybrid recommendation system
def hybrid_recommendations(user_id, als_model, metadata_similarity, alpha=0.5, N=10):
    # Check if user exists in training data
    if user_id not in user_mapping.values():
        return []

    user_index = reverse_user_mapping[user_id]

    # Generate ALS recommendations
    recommended = als_model.recommend(
        userid=user_id,
        user_items=interaction_sparse[user_index],
        N=N,
        filter_already_liked_items=True,
        recalculate_user=True
    )

    # Extract ALS recommendations and scores
    als_recommendations = recommended[0]  # Indices of recommended items
    als_scores = recommended[1]  # Corresponding ALS scores

    # Compute metadata similarity scores for recommended items
    hybrid_scores = []

    for idx, als_item in enumerate(als_recommendations):
        metadata_scores = metadata_similarity[als_item, als_recommendations]
        hybrid_score = alpha * als_scores[idx] + (1 - alpha) * metadata_scores.mean()
        hybrid_scores.append(hybrid_score)

    # Combine and sort by hybrid score
    hybrid_recommendations = [
        game_mapping[item]
        for item, _ in sorted(zip(als_recommendations, hybrid_scores), key=lambda x: x[1], reverse=True)
    ]

    # Map app IDs to titles using games_df
    app_titles = games_df.set_index('app_id').loc[hybrid_recommendations, 'title'].tolist()

    return app_titles[:N]

# Step 7: Generate Hybrid Recommendations for a Sample User
sample_user_id = train_df['user_id'].iloc[0]  # Replace with actual user ID
alpha = 0.6  # Weight for ALS
N = 10  # Number of recommendations
recommendations = hybrid_recommendations(sample_user_id, als_model, metadata_similarity_normalized, alpha, N)

print(f"Game recommendations for User {sample_user_id}:\n" + "\n".join(recommendations))

Game recommendations for User 5136021:
Two Worlds Epic Edition
How to Survive 2
Ninja Stealth
Space Pilgrim Episode III: Delta Pavonis
Razortron 2000
BADLAND: Game of the Year Edition
APB Reloaded
Urban Trial Freestyle
Shower With Your Dad Simulator 2015: Do You Still Shower With Your Dad
Zombie Army Trilogy


In [13]:
# Function to evaluate hit ratio for the hybrid recommendation system
# Revised evaluation function
def evaluate_hit_ratio_hybrid(test_df, N=10, alpha=0.6):
    title_to_app_id = games_df.set_index('title')['app_id'].to_dict()

    users_test = test_df['user_id'].unique()  # Unique users in the test set
    hit_count = 0

    for user_id in users_test:
        # Generate hybrid recommendations using the hybrid_recommendations function
        recommended_games = hybrid_recommendations(user_id, als_model, metadata_similarity_normalized, alpha, N)

        recommended_games = [title_to_app_id[title] for title in recommended_games if title in title_to_app_id]

        # Get the user's actual test interactions from test_df
        user_test_games = test_df[test_df['user_id'] == user_id]['app_id'].values

        # Check if any recommended game matches the user's test games
        if any(game in user_test_games for game in recommended_games):
            hit_count += 1

    # Calculate Hit Ratio
    hit_ratio = hit_count / len(users_test)
    return hit_ratio


# Evaluate Hit Ratio for the hybrid recommendation system
hit_ratio_hybrid = evaluate_hit_ratio_hybrid(test_df, N=10, alpha=0.8)
print(f"Hit Ratio (Hybrid): {hit_ratio_hybrid:.4f}")


Hit Ratio (Hybrid): 0.7930
