In [1]:
import pandas as pd
import numpy as np
import math
from datetime import datetime
from scipy.sparse import coo_matrix, csr_matrix, csc_matrix

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

In [11]:
users = pd.read_csv('users.csv')
games = pd.read_csv('games.csv')
recs = pd.read_csv('recommendations.csv')

#remove users with less than 2 reviews
#users = users[users['reviews'] > 1]
users.info

<bound method DataFrame.info of            user_id  products  reviews
0          7360263       359        0
1         14020781       156        1
2          8762579       329        4
3          4820647       176        4
4          5167327        98        2
...            ...       ...      ...
14306059   5047430         6        0
14306060   5048153         0        0
14306061   5059205        31        0
14306062   5074363         0        0
14306063   5081164         0        0

[14306064 rows x 3 columns]>

In [3]:
#games = games.drop(columns=['price_original', 'discount', 'steam_deck'], axis=1)
#recs = recs.drop(columns=['helpful', 'funny', 'date', 'review_id'], axis=1)



In [4]:
games.head()

Unnamed: 0,app_id,title,date_release,win,mac,linux,rating,positive_ratio,user_reviews,price_final
0,13500,Prince of Persia: Warrior Within™,2008-11-21,True,False,False,Very Positive,84,2199,9.99
1,22364,BRINK: Agents of Change,2011-08-03,True,False,False,Positive,85,21,2.99
2,113020,Monaco: What's Yours Is Mine,2013-04-24,True,True,True,Very Positive,92,3722,14.99
3,226560,Escape Dead Island,2014-11-18,True,False,False,Mixed,61,873,14.99
4,249050,Dungeon of the ENDLESS™,2014-10-27,True,True,False,Very Positive,88,8784,11.99


In [5]:
# Create user-game recommendation matrix
def create_usergame_rec_df(users, games, recs, sample_size=10000):
    #get samples instead of processing whole data for speed
    recs_samples = recs.sample(n=sample_size, random_state=42069)

    #create user-game recommendation matrix
    ugr = recs_samples.pivot(
            index='user_id',
            columns='app_id',
            values='is_recommended'
            ).fillna(0)
    
    ugr = ugr.astype(np.int8)
    return ugr
    
# Create user-game playtime matrix

ugr=create_usergame_rec_df(users, games, recs)

sparse_ugr = csr_matrix(ugr.values)
print("Shape:",sparse_ugr.shape)
print("Non-zero elems", sparse_ugr.nnz)

Shape: (9967, 2940)
Non-zero elems 8549


In [6]:
recs.head()

Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,review_id
0,975370,0,0,2022-12-12,1,36.3,51580,0
1,304390,4,0,2017-02-17,0,11.5,2586,1
2,1085660,2,0,2019-11-17,1,336.5,253880,2
3,703080,0,0,2022-09-23,1,27.4,259432,3
4,526870,0,0,2021-01-10,1,7.9,23869,4


In [7]:
class GameRecommender:
    def __init__(self, interactions_matrix, game_df):
        self.interactions_matrix = interactions_matrix
        self.game_df = game_df
        self.model = NearestNeighbors(metric='cosine', algorithm='brute')
        self.model.fit(interactions_matrix)
    
    #collaborative filtering function to return top 10 nearest neighbours 
    #of users with similar game recommendations
    def get_recommendations(self, user_idx, n_recommendations=5):
        distances, indices = self.model.kneighbors(
            self.interactions_matrix[user_idx].reshape(1, -1),
            n_neighbors=n_recommendations+1
        )
        
        similar_users = indices.flatten()[1:]
        recommended_games = []
        
        for user in similar_users:
            user_games = self.interactions_matrix[user].nonzero()[1]
            for game in user_games:
                if self.interactions_matrix[user_idx, game] == 0:  # User hasn't interacted with this game
                    game_info = self.game_df[self.game_df.index == game].iloc[0]
                    recommended_games.append({
                        'title': game_info['title'],
                        'rating': game_info['rating'],
                        'price': game_info['price_final']
                    })
        
        return recommended_games[:n_recommendations]




In [8]:
class ImprovedGameRecommender:
    def __init__(self, interactions_matrix, game_df):
        self.interactions_matrix = interactions_matrix
        self.game_df = game_df
        
        # Create game mappings based on the columns in interactions_matrix
        self.game_indices = dict(enumerate(range(interactions_matrix.shape[1])))
        self.app_id_to_index = {v: k for k, v in self.game_indices.items()}
        
        # Compute game popularity (normalized)
        self.game_popularity = np.array(interactions_matrix.sum(axis=0)).flatten()
        self.game_popularity = (self.game_popularity - self.game_popularity.min()) / (
            self.game_popularity.max() - self.game_popularity.min() + 1e-10)
        
        # Use user-user similarity instead of item-item
        self.model = NearestNeighbors(metric='cosine', algorithm='brute')
        self.model.fit(interactions_matrix)
        
        print(f"Recommender initialized with {interactions_matrix.shape[0]} users and {interactions_matrix.shape[1]} games")
    
    def get_recommendations(self, user_idx, n_recommendations=5):
        user_profile = self.interactions_matrix[user_idx]
        
        if isinstance(user_profile, np.ndarray):
            user_profile = csr_matrix(user_profile)
        
        # Find similar users
        distances, indices = self.model.kneighbors(
            user_profile,
            n_neighbors=min(50, self.interactions_matrix.shape[0])
        )
        
        # Get games the user hasn't interacted with
        user_games = set(user_profile.nonzero()[1])
        all_games = set(range(self.interactions_matrix.shape[1]))
        candidate_games = list(all_games - user_games)
        
        if not candidate_games:
            return []
        
        # Calculate scores for candidate games
        scores = np.zeros(len(candidate_games))
        similar_users = indices[0][1:]  # Exclude the user themselves
        similar_distances = distances[0][1:]
        
        for i, game_idx in enumerate(candidate_games):
            # Weight recommendations by similarity
            game_ratings = self.interactions_matrix[similar_users, game_idx].toarray().flatten()
            if game_ratings.sum() > 0:
                similarity_score = np.dot(1 - similar_distances, game_ratings)
                popularity_score = self.game_popularity[game_idx]
                scores[i] = (0.7 * similarity_score) + (0.3 * popularity_score)
            else:
                scores[i] = 0.3 * self.game_popularity[game_idx]
        
        # Get top recommendations
        if scores.max() == 0:
            # If no scores, recommend popular games
            scores = self.game_popularity[candidate_games]
            
        top_game_indices = np.array(candidate_games)[np.argsort(-scores)[:n_recommendations]]
        
        recommendations = []
        for game_idx in top_game_indices:
            game_info = self.game_df.iloc[game_idx]
            recommendations.append({
                'app_id': game_info['app_id'],
                'title': game_info['title'],
                'rating': game_info['rating'],
                'price': game_info['price_final'],
                'positive_ratio': game_info['positive_ratio'],
                'user_reviews': game_info['user_reviews'],
                'score': float(scores[list(candidate_games).index(game_idx)])
            })
        
        return recommendations

def prepare_improved_recommendation_data(user_df, game_df, recommendation_df, sample_size=50000):
    """Prepare the recommendation data with better filtering"""
    print("Sampling and preparing data...")
    
    # Sample recommendations
    recommendation_sample = recommendation_df.sample(n=sample_size, random_state=42)
    print(f"Initial sample size: {len(recommendation_sample)}")
    
    # Convert to integer and add positive/negative weight
    recommendation_sample['is_recommended'] = recommendation_sample['is_recommended'].astype(int)
    
    # Filter users and games
    user_interactions = recommendation_sample['user_id'].value_counts()
    game_interactions = recommendation_sample['app_id'].value_counts()
    
    min_user_interactions = 2
    min_game_interactions = 2
    
    valid_users = user_interactions[user_interactions >= min_user_interactions].index
    valid_games = game_interactions[game_interactions >= min_game_interactions].index
    
    filtered_recommendations = recommendation_sample[
        (recommendation_sample['user_id'].isin(valid_users)) &
        (recommendation_sample['app_id'].isin(valid_games))
    ]
    
    print(f"Filtered recommendations: {len(filtered_recommendations)}")
    print(f"Unique users: {len(valid_users)}")
    print(f"Unique games: {len(valid_games)}")
    
    # Create interaction matrix
    print("Creating interaction matrix...")
    interactions = filtered_recommendations.pivot(
        index='user_id',
        columns='app_id',
        values='is_recommended'
    ).fillna(0)
    
    return interactions.astype(np.int8)

def display_recommendations(user_id, recommender, interactions):
    """Display recommendations with improved formatting and error handling"""
    try:
        user_idx = interactions.index.get_loc(user_id)
        recommendations = recommender.get_recommendations(user_idx)
        
        if not recommendations:
            return "No recommendations found for this user."
        
        df = pd.DataFrame(recommendations)
        df = df[['title', 'rating', 'price', 'positive_ratio', 'user_reviews', 'score']]
        df['positive_ratio'] = df['positive_ratio'].apply(lambda x: f"{x:.1%}")
        df['score'] = df['score'].apply(lambda x: f"{x:.3f}")
        df['price'] = df['price'].apply(lambda x: f"${x:.2f}")
        return df
    
    except KeyError:
        return "User not found in the sample dataset"
    except Exception as e:
        return f"Error generating recommendations: {str(e)}"


In [9]:

# Create interaction matrix
print("Preparing recommendation data...")
interactions = prepare_improved_recommendation_data(users, games, recs)
interactions_sparse = csr_matrix(interactions.values)

print("\nInitializing recommender...")
recommender = ImprovedGameRecommender(interactions_sparse, games)

# Test recommendations
print("\nTesting recommendations...")
sample_user_ids = list(interactions.index[:5])
for user_id in sample_user_ids:
    print(f"\nRecommendations for user {user_id}:")
    recommendations_df = display_recommendations(user_id, recommender, interactions)
    print(recommendations_df)

# Print system statistics
print("\nSystem Statistics:")
print(f"Number of users in sample: {interactions.shape[0]}")
print(f"Number of games in sample: {interactions.shape[1]}")
print(f"Matrix sparsity: {100 * (1 - interactions_sparse.nnz / (interactions.shape[0] * interactions.shape[1])):.2f}%")
print(f"Average interactions per user: {interactions_sparse.sum() / interactions.shape[0]:.2f}")

# Analyze recommendation diversity
def analyze_recommendations(recommender, interactions, n_users=100):
    print("\nAnalyzing recommendation diversity...")
    all_recommended_games = []
    unique_ratings = set()
    prices = []
    
    sample_users = np.random.choice(interactions.shape[0], min(n_users, interactions.shape[0]), replace=False)
    
    for user_idx in sample_users:
        recommendations = recommender.get_recommendations(user_idx)
        for rec in recommendations:
            all_recommended_games.append(rec['title'])
            unique_ratings.add(rec['rating'])
            prices.append(rec['price'])
    
    print(f"\nUnique games recommended: {len(set(all_recommended_games))}")
    print(f"Unique ratings in recommendations: {len(unique_ratings)}")
    print(f"Average recommended game price: ${np.mean(prices):.2f}")
    print(f"Price range: ${min(prices):.2f} - ${max(prices):.2f}")

# Run diversity analysis
analyze_recommendations(recommender, interactions)

Preparing recommendation data...
Sampling and preparing data...
Initial sample size: 50000
Filtered recommendations: 806
Unique users: 508
Unique games: 3511
Creating interaction matrix...

Initializing recommender...
Recommender initialized with 454 users and 643 games

Testing recommendations...

Recommendations for user 7744:
                                          title         rating   price  \
0      World of Tanks — Rugged Mountaineer Pack       Positive  $49.99   
1                                    Magitek VR       Positive  $14.99   
2  STCC The Game 2 – Expansion Pack for RACE 07       Positive   $3.99   
3                                     Macbat 64  Very Positive   $1.99   
4         Borderlands 3: Gold Weapon Skins Pack          Mixed   $4.99   

  positive_ratio  user_reviews  score  
0        8300.0%            37  0.500  
1        8700.0%            16  0.425  
2        9200.0%            13  0.300  
3        9100.0%           282  0.300  
4        5100.0%        

In [10]:
# Initialize recommender
recommender = GameRecommender(sparse_ugr, games)
yes = recommender.get_recommendations(0, 10)
yes


[{'title': 'Inquisitor', 'rating': 'Mostly Positive', 'price': 9.99},
 {'title': 'Beckett', 'rating': 'Very Positive', 'price': 3.49},
 {'title': 'Gigapocalypse', 'rating': 'Very Positive', 'price': 9.99},
 {'title': 'Far Cry® New Dawn', 'rating': 'Mostly Positive', 'price': 39.99},
 {'title': 'CounterAttack', 'rating': 'Very Positive', 'price': 14.99},
 {'title': 'PAYDAY 2: Golden Dagger Tailor Pack',
  'rating': 'Very Positive',
  'price': 2.99},
 {'title': 'Clash of Chefs VR', 'rating': 'Mostly Positive', 'price': 19.99}]

Extract simple exploratory data.

Such as reviews per day, game releases per day.

Plot min max avg 

Games by platform

Rating distribution

Game playtime, price, 