# Files

In [16]:
import pandas as pd
import ast

In [2]:
path_1u = "datasets/user_data_1.csv"
path_1g = "datasets/games_data_1.csv"
path_2u = "datasets/user_data_2.csv"

In [18]:
def safe_literal_eval(s):
    """
    Safely evaluates a string containing a Python literal using ast.literal_eval.

    Parameters:
    - s: A string to be evaluated.

    Returns:
    - The evaluated Python literal if it's valid.
    - pass if the string is not a valid literal or if it contains NaN.
    """
    try:
        # Attempt to evaluate the string as a Python literal using ast.literal_eval
        return ast.literal_eval(s)
    except (ValueError, SyntaxError):
        # Handle exceptions (e.g., if the string is not a valid literal)
        return s  # Return the original value if it's not a valid literal

# Money spent

In [7]:
# Calling dataframes from files
users_dataframe = pd.read_csv(path_1u)
games_dataframe = pd.read_csv(path_1g)

In [8]:
users_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87629 entries, 0 to 87628
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   user_id      87629 non-null  object 
 1   items_count  87629 non-null  float64
 2   item_id      87629 non-null  object 
 3   recommend    25485 non-null  object 
dtypes: float64(1), object(3)
memory usage: 2.7+ MB


In [9]:
# Converting to its corresponding data types
#users_dataframe['item_id'] = users_dataframe['item_id'].apply(safe_literal_eval)
games_dataframe = games_dataframe.applymap(safe_literal_eval)

In [29]:
print(users_dataframe['user_id'][5000])

76561198061207064


In [39]:
# Filter the DataFrame for the specific user ID
user_data = users_dataframe.loc[users_dataframe['user_id'] == str("76561198061207064")]

# Check if any rows match the user ID
if not user_data.empty:
    # Extract values with error handling for NaN
    id_item = user_data['item_id'].values[0] if not user_data['item_id'].isna().any() else 0
    id_item = safe_literal_eval(id_item)
    id_recommend = user_data['recommend'].values[0] if not user_data['recommend'].isna().any() else 0
    id_recommend = safe_literal_eval(id_recommend)
    id_items_count = user_data['items_count'].values[0] if not user_data['items_count'].isna().any() else 0
else:
    # Handle the case where no matching rows were found for the user ID
    id_item = 0
    id_recommend = 0
    id_items_count = 0

print(id_item)
print(type(id_item))
print(id_recommend)
print(type(id_recommend))
print(id_items_count)
print(type(id_items_count))


[4000, 220, 320, 340, 360, 380, 400, 420, 2820, 32470, 105600, 34830, 72850, 50300, 34330, 51100, 99900, 109600, 113420, 212200, 218230, 236390, 243870, 209870, 248570, 216150, 223750, 246280, 253710, 263540, 244770, 244850, 107410, 214950, 266430, 221100, 271590, 273110, 274920, 301520, 304030, 304050, 304930, 311310, 291480, 312990, 268420, 227940, 313980, 317470, 205790, 323370, 10, 80, 100, 240, 730, 282070, 333950, 349110, 344860, 346010, 346110, 407530, 35450, 351640, 400250, 310380, 440090, 319630, 48700]
<class 'list'>
[True, True]
<class 'list'>
71.0
<class 'numpy.float64'>


In [37]:
# Function to calculate the total price for a single "id_item" and return a report
def money_spent(user_id):

    # Calling dataframes from files
    users_dataframe = pd.read_csv(path_1u)
    games_dataframe = pd.read_csv(path_1g)

    # Filter the DataFrame for the specific user ID
    user_data = users_dataframe.loc[users_dataframe['user_id'] == str(user_id)]

    # Check if any rows match the user ID
    if not user_data.empty:
        # Extract values with error handling for NaN
        id_item = user_data['item_id'].values[0] if not user_data['item_id'].isna().any() else 0
        id_item = safe_literal_eval(id_item)
        id_recommend = user_data['recommend'].values[0] if not user_data['recommend'].isna().any() else 0
        id_recommend = safe_literal_eval(id_recommend)
        id_items_count = user_data['items_count'].values[0] if not user_data['items_count'].isna().any() else 0
    else:
        # Handle the case where no matching rows were found for the user ID
        id_item = 0
        id_recommend = 0
        id_items_count = 0
    
    # Calculating total money spent
    total_price = 0

    if type(id_item) == list and len(id_item) != 0:
        for id_value in id_item:
            matching_rows = games_dataframe[games_dataframe['id'] == id_value]
            if not matching_rows.empty:
                total_price += matching_rows['price'].sum()

    # Calculating percentage
    num_rcmnd = 0

    if type(id_recommend) == list and len(id_recommend) != 0:
        for id_rcmnd in id_recommend:
            if id_rcmnd:
                num_rcmnd += 1

    if id_items_count != 0:
        percentage = 100*(num_rcmnd / id_items_count)
    else:
        percentage = 0

    # rounding values
    total_price = round(total_price, 2)
    percentage = round(percentage, 2)

    return total_price, percentage

In [41]:
a = money_spent("76561198061207064")
print(a)
print(type(a))

(661.63, 2.82)
<class 'tuple'>


In [156]:
len(a)

542

# Dates - recommendations

In [167]:
# Calling dataframes from files
dataframe1 = pd.read_csv(path_2u)

In [168]:
dataframe1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25485 entries, 0 to 25484
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   user_id    25485 non-null  object
 1   posted     25485 non-null  object
 2   recommend  25485 non-null  object
dtypes: object(3)
memory usage: 597.4+ KB


In [171]:
from datetime import datetime

In [238]:
def valid_date_string(date_str):
    try:
        # Attempt to parse the date string
        datetime.strptime(date_str, "%Y-%m-%d")
        return True  # It's a valid date in the "yyyy-mm-dd" format
    except ValueError:
        return False  # It's not a valid date in the expected format

In [239]:
def date_in_range(dates_list, start_date, end_date):
    # Initialize a list to store the valid dates
    idx_valid_dates = []
    v_in_list = False

    if len(dates_list) != 0:
        for indx_date, date_str in enumerate(dates_list):
            # Validating date type of date_str
            valid_date = valid_date_string(date_str)
            
            # Validating if date is in rage
            if valid_date:
                # Convert the date string to a datetime object
                date_obj = datetime.strptime(date_str, "%Y-%m-%d")
                if start_date <= date_obj <= end_date:
                    idx_valid_dates.append(indx_date)
                    v_in_list = True

    return v_in_list, idx_valid_dates

In [179]:
dataframe2 = dataframe1.head(5).copy()

In [240]:
def num_user_review(dates: str) -> int:
    """Check for the number of user that made a review and the percentage of recommendations."""

    # Calling dataframes from file
    recommendations_df = pd.read_csv(path_2u)
    
    # Converting into their respective data types
    recommendations_df = recommendations_df.applymap(safe_literal_eval)

    # Split the date range into start and end dates
    start_date, end_date = dates.split()

    try:
        # Convert start_date and end_date to datetime objects
        start_date = datetime.strptime(start_date, "%Y-%m-%d")
        end_date = datetime.strptime(end_date, "%Y-%m-%d")
    except ValueError:
        raise ValueError("Invalid date format in date_range. Use 'yyyy-mm-dd' format.")
    
    # Initialize counters
    count = 0
    positive = 0
    negative = 0

    # Iterating over rows of dataframe
    for idx_row, row in recommendations_df.iterrows():
        dates_result = date_in_range(row['posted'], start_date, end_date)
        list_verf = dates_result[0]
        idx_dates = dates_result[1]
        
        # Calculating number of users
        if v_in_list:
            count += 1
        
        # Calculating recommendations
        for idx in idx_valid_dates:
            if idx < len(row["recommend"]):
                if row["recommend"][idx]:
                    positive += 1
                else:
                    negative += 1
    
    # Calculating percentage based on total recommendations made within range of dates
    if positive != 0 and negative != 0:
        total_recommendations = positive + negative
        percentage = (positive / total_recommendations) * 100
    else: 
        percentage = 0

    return count, round(percentage, 2)

In [242]:
date_range = "2011-01-01 2014-12-31"

In [None]:
num_user_review(date_range)

# Genres ranking

In [276]:
import pandas as pd

# Sample data for users_dataframe and games_dataframe
users_data = {
    'item_id': [[1, 2, 4], [2, 3], [3, 4, 5]],
    'playtime_forever': [[100, 200, 150], [300, 250], [50, 100, 75]]
}

games_data = {
    'id': [1, 2, 3],
    'genres': [['Action', 'Adventure'], ['Adventure', 'RPG'], ['Action', 'Co-op', 'Multiplayer']]
}

users_dataframe = pd.DataFrame(users_data)
games_dataframe = pd.DataFrame(games_data)

In [277]:
users_dataframe

Unnamed: 0,item_id,playtime_forever
0,"[1, 2, 4]","[100, 200, 150]"
1,"[2, 3]","[300, 250]"
2,"[3, 4, 5]","[50, 100, 75]"


In [270]:
games_dataframe

Unnamed: 0,id,genres
0,1,"[Action, Adventure]"
1,2,"[Adventure, RPG]"
2,3,[Action]


In [272]:
merged_df_1 = users_dataframe.explode('item_id')
merged_df_1

Unnamed: 0,item_id,playtime_forever
0,1,"[100, 200, 150]"
0,2,"[100, 200, 150]"
1,2,"[300, 250]"
2,3,"[50, 100, 75]"
2,4,"[50, 100, 75]"


In [274]:
# Step 1: Merge users_dataframe and games_dataframe
merged_df = merged_df_1.explode('playtime_forever').merge(games_dataframe, left_on='item_id', right_on='id', how='left')

In [246]:
merged_df

Unnamed: 0,item_id,playtime_forever,id,genres
0,1,100,1,"[Action, Adventure]"
1,1,200,1,"[Action, Adventure]"
2,1,150,1,"[Action, Adventure]"
3,2,300,2,"[Adventure, RPG]"
4,2,250,2,"[Adventure, RPG]"
5,3,50,3,[Action]
6,3,100,3,[Action]
7,3,75,3,[Action]


In [249]:
# Step 2: Explode the list of genres
merged_df = merged_df.explode('genres')

In [250]:
merged_df

Unnamed: 0,item_id,playtime_forever,id,genres
0,1,100,1,Action
0,1,100,1,Adventure
1,1,200,1,Action
1,1,200,1,Adventure
2,1,150,1,Action
2,1,150,1,Adventure
3,2,300,2,Adventure
3,2,300,2,RPG
4,2,250,2,Adventure
4,2,250,2,RPG


In [251]:
# Step 3: Group by genre and calculate total time spent
genre_ranking = merged_df.groupby('genres')['playtime_forever'].sum().reset_index()

In [252]:
genre_ranking

Unnamed: 0,genres,playtime_forever
0,Action,675
1,Adventure,1000
2,RPG,550


In [253]:
# Step 4: Create the final DataFrame for genre ranking
genre_ranking = genre_ranking.rename(columns={'genres': 'genre', 'playtime_forever': 'time_spent'})

In [254]:
genre_ranking

Unnamed: 0,genre,time_spent
0,Action,675
1,Adventure,1000
2,RPG,550


In [256]:
# Sort the genre ranking DataFrame by total time spent
genre_ranking = genre_ranking.sort_values(by='time_spent', ascending=False)

In [257]:
genre_ranking

Unnamed: 0,genre,time_spent
1,Adventure,1000
0,Action,675
2,RPG,550


In [258]:
# Reset the index
genre_ranking = genre_ranking.reset_index(drop=True)

In [259]:
genre_ranking

Unnamed: 0,genre,time_spent
0,Adventure,1000
1,Action,675
2,RPG,550


In [6]:
import pandas as pd

data = {'genre': ['Action', 'Adventure', 'RPG'],
        'user_info': [
            [{'user_id': '76561198011705037', 'time_spent': 366, 'user_url': "nan"},
 {'user_id': '76561198073356393', 'time_spent': 161, 'user_url': "nan"},
 {'user_id': 'Omega67679',
  'time_spent': 0,
  'user_url': 'http://steamcommunity.com/id/Omega67679'},
 {'user_id': '76561198047202602', 'time_spent': 473, 'user_url': "nan"},
 {'user_id': 'screwlabs', 'time_spent': 1666, 'user_url': "nan"},
 {'user_id': 'ToastedFiggin',
  'time_spent': 87,
  'user_url': 'http://steamcommunity.com/id/ToastedFiggin'},
 {'user_id': '76561198060739596', 'time_spent': 37, 'user_url': "nan"},
 {'user_id': '76561198065984534',
  'time_spent': 1439,
  'user_url': 'http://steamcommunity.com/profiles/76561198065984534'},
 {'user_id': 'Qualitybutthole',
  'time_spent': 1324,
  'user_url': 'http://steamcommunity.com/id/Qualitybutthole'},
 {'user_id': 'Nathstar', 'time_spent': 474, 'user_url': "nan"},
 {'user_id': 'drazsyker',
  'time_spent': 0,
  'user_url': 'http://steamcommunity.com/id/drazsyker'},
 {'user_id': '76561198027728967', 'time_spent': 0, 'user_url': "nan"},
 {'user_id': 'wandlustn', 'time_spent': 758, 'user_url': "nan"},
 {'user_id': 'allaboutthatchuckbass', 'time_spent': 1856, 'user_url': "nan"}],
            [{'user_id': 102, 'time_spent': 200, 'user_url': 'url2'}, {'user_id': 104, 'time_spent': 300, 'user_url': 'url4'}],
            [{'user_id': 105, 'time_spent': 250, 'user_url': 'url5'}]
        ]}

df = pd.DataFrame(data)
df

Unnamed: 0,genre,user_info
0,Action,"[{'user_id': '76561198011705037', 'time_spent'..."
1,Adventure,"[{'user_id': 102, 'time_spent': 200, 'user_url..."
2,RPG,"[{'user_id': 105, 'time_spent': 250, 'user_url..."


In [10]:
path_4 = r"S:\Courses\Data Science\0. Henry\M7 - Labs - Project MLOps _ steam\game-recommendation-system\api\datasets\user_data_4.csv"

In [11]:
# Calling dataframes from files
file_4 = pd.read_csv(path_4)

In [97]:
def top_users_in_genre(genre: str) -> list:
    """Check for the top 5 users with the most time spended in a given genre."""
    # Load data from a CSV file into a DataFrame (adjust path_2u to your file path)
    genre_users_df = pd.read_csv(path_4)

    # Check if the specified genre exists in the DataFrame
    if genre not in genre_users_df['genre'].values:
        return f"Genre '{genre}' not found in the DataFrame."
    
    # Filter the DataFrame by the specified genre
    genre_filter_df = genre_users_df[genre_users_df['genre'] == genre]

    genre_filter_df_1 = genre_filter_df.copy()
    # Replace 'nan' with None
    genre_filter_df_1['users_info'] = genre_filter_df_1['users_info'].str.replace('nan', 'None')

    # Convert the string to a list of dictionaries using ast.literal_eval
    genre_filter_df_1['users_info'] = genre_filter_df_1['users_info'].apply(lambda x: ast.literal_eval(x) if x is not None else [])

    # Sort the list of users by time_spent in descending order
    sorted_users = sorted(genre_filter_df_1['users_info'][0], key=lambda x: x['time_spent'], reverse=True)
    
    # Get the top 5 users
    top_users = sorted_users[:5]
    
    return top_users

In [95]:
def top_users_by_genre(genre: str) -> list:
    """Check for the top 5 users with the most time spended in a given genre."""
    # Load data from a CSV file into a DataFrame (adjust path_2u to your file path)
    genre_users_df = pd.read_csv(path_4)

    genre_users_df['genre'] = genre_users_df['genre'].apply(safe_literal_eval)
    
    # Check if the specified genre exists in the DataFrame
    if str(genre) not in genre_users_df['genre'].values:
        return f"Genre '{genre}' not found in the DataFrame."
    
    # Filter the DataFrame by the specified genre
    genre_filter_df = genre_users_df[genre_users_df['genre'] == str(genre)]

    genre_filter_list = genre_filter_df["users_info"][0]

    # Replace 'nan' with None
    genre_filter_list = genre_filter_list.replace('nan', 'None')

    # Convert the string to a list of dictionaries using ast.literal_eval
    genre_filter_list_eval = safe_literal_eval(genre_filter_list)

    # Sort the list of users by time_spent in descending order
    sorted_users = sorted(genre_filter_list_eval, key=lambda x: x['time_spent'], reverse=True)
    
    # Get the top 5 users
    top_users = sorted_users[:5]
    
    return top_users

In [99]:
# Example usage:
genre = 'Action'
top_action_users = top_users_by_genre(genre)

In [100]:
top_action_users

[{'user_id': '76561198013535175',
  'time_spent': 187287,
  'user_url': 'http://steamcommunity.com/profiles/76561198013535175'},
 {'user_id': '76561198053666288', 'time_spent': 140609, 'user_url': None},
 {'user_id': 'lalup',
  'time_spent': 117496,
  'user_url': 'http://steamcommunity.com/id/lalup'},
 {'user_id': 'Adrian_31',
  'time_spent': 112132,
  'user_url': 'http://steamcommunity.com/id/Adrian_31'},
 {'user_id': '76561198018025756', 'time_spent': 111010, 'user_url': None}]