In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
movies_data = load_dataset("ashraq/movielens_ratings", split="train")
games_data = load_dataset("FronkonGames/steam-games-dataset", split="train")

In [4]:
movie_lens_df_master = pd.DataFrame(movies_data)
steam_games_df_master = pd.DataFrame(games_data)

In [5]:
movie_lens_df = movie_lens_df_master.copy()
steam_games_df = steam_games_df_master.copy()

In [6]:
print("MovieLens Dataset Preview:")
print(movie_lens_df.head())

MovieLens Dataset Preview:
      imdbId  tmdbId  movie_id  user_id  rating  \
0  tt2096673  150540      2307    11923     3.5   
1  tt5160928  393732      7157    32503     3.0   
2  tt3498820  271110      1398    20241     3.0   
3  tt3040964  278927      2625    42975     5.0   
4  tt5308322  440021      8457    31336     3.0   

                               title  \
0                  Inside Out (2015)   
1                 Mean Dreams (2017)   
2  Captain America: Civil War (2016)   
3             The Jungle Book (2016)   
4             Happy Death Day (2017)   

                                              genres  \
0  Adventure|Animation|Children|Comedy|Drama|Fantasy   
1                                           Thriller   
2                             Action|Sci-Fi|Thriller   
3                            Adventure|Drama|Fantasy   
4                            Horror|Mystery|Thriller   

                                             posters  
0  https://m.media-amazon.com/ima

In [7]:
print("Steam Games Dataset Preview:")
print(steam_games_df.head())

Steam Games Dataset Preview:
     AppID                   Name  Release date Estimated owners  Peak CCU  \
0    20200       Galactic Bowling  Oct 21, 2008        0 - 20000         0   
1   655370           Train Bandit  Oct 12, 2017        0 - 20000         0   
2  1732930           Jolt Project  Nov 17, 2021        0 - 20000         0   
3  1355720               Henosis™  Jul 23, 2020        0 - 20000         0   
4  1139950  Two Weeks in Painland   Feb 3, 2020        0 - 20000         0   

   Required age  Price  DLC count  \
0             0  19.99          0   
1             0   0.99          0   
2             0   4.99          0   
3             0   5.99          0   
4             0   0.00          0   

                                      About the game  \
0  Galactic Bowling is an exaggerated and stylize...   
1  THE LAW!! Looks to be a showdown atop a train....   
2  Jolt Project: The army now has a new robotics ...   
3  HENOSIS™ is a mysterious 2D Platform Puzzler w...   

In [20]:
movie_lens_df['year'] = movie_lens_df['title'].str.extract(r'\((\d{4})\)')
default_year = 0
movie_lens_df['year'] = movie_lens_df['year'].fillna(default_year)
movie_lens_df['title'] = movie_lens_df['title'].str.replace(r'\s*\(\d{4}\)', '', regex=True)

In [21]:
movie_lens_df.head()

Unnamed: 0,imdbId,tmdbId,movie_id,user_id,rating,title,genres,posters,year
0,tt2096673,150540,2307,11923,3.5,Inside Out,Adventure|Animation|Children|Comedy|Drama|Fantasy,https://m.media-amazon.com/images/M/MV5BOTgxMD...,2015
1,tt5160928,393732,7157,32503,3.0,Mean Dreams,Thriller,https://m.media-amazon.com/images/M/MV5BMDM2OD...,2017
2,tt3498820,271110,1398,20241,3.0,Captain America: Civil War,Action|Sci-Fi|Thriller,https://m.media-amazon.com/images/M/MV5BMjQ0MT...,2016
3,tt3040964,278927,2625,42975,5.0,The Jungle Book,Adventure|Drama|Fantasy,https://m.media-amazon.com/images/M/MV5BMTc3NT...,2016
4,tt5308322,440021,8457,31336,3.0,Happy Death Day,Horror|Mystery|Thriller,https://m.media-amazon.com/images/M/MV5BYzZhY2...,2017


In [22]:
steam_games_df.columns

Index(['Name', 'Release date', 'Price', 'Genres', 'Tags', 'Reviews',
       'User score', 'Movies', 'Metacritic score', 'Categories',
       'About the game'],
      dtype='object')

In [23]:
steam_games_df.head()

Unnamed: 0,Name,Release date,Price,Genres,Tags,Reviews,User score,Movies,Metacritic score,Categories,About the game
0,Galactic Bowling,"Oct 21, 2008",19.99,"Casual,Indie,Sports","Indie,Casual,Sports,Bowling",,0,http://cdn.akamai.steamstatic.com/steam/apps/2...,0,"Single-player,Multi-player,Steam Achievements,...",Galactic Bowling is an exaggerated and stylize...
1,Train Bandit,"Oct 12, 2017",0.99,"Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",,0,http://cdn.akamai.steamstatic.com/steam/apps/2...,0,"Single-player,Steam Achievements,Full controll...",THE LAW!! Looks to be a showdown atop a train....
2,Jolt Project,"Nov 17, 2021",4.99,"Action,Adventure,Indie,Strategy",,,0,http://cdn.akamai.steamstatic.com/steam/apps/2...,0,Single-player,Jolt Project: The army now has a new robotics ...
3,Henosis™,"Jul 23, 2020",5.99,"Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz...",,0,http://cdn.akamai.steamstatic.com/steam/apps/2...,0,"Single-player,Full controller support",HENOSIS™ is a mysterious 2D Platform Puzzler w...
4,Two Weeks in Painland,"Feb 3, 2020",0.0,"Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,...",,0,http://cdn.akamai.steamstatic.com/steam/apps/2...,0,"Single-player,Steam Achievements",ABOUT THE GAME Play as a hacker who has arrang...


In [24]:
games_relevent_columns = [
    'Name', 'Release date', 'Price', 'Genres', 'Tags', 'Reviews', 'User score', 
    'Movies', 'Metacritic score', 'Categories', 'About the game'
]
steam_games_df = steam_games_df[games_relevent_columns]

In [27]:
steam_games_df

Unnamed: 0,Name,Release date,Price,Genres,Tags,Reviews,User score,Movies,Metacritic score,Categories,About the game
0,Galactic Bowling,"Oct 21, 2008",19.99,"Casual,Indie,Sports","Indie,Casual,Sports,Bowling",,0,http://cdn.akamai.steamstatic.com/steam/apps/2...,0,"Single-player,Multi-player,Steam Achievements,...",Galactic Bowling is an exaggerated and stylize...
1,Train Bandit,"Oct 12, 2017",0.99,"Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",,0,http://cdn.akamai.steamstatic.com/steam/apps/2...,0,"Single-player,Steam Achievements,Full controll...",THE LAW!! Looks to be a showdown atop a train....
2,Jolt Project,"Nov 17, 2021",4.99,"Action,Adventure,Indie,Strategy",,,0,http://cdn.akamai.steamstatic.com/steam/apps/2...,0,Single-player,Jolt Project: The army now has a new robotics ...
3,Henosis™,"Jul 23, 2020",5.99,"Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz...",,0,http://cdn.akamai.steamstatic.com/steam/apps/2...,0,"Single-player,Full controller support",HENOSIS™ is a mysterious 2D Platform Puzzler w...
4,Two Weeks in Painland,"Feb 3, 2020",0.00,"Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,...",,0,http://cdn.akamai.steamstatic.com/steam/apps/2...,0,"Single-player,Steam Achievements",ABOUT THE GAME Play as a hacker who has arrang...
...,...,...,...,...,...,...,...,...,...,...,...
83555,Sex Simulator - Office Promotion,"Sep 16, 2023",3.99,"Adventure,Casual,Indie",,,0,http://cdn.akamai.steamstatic.com/steam/apps/2...,0,"Single-player,Steam Achievements,Steam Leaderb...",ABOUT THE GAME A sex simulation game with visu...
83556,UFindO,"Aug 21, 2023",0.49,"Casual,Indie","Casual,Puzzle,Hidden Object,Creature Collector...",,0,http://cdn.akamai.steamstatic.com/steam/apps/2...,0,"Single-player,Steam Achievements",Are you ready to find the difference? Then qui...
83557,UNDRESS!,"Oct 23, 2023",1.99,Casual,,,0,http://cdn.akamai.steamstatic.com/steam/apps/2...,0,"Single-player,Steam Achievements",UNDRESS! - is an exciting game in which the pl...
83558,Oculant,"Nov 22, 2023",4.99,"Action,Adventure,Casual",,,0,http://cdn.akamai.steamstatic.com/steam/apps/2...,0,Single-player,Story: The world is shrouded with the eclipse ...


In [28]:
steam_games_df['Release Year'] = pd.to_datetime(steam_games_df['Release date'], errors='coerce').dt.year

In [30]:
steam_games_df['Release date'] = steam_games_df['Release date'].apply(
    lambda x: '01 ' + x if len(x.split()) == 2 else x
)

# Now, convert to datetime and extract the year
steam_games_df['Release Year'] = pd.to_datetime(steam_games_df['Release date'], errors='coerce').dt.year

In [33]:
steam_games_df.isna().sum()

Name                    6
Release date            0
Price                   0
Genres               3425
Tags                19986
Reviews             73844
User score              0
Movies               6300
Metacritic score        0
Categories           4456
About the game       3437
Release Year          130
dtype: int64

In [36]:
steam_games_df.loc[steam_games_df["Name"].str.contains('Playtest', na=False), 'About the game'] = "This is a playtest game, which is the process by which a game designer tests a new game for bugs and design flaws before releasing it to market."

In [37]:
steam_games_df.isna().sum()

Name                    6
Release date            0
Price                   0
Genres               3425
Tags                19986
Reviews             73844
User score              0
Movies               6300
Metacritic score        0
Categories           4456
About the game        191
Release Year          130
dtype: int64

In [45]:
steam_games_df = steam_games_df.drop(steam_games_df.loc[steam_games_df['Name'].isna()].index)

In [46]:
for index, row in steam_games_df.iterrows():
    if pd.isnull(row['About the game']):
        if pd.notnull(row['Name']):  # Check if 'Name' is not null
            if 'Beta' in row['Name']:
                steam_games_df.at[index, 'About the game'] = 'This game is in beta and still under testing.'
            elif "Alpha" in row['Name']:
                steam_games_df.at[index, 'About the game'] = 'This game is in alpha and still under testing.'
            elif "beta" in row['Name']:
                steam_games_df.at[index, 'About the game'] = 'This game is in beta and still under testing.'
            elif "BETA" in row['Name']:
                steam_games_df.at[index, 'About the game'] = 'This game is in beta and still under testing.'
            elif "Test" in row['Name']:
                steam_games_df.at[index, 'About the game'] = 'This game is still under testing.'
            elif "playtest" in row['Name']:
                steam_games_df.at[index, 'About the game'] = 'This game is still under playtest.'
            elif "SDK" in row['Name']:
                steam_games_df.at[index, 'About the game'] = 'Software Development Kit of the game.'
            elif "Demo" in row['Name']:
                steam_games_df.at[index, 'About the game'] = 'This is a demo version of the game.'
            elif "Server" in row['Name']:
                steam_games_df.at[index, 'About the game'] = 'This is a server for the game.'
            elif "Editor" in row['Name']:
                steam_games_df.at[index, 'About the game'] = 'This is an editor for the game.'
            else:
                steam_games_df.at[index, 'About the game'] = 'This game does not have a description.'

In [48]:
#after exploring the Categories column i noticed some key words that are repeating i delt with every case then i left up with
# 1009 game that the Publisher is not mentioned

# for any beta game the null value sups with 'Beta game not playable'
# for any Alpha game the null value sups with 'Alpha  game not playable'
# for any Test game the null value sups with 'this game game not playable'
# for any playtest game the null value sups with 'Playtest game not playable'
# for any SDK game the null value sups with 'Software Development Kit of the game not playable'
# for any Demo game the null value sups with 'Demo game not playable'
# for any server game the null value sups with 'Server of a game not playable'
# for any editor game the null value sups with 'Beta game not playable'
for index, row in steam_games_df.iterrows():
    if(pd.isnull(row['Categories'])):
        if 'Playtest' in row['Name']:
            steam_games_df.at[index, 'Categories'] = 'Playtest game not playable'
        elif "Alpha" in row['Name']:
            steam_games_df.at[index, 'Categories'] = 'Alpha game not playable'
        elif "beta" in row['Name']:
            steam_games_df.at[index, 'Categories'] = 'Beta game not playable'
        elif "BETA" in row['Name']:
            steam_games_df.at[index, 'Categories'] = 'Beta game not playable'
        elif "Test" in row['Name']:
            steam_games_df.at[index, 'Categories'] = 'test game not playable'
        elif "playtest" in row['Name']:
            steam_games_df.at[index, 'Categories'] = 'Playtest game not playable'
        elif "SDK" in row['Name']:
            steam_games_df.at[index, 'Categories'] = 'Software Development Kit of the game not playable'
        elif "Demo" in row['Name']:
            steam_games_df.at[index, 'Categories'] = 'Demo game not playable'
        elif "Server" in row['Name']:
            steam_games_df.at[index, 'Categories'] = 'Server of a game not playable'
        elif "Editor" in row['Name']:
            steam_games_df.at[index, 'Categories'] = 'Editor of a game not playable'
        elif "Beta" in row['Name']:
            steam_games_df.at[index, 'Categories'] = 'Beta game not playable'
        else:
            continue

In [49]:
steam_games_df.isna().sum()

Name                    0
Release date            0
Price                   0
Genres               3423
Tags                19980
Reviews             73838
User score              0
Movies               6295
Metacritic score        0
Categories           1149
About the game          0
Release Year          130
dtype: int64

In [50]:
for index, row in steam_games_df.iterrows():
    if(pd.isnull(row['Categories'])):
        steam_games_df.at[index, 'Categories'] = 'no Category added'

In [51]:
steam_games_df.isna().sum()

Name                    0
Release date            0
Price                   0
Genres               3423
Tags                19980
Reviews             73838
User score              0
Movies               6295
Metacritic score        0
Categories              0
About the game          0
Release Year          130
dtype: int64

In [52]:
for index, row in steam_games_df.iterrows():
    if(pd.isnull(row['Genres'])):
        if 'Playtest' in row['Name']:
            steam_games_df.at[index, 'Genres'] = 'Playtest game not playable'
        elif "Alpha" in row['Name']:
            steam_games_df.at[index, 'Genres'] = 'Alpha game not playable'
        elif "beta" in row['Name']:
            steam_games_df.at[index, 'Genres'] = 'Beta game not playable'
        elif "BETA" in row['Name']:
            steam_games_df.at[index, 'Genres'] = 'Beta game not playable'
        elif "Test" in row['Name']:
            steam_games_df.at[index, 'Genres'] = 'test game not playable'
        elif "playtest" in row['Name']:
            steam_games_df.at[index, 'Genres'] = 'Playtest game not playable'
        elif "SDK" in row['Name']:
            steam_games_df.at[index, 'Genres'] = 'Software Development Kit of the game not playable'
        elif "Demo" in row['Name']:
            steam_games_df.at[index, 'Genres'] = 'Demo game not playable'
        elif "Server" in row['Name']:
            steam_games_df.at[index, 'Genres'] = 'Server of a game not playable'
        elif "Editor" in row['Name']:
            steam_games_df.at[index, 'Genres'] = 'Editor of a game not playable'
        elif "Beta" in row['Name']:
            steam_games_df.at[index, 'Genres'] = 'Beta game not playable'
        else:
            continue


NameError: name 'df' is not defined

In [53]:
for index, row in steam_games_df.iterrows():
    if(pd.isnull(row['Genres'])):
        steam_games_df.at[index, 'Genres'] = 'no Genres added'

In [54]:
for index, row in steam_games_df.iterrows():
    if(pd.isnull(row['Tags'])):
        steam_games_df.at[index, 'Tags'] = steam_games_df.at[index, 'Genres']

In [55]:
for index, row in steam_games_df.iterrows():
    if(pd.isnull(row['Movies'])):
        if 'Playtest' in row['Name']:
            steam_games_df.at[index, 'Movies'] = 'Playtest game no trailer available'
        elif "Alpha" in row['Name']:
            steam_games_df.at[index, 'Movies'] = 'Alpha game no trailer available'
        elif "beta" in row['Name']:
            steam_games_df.at[index, 'Movies'] = 'Beta game no trailer available'
        elif "BETA" in row['Name']:
            steam_games_df.at[index, 'Movies'] = 'Beta game no trailer available'
        elif "Test" in row['Name']:
            steam_games_df.at[index, 'Movies'] = 'test game no trailer available'
        elif "playtest" in row['Name']:
            steam_games_df.at[index, 'Movies'] = 'Playtest game no trailer available'
        elif "SDK" in row['Name']:
            steam_games_df.at[index, 'Movies'] = 'Software Development Kit of the game no trailer available'
        elif "Demo" in row['Name']:
            steam_games_df.at[index, 'Movies'] = 'Demo game no trailer available'
        elif "Server" in row['Name']:
            steam_games_df.at[index, 'Movies'] = 'Server of a game no trailer available'
        elif "Editor" in row['Name']:
            steam_games_df.at[index, 'Movies'] = 'Editor of a game no trailer available'
        elif "Beta" in row['Name']:
            steam_games_df.at[index, 'Movies'] = 'Beta game no trailer available'
        else:
            steam_games_df.at[index, 'Movies'] = 'no trailer available'

In [56]:
steam_games_df.isna().sum()

Name                    0
Release date            0
Price                   0
Genres                  0
Tags                    0
Reviews             73838
User score              0
Movies                  0
Metacritic score        0
Categories              0
About the game          0
Release Year          130
dtype: int64

In [57]:
steam_games_df['Release Year'] = steam_games_df['Release Year'].fillna(default_year)

In [62]:
steam_games_df = steam_games_df.rename(columns={'Release Year': 'year'})


In [69]:
steam_games_df['year'] = pd.to_datetime(steam_games_df['Release date'], errors='coerce').dt.year

In [70]:
steam_games_df

Unnamed: 0,Name,Release date,Price,Genres,Tags,Reviews,User score,Movies,Metacritic score,Categories,About the game,year
0,Galactic Bowling,"Oct 21, 2008",19.99,"Casual,Indie,Sports","Indie,Casual,Sports,Bowling",,0,http://cdn.akamai.steamstatic.com/steam/apps/2...,0,"Single-player,Multi-player,Steam Achievements,...",Galactic Bowling is an exaggerated and stylize...,2008.0
1,Train Bandit,"Oct 12, 2017",0.99,"Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",,0,http://cdn.akamai.steamstatic.com/steam/apps/2...,0,"Single-player,Steam Achievements,Full controll...",THE LAW!! Looks to be a showdown atop a train....,2017.0
2,Jolt Project,"Nov 17, 2021",4.99,"Action,Adventure,Indie,Strategy","Action,Adventure,Indie,Strategy",,0,http://cdn.akamai.steamstatic.com/steam/apps/2...,0,Single-player,Jolt Project: The army now has a new robotics ...,2021.0
3,Henosis™,"Jul 23, 2020",5.99,"Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz...",,0,http://cdn.akamai.steamstatic.com/steam/apps/2...,0,"Single-player,Full controller support",HENOSIS™ is a mysterious 2D Platform Puzzler w...,2020.0
4,Two Weeks in Painland,"Feb 3, 2020",0.00,"Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,...",,0,http://cdn.akamai.steamstatic.com/steam/apps/2...,0,"Single-player,Steam Achievements",ABOUT THE GAME Play as a hacker who has arrang...,2020.0
...,...,...,...,...,...,...,...,...,...,...,...,...
83555,Sex Simulator - Office Promotion,"Sep 16, 2023",3.99,"Adventure,Casual,Indie","Adventure,Casual,Indie",,0,http://cdn.akamai.steamstatic.com/steam/apps/2...,0,"Single-player,Steam Achievements,Steam Leaderb...",ABOUT THE GAME A sex simulation game with visu...,2023.0
83556,UFindO,"Aug 21, 2023",0.49,"Casual,Indie","Casual,Puzzle,Hidden Object,Creature Collector...",,0,http://cdn.akamai.steamstatic.com/steam/apps/2...,0,"Single-player,Steam Achievements",Are you ready to find the difference? Then qui...,2023.0
83557,UNDRESS!,"Oct 23, 2023",1.99,Casual,Casual,,0,http://cdn.akamai.steamstatic.com/steam/apps/2...,0,"Single-player,Steam Achievements",UNDRESS! - is an exciting game in which the pl...,2023.0
83558,Oculant,"Nov 22, 2023",4.99,"Action,Adventure,Casual","Action,Adventure,Casual",,0,http://cdn.akamai.steamstatic.com/steam/apps/2...,0,Single-player,Story: The world is shrouded with the eclipse ...,2023.0


In [73]:
movie_lens_df.to_csv('../data/processed/processed_movies.csv', index=False)
steam_games_df.to_csv('../data/processed/processed_games.csv', index=False)
