In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('Raw Data/GAME DATA.csv')

# Clean Data

## Count nulls per column

In [3]:
data.shape

(65111, 14)

In [4]:
data.isna().sum()

App ID                     0
Title                      0
Reviews Total              0
Reviews Score Fancy        0
Release Date               0
Reviews D7             65111
Reviews D30            65111
Reviews D90            65111
Launch Price               0
Tags                       0
name_slug              65111
Revenue Estimated          0
Modified Tags              0
Steam Page                 0
dtype: int64

## Drop empty columns

In [5]:
data = data.drop(["Reviews D7","Reviews D30","Reviews D90","name_slug", "Steam Page"], axis=1)

In [6]:
data.columns

Index(['App ID', 'Title', 'Reviews Total', 'Reviews Score Fancy',
       'Release Date', 'Launch Price', 'Tags', 'Revenue Estimated',
       'Modified Tags'],
      dtype='object')

# Clean column names

In [7]:
data.rename(columns= {'App ID': "app_id", 
                    'Title': "title", 
                    'Reviews Total': "reviews_total", 
                    'Reviews Score Fancy': "reviews_score_fancy",
                    'Release Date': "release_date", 
                    'Launch Price': "launch_price_fancy", 
                    'Tags' : 'tags',
                    'Revenue Estimated': "revenue_estimated_dataset",
                    'Modified Tags': "modified_tags"
}, inplace=True)


## Drop columns with not enough review data

### Review threshold justification:

The dataset will be joined with games that have reached top 200 twtich viewership since 2016. 

Games will less than 500 reviews are not expected to have reached top 200 twitch viewrship

In [8]:
review_threshold = 500
drop_lt_5000_reviews = data[data['reviews_total'] >= review_threshold]
print(drop_lt_5000_reviews.shape)
drop_lt_5000_reviews.tail(3)

(6557, 9)


Unnamed: 0,app_id,title,reviews_total,reviews_score_fancy,release_date,launch_price_fancy,tags,revenue_estimated_dataset,modified_tags
6554,403950,Conquest of Elysium 4,500,89%,2015-11-16,"$24,99","Strategy, Indie, Turn Based, Fantasy, Turn Bas...","$12 495,00","Strategy_, Indie_, Turn Based_, Fantasy_, Turn..."
6555,1604380,Hamidashi Creative,500,97%,2022-09-30,"$29,99","Adventure, Casual, Visual Novel, Sexual Conten...","$14 995,00","Adventure_, Casual_, Visual Novel_, Sexual Con..."
6556,567670,Serious Sam 3 VR: BFE,500,86%,2017-11-09,"$39,99","Action, Indie, VR, Gore, FPS, First Person","$19 995,00","Action_, Indie_, VR_, Gore_, FPS_, First Person_"


## Clean column datatypes

### clean purchase price to numeric in cents

In [9]:
drop_lt_5000_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6557 entries, 0 to 6556
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   app_id                     6557 non-null   int64 
 1   title                      6557 non-null   object
 2   reviews_total              6557 non-null   int64 
 3   reviews_score_fancy        6557 non-null   object
 4   release_date               6557 non-null   object
 5   launch_price_fancy         6557 non-null   object
 6   tags                       6557 non-null   object
 7   revenue_estimated_dataset  6557 non-null   object
 8   modified_tags              6557 non-null   object
dtypes: int64(2), object(7)
memory usage: 512.3+ KB


In [10]:
clean_numerics = drop_lt_5000_reviews.copy()

# strip all non-digit characters to get str representation of price in cents, then convert to int
clean_numerics['launch_price_cents'] = clean_numerics['launch_price_fancy'].str.replace(r'[\D]', '', regex=True).astype(int)

In [11]:
clean_numerics['dataset_est_rev_cents'] = clean_numerics['revenue_estimated_dataset'].str.replace(r'[\D]', '', regex=True).astype(int)


In [12]:
clean_numerics['review_avg_percent'] = clean_numerics['reviews_score_fancy'].str.replace(r'%', '').str.replace(r',', '.').astype(float)
clean_numerics[(clean_numerics['review_avg_percent'] > 100) | (clean_numerics['review_avg_percent'] < 0)]

Unnamed: 0,app_id,title,reviews_total,reviews_score_fancy,release_date,launch_price_fancy,tags,revenue_estimated_dataset,modified_tags,launch_price_cents,dataset_est_rev_cents,review_avg_percent


In [13]:
clean_numerics.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6557 entries, 0 to 6556
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   app_id                     6557 non-null   int64  
 1   title                      6557 non-null   object 
 2   reviews_total              6557 non-null   int64  
 3   reviews_score_fancy        6557 non-null   object 
 4   release_date               6557 non-null   object 
 5   launch_price_fancy         6557 non-null   object 
 6   tags                       6557 non-null   object 
 7   revenue_estimated_dataset  6557 non-null   object 
 8   modified_tags              6557 non-null   object 
 9   launch_price_cents         6557 non-null   int64  
 10  dataset_est_rev_cents      6557 non-null   int64  
 11  review_avg_percent         6557 non-null   float64
dtypes: float64(1), int64(4), object(7)
memory usage: 665.9+ KB


### Clean launch date
date format is string "YYYY-MM-DD"

In [14]:
clean_dates = clean_numerics.copy()
clean_dates['release_date'] = pd.to_datetime(clean_dates['release_date'])
clean_dates.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6557 entries, 0 to 6556
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   app_id                     6557 non-null   int64         
 1   title                      6557 non-null   object        
 2   reviews_total              6557 non-null   int64         
 3   reviews_score_fancy        6557 non-null   object        
 4   release_date               6557 non-null   datetime64[ns]
 5   launch_price_fancy         6557 non-null   object        
 6   tags                       6557 non-null   object        
 7   revenue_estimated_dataset  6557 non-null   object        
 8   modified_tags              6557 non-null   object        
 9   launch_price_cents         6557 non-null   int64         
 10  dataset_est_rev_cents      6557 non-null   int64         
 11  review_avg_percent         6557 non-null   float64       
dtypes: datetime

In [15]:
clean_dates.tail(3)

Unnamed: 0,app_id,title,reviews_total,reviews_score_fancy,release_date,launch_price_fancy,tags,revenue_estimated_dataset,modified_tags,launch_price_cents,dataset_est_rev_cents,review_avg_percent
6554,403950,Conquest of Elysium 4,500,89%,2015-11-16,"$24,99","Strategy, Indie, Turn Based, Fantasy, Turn Bas...","$12 495,00","Strategy_, Indie_, Turn Based_, Fantasy_, Turn...",2499,1249500,89.0
6555,1604380,Hamidashi Creative,500,97%,2022-09-30,"$29,99","Adventure, Casual, Visual Novel, Sexual Conten...","$14 995,00","Adventure_, Casual_, Visual Novel_, Sexual Con...",2999,1499500,97.0
6556,567670,Serious Sam 3 VR: BFE,500,86%,2017-11-09,"$39,99","Action, Indie, VR, Gore, FPS, First Person","$19 995,00","Action_, Indie_, VR_, Gore_, FPS_, First Person_",3999,1999500,86.0


In [16]:
clean_steam_df = clean_dates.copy()

# Make 1 to many tag csv

app_id foriegn key from clean_steam_data to many tags 

In [17]:
tags_df = clean_steam_df[["app_id", 'tags']]
tags_df

Unnamed: 0,app_id,tags
0,730,"FPS, Shooter, Multiplayer, Competitive, Action..."
1,578080,"Survival, Shooter, Battle Royale, Multiplayer,..."
2,570,"Free to Play, MOBA, Multiplayer, Strategy, eSp..."
3,271590,"Open World, Action, Multiplayer, Crime, Automo..."
4,359550,"FPS, PvP, eSports, Shooter, Multiplayer, Tacti..."
...,...,...
6552,1175360,"Twin Stick Shooter, Difficult, Action Roguelik..."
6553,630310,"Action, Adventure, Metroidvania, Pixel Graphic..."
6554,403950,"Strategy, Indie, Turn Based, Fantasy, Turn Bas..."
6555,1604380,"Adventure, Casual, Visual Novel, Sexual Conten..."


## Expand tags list to one to many (app_id : individual tags)

In [18]:
tags_df['tag'] = tags_df['tags'].str.split(',')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tags_df['tag'] = tags_df['tags'].str.split(',')


In [19]:
one_to_messy = tags_df.explode('tag').reset_index(drop=True)
one_to_messy

Unnamed: 0,app_id,tags,tag
0,730,"FPS, Shooter, Multiplayer, Competitive, Action...",FPS
1,730,"FPS, Shooter, Multiplayer, Competitive, Action...",Shooter
2,730,"FPS, Shooter, Multiplayer, Competitive, Action...",Multiplayer
3,730,"FPS, Shooter, Multiplayer, Competitive, Action...",Competitive
4,730,"FPS, Shooter, Multiplayer, Competitive, Action...",Action
...,...,...,...
115624,567670,"Action, Indie, VR, Gore, FPS, First Person",Indie
115625,567670,"Action, Indie, VR, Gore, FPS, First Person",VR
115626,567670,"Action, Indie, VR, Gore, FPS, First Person",Gore
115627,567670,"Action, Indie, VR, Gore, FPS, First Person",FPS


In [20]:
one_to_many_df = one_to_messy[['app_id', 'tag']]
one_to_many_df

Unnamed: 0,app_id,tag
0,730,FPS
1,730,Shooter
2,730,Multiplayer
3,730,Competitive
4,730,Action
...,...,...
115624,567670,Indie
115625,567670,VR
115626,567670,Gore
115627,567670,FPS


## How many unique tags exist? 744

In [21]:
num_unq_tags = one_to_many_df['tag'].unique().shape[0]
num_unq_tags

744

## Number of games per tag

In [22]:
one_to_many_df['tag'].value_counts()

tag
 Singleplayer    5171
 Indie           3173
 Adventure       3107
 Action          2950
 Multiplayer     2271
                 ... 
Top Down            1
Short               1
Supernatural        1
Rome                1
Minigames           1
Name: count, Length: 744, dtype: int64

# 247 tags are present in >= 10% of the sample

In [23]:
(one_to_many_df.value_counts('tag') >= 74).sum()

np.int64(247)

## Filter tags df to only tags present in 10% of sample

In [24]:
common_tags = one_to_many_df[one_to_many_df.groupby('tag')['tag'].transform('size') >= (num_unq_tags // 10)]
common_tags['tag'].value_counts()

tag
 Singleplayer    5171
 Indie           3173
 Adventure       3107
 Action          2950
 Multiplayer     2271
                 ... 
Open World         78
 Space Sim         78
 Farming Sim       77
 Trading           74
 Modern            74
Name: count, Length: 247, dtype: int64

In [25]:
common_tags.info()

<class 'pandas.core.frame.DataFrame'>
Index: 106866 entries, 0 to 115628
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   app_id  106866 non-null  int64 
 1   tag     106866 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


### Clean trailing and leading whitespace in tags

In [26]:
common_tags['tag'] = common_tags['tag'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  common_tags['tag'] = common_tags['tag'].str.strip()


In [27]:
common_tags['tag'].unique()

array(['FPS', 'Shooter', 'Multiplayer', 'Competitive', 'Action',
       'Team Based', 'eSports', 'Tactical', 'First Person', 'PvP',
       'Online Co Op', 'Co op', 'Strategy', 'Military', 'War',
       'Difficult', 'Trading', 'Realistic', 'Fast Paced', 'Moddable',
       'Survival', 'Third Person Shooter', 'Early Access', 'Third Person',
       'Simulation', 'Stealth', 'RTS', 'Tower Defense', 'RPG', 'Fantasy',
       'Character Customization', 'Replay Value', 'Action RPG',
       'Open World', 'Crime', 'Automobile Sim', 'Mature', 'Adventure',
       'Singleplayer', 'Racing', 'Atmospheric', 'Sandbox', 'Funny',
       'Great Soundtrack', 'Comedy', 'Destruction', '3D', '2D',
       'Pixel Graphics', 'Crafting', 'Building', 'Exploration', 'Indie',
       'Platformer', 'Physics', 'Massively Multiplayer',
       'Open World Survival Craft', 'Nudity', 'Post apocalyptic',
       'Story Rich', 'Choices Matter', 'Medieval', 'Multiple Endings',
       'Magic', 'Dark Fantasy', 'Dark', 'Space', 'Ps

# Save common_tags DF as app_id_common_tags and clean_steam_data df

## Reorder columns and drop unneeded ones

In [28]:
clean_dates.columns

Index(['app_id', 'title', 'reviews_total', 'reviews_score_fancy',
       'release_date', 'launch_price_fancy', 'tags',
       'revenue_estimated_dataset', 'modified_tags', 'launch_price_cents',
       'dataset_est_rev_cents', 'review_avg_percent'],
      dtype='object')

In [29]:
clean_steam_df = clean_dates[['app_id', 'title', 'release_date', 'reviews_total', 'review_avg_percent', 'launch_price_cents',
       'dataset_est_rev_cents']]
clean_steam_df.tail(3)

Unnamed: 0,app_id,title,release_date,reviews_total,review_avg_percent,launch_price_cents,dataset_est_rev_cents
6554,403950,Conquest of Elysium 4,2015-11-16,500,89.0,2499,1249500
6555,1604380,Hamidashi Creative,2022-09-30,500,97.0,2999,1499500
6556,567670,Serious Sam 3 VR: BFE,2017-11-09,500,86.0,3999,1999500


In [30]:
common_tags.tail(3)

Unnamed: 0,app_id,tag
115626,567670,Gore
115627,567670,FPS
115628,567670,First Person


In [31]:
clean_steam_df.to_csv("Resources/clean_steam_dtypes.csv", index=False)

In [32]:
common_tags.to_csv("Resources/app_id_common_tags.csv", index=False)