In [2]:
import pandas as pd

In [3]:
# Load the raw game export into a DataFrame; the path is relative to the
# directory the notebook is run from.
data = pd.read_csv('Raw Data/GAME DATA.csv')

# Clean Data

## Count nulls per column

In [4]:
# (rows, columns) of the raw frame, before any cleaning.
data.shape

(65111, 14)

In [5]:
# Number of missing values in each column; a count equal to the row count
# means the column is completely empty.
data.isna().sum()

App ID                     0
Title                      0
Reviews Total              0
Reviews Score Fancy        0
Release Date               0
Reviews D7             65111
Reviews D30            65111
Reviews D90            65111
Launch Price               0
Tags                       0
name_slug              65111
Revenue Estimated          0
Modified Tags              0
Steam Page                 0
dtype: int64

## Drop empty and unneeded columns

In [6]:
# Remove the four columns that are entirely null (see the null counts above),
# together with "Steam Page", which is fully populated but is also discarded here.
data = data.drop(
    columns=["Reviews D7", "Reviews D30", "Reviews D90", "name_slug", "Steam Page"]
)

In [7]:
# Confirm which columns remain after the drop.
data.columns

Index(['App ID', 'Title', 'Reviews Total', 'Reviews Score Fancy',
       'Release Date', 'Launch Price', 'Tags', 'Revenue Estimated',
       'Modified Tags'],
      dtype='object')

## Clean column names

In [8]:
# Map the raw export headers to snake_case names.
# rename() returns a new frame that is bound back to `data` instead of
# mutating the existing frame with inplace=True; the resulting `data` has the
# same columns either way.
data = data.rename(columns={
    'App ID': "app_id",
    'Title': "title",
    'Reviews Total': "reviews_total",
    'Reviews Score Fancy': "reviews_score_fancy",
    'Release Date': "release_date",
    'Launch Price': "launch_price_fancy",
    'Tags': 'tags',
    'Revenue Estimated': "revenue_estimated_dataset",
    'Modified Tags': "modified_tags",
})


## Drop rows (games) with too few reviews

### Review threshold justification:

The dataset will be joined with games that have reached the top 200 in Twitch viewership since 2016.

Games with fewer than 500 reviews are not expected to have reached the top 200 in Twitch viewership.

In [9]:
# Minimum number of reviews a game needs in order to be kept.
review_threshold = 500

# Keep only the rows at or above the threshold.
# NOTE: despite the "5000" in the variable name, the cutoff actually applied
# is `review_threshold` (500); the name is kept because later cells use it.
drop_lt_5000_reviews = data.loc[data['reviews_total'].ge(review_threshold)]

# Show the size of the filtered frame, then its last few rows.
print(drop_lt_5000_reviews.shape)
drop_lt_5000_reviews.tail(3)

(6557, 9)


Unnamed: 0,app_id,title,reviews_total,reviews_score_fancy,release_date,launch_price_fancy,tags,revenue_estimated_dataset,modified_tags
6554,403950,Conquest of Elysium 4,500,89%,2015-11-16,"$24,99","Strategy, Indie, Turn Based, Fantasy, Turn Bas...","$12 495,00","Strategy_, Indie_, Turn Based_, Fantasy_, Turn..."
6555,1604380,Hamidashi Creative,500,97%,2022-09-30,"$29,99","Adventure, Casual, Visual Novel, Sexual Conten...","$14 995,00","Adventure_, Casual_, Visual Novel_, Sexual Con..."
6556,567670,Serious Sam 3 VR: BFE,500,86%,2017-11-09,"$39,99","Action, Indie, VR, Gore, FPS, First Person","$19 995,00","Action_, Indie_, VR_, Gore_, FPS_, First Person_"


## Clean column datatypes

### Clean launch price and estimated revenue to numeric cents, and review score to a numeric percent

In [10]:
# Inspect dtypes and non-null counts of the filtered frame before converting
# the string columns.
drop_lt_5000_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6557 entries, 0 to 6556
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   app_id                     6557 non-null   int64 
 1   title                      6557 non-null   object
 2   reviews_total              6557 non-null   int64 
 3   reviews_score_fancy        6557 non-null   object
 4   release_date               6557 non-null   object
 5   launch_price_fancy         6557 non-null   object
 6   tags                       6557 non-null   object
 7   revenue_estimated_dataset  6557 non-null   object
 8   modified_tags              6557 non-null   object
dtypes: int64(2), object(7)
memory usage: 512.3+ KB


In [11]:
# Work on an independent copy so the filtered frame is left untouched.
clean_numerics = drop_lt_5000_reviews.copy()

# Removing every non-digit character turns a price such as "$24,99" into
# "2499", which is then read as an integer number of cents.
# NOTE(review): this relies on every price carrying exactly two decimal
# digits — confirm against the raw data.
clean_numerics['launch_price_cents'] = (
    clean_numerics['launch_price_fancy']
    .str.replace(r'[\D]', '', regex=True)
    .astype(int)
)

In [12]:
# Digit-only conversion of the estimated revenue: "$12 495,00" -> 1249500,
# i.e. the amount expressed in cents.
# NOTE(review): assumes every value ends with exactly two decimal digits — confirm.
clean_numerics['dataset_est_rev_cents'] = (
    clean_numerics['revenue_estimated_dataset']
    .str.replace(r'[\D]', '', regex=True)
    .astype(int)
)


In [13]:
# Turn scores such as "89%" into the float 89.0: drop the percent sign and use
# "." as the decimal separator. Both patterns are literal text, so regex=False
# is passed explicitly instead of relying on the pandas default, which is not
# the same across pandas versions.
clean_numerics['review_avg_percent'] = (
    clean_numerics['reviews_score_fancy']
    .str.replace('%', '', regex=False)
    .str.replace(',', '.', regex=False)
    .astype(float)
)

# Sanity check: list any rows whose score falls outside the 0-100 range
# (an empty result means every score is a valid percentage).
clean_numerics[(clean_numerics['review_avg_percent'] > 100) | (clean_numerics['review_avg_percent'] < 0)]

Unnamed: 0,app_id,title,reviews_total,reviews_score_fancy,release_date,launch_price_fancy,tags,revenue_estimated_dataset,modified_tags,launch_price_cents,dataset_est_rev_cents,review_avg_percent


In [14]:
# Re-check dtypes now that the numeric columns have been added.
clean_numerics.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6557 entries, 0 to 6556
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   app_id                     6557 non-null   int64  
 1   title                      6557 non-null   object 
 2   reviews_total              6557 non-null   int64  
 3   reviews_score_fancy        6557 non-null   object 
 4   release_date               6557 non-null   object 
 5   launch_price_fancy         6557 non-null   object 
 6   tags                       6557 non-null   object 
 7   revenue_estimated_dataset  6557 non-null   object 
 8   modified_tags              6557 non-null   object 
 9   launch_price_cents         6557 non-null   int64  
 10  dataset_est_rev_cents      6557 non-null   int64  
 11  review_avg_percent         6557 non-null   float64
dtypes: float64(1), int64(4), object(7)
memory usage: 665.9+ KB


### Clean launch date
The `release_date` column is stored as a string in "YYYY-MM-DD" format; convert it to a datetime.

In [15]:
# Work on a copy so the numeric-cleaning result stays available unchanged.
clean_dates = clean_numerics.copy()

# Parse the release date with the explicit "YYYY-MM-DD" format rather than
# letting pandas infer it, so a value in any other layout raises an error
# instead of being guessed at.
clean_dates['release_date'] = pd.to_datetime(clean_dates['release_date'], format='%Y-%m-%d')

# Confirm that release_date is now a datetime column.
clean_dates.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6557 entries, 0 to 6556
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   app_id                     6557 non-null   int64         
 1   title                      6557 non-null   object        
 2   reviews_total              6557 non-null   int64         
 3   reviews_score_fancy        6557 non-null   object        
 4   release_date               6557 non-null   datetime64[ns]
 5   launch_price_fancy         6557 non-null   object        
 6   tags                       6557 non-null   object        
 7   revenue_estimated_dataset  6557 non-null   object        
 8   modified_tags              6557 non-null   object        
 9   launch_price_cents         6557 non-null   int64         
 10  dataset_est_rev_cents      6557 non-null   int64         
 11  review_avg_percent         6557 non-null   float64       
dtypes: datetime

In [16]:
# Spot-check the last rows to compare the raw strings with the converted values.
clean_dates.tail(3)

Unnamed: 0,app_id,title,reviews_total,reviews_score_fancy,release_date,launch_price_fancy,tags,revenue_estimated_dataset,modified_tags,launch_price_cents,dataset_est_rev_cents,review_avg_percent
6554,403950,Conquest of Elysium 4,500,89%,2015-11-16,"$24,99","Strategy, Indie, Turn Based, Fantasy, Turn Bas...","$12 495,00","Strategy_, Indie_, Turn Based_, Fantasy_, Turn...",2499,1249500,89.0
6555,1604380,Hamidashi Creative,500,97%,2022-09-30,"$29,99","Adventure, Casual, Visual Novel, Sexual Conten...","$14 995,00","Adventure_, Casual_, Visual Novel_, Sexual Con...",2999,1499500,97.0
6556,567670,Serious Sam 3 VR: BFE,500,86%,2017-11-09,"$39,99","Action, Indie, VR, Gore, FPS, First Person","$19 995,00","Action_, Indie_, VR_, Gore_, FPS_, First Person_",3999,1999500,86.0


## Reorder columns and drop unneeded ones

In [17]:
# List the available columns before choosing the final subset and order.
clean_dates.columns

Index(['app_id', 'title', 'reviews_total', 'reviews_score_fancy',
       'release_date', 'launch_price_fancy', 'tags',
       'revenue_estimated_dataset', 'modified_tags', 'launch_price_cents',
       'dataset_est_rev_cents', 'review_avg_percent'],
      dtype='object')

In [18]:
# Select the identifier, date, converted numeric and tag columns in their
# final order; the raw string columns that were converted above are left out.
clean_steam_df = clean_dates.loc[:, [
    'app_id',
    'title',
    'release_date',
    'reviews_total',
    'review_avg_percent',
    'launch_price_cents',
    'dataset_est_rev_cents',
    'tags',
    'modified_tags',
]]

In [19]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
# NOTE(review): presumably the "Resources" directory must already exist — confirm.
clean_steam_df.to_csv("Resources/clean_steam_dtypes.csv", index=False)