In [1]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import charset_normalizer as charn

In [2]:
from charset_normalizer import from_path
csv_path = 'Resources/Twitch_game_data_fixed_names.csv'

# Detect the file's text encoding before loading it with pandas.
data = from_path(csv_path).best()
if data is None:
    # best() returns None when no plausible encoding was found; fail with a
    # clear message instead of an AttributeError on the next line.
    raise ValueError(f"Could not detect a text encoding for {csv_path!r}")

# Use the detected codec name itself. Indexing into the alias list picks an
# arbitrary entry and raises IndexError when fewer than two aliases exist.
encoding = data.encoding
twitch_df = pd.read_csv(csv_path, encoding=encoding)
print(twitch_df.shape)
twitch_df.head(3)

(18600, 12)


Unnamed: 0,Rank,Game,Month,Year,Hours Watched,Hours Streamed,Peak Viewers,Peak Channels,Streamers,Average Viewers,Average Channels,Average Viewer Ratio
0,1,League of Legends,1,2016,94377226,1362044,530270,2903,129172,127021,1833,69.29
1,2,Counter-Strike: Global Offensive,1,2016,47832863,830105,372654,2197,120849,64378,1117,57.62
2,3,Dota 2,1,2016,45185893,433397,315083,1100,44074,60815,583,104.26


In [3]:
# Count the rows loaded from the csv file (first element of the frame's shape)
num_rows = twitch_df.shape[0]
print("Number of rows:", num_rows)

Number of rows: 18600


In [4]:
# Show the data type pandas assigned to each column of twitch_df
twitch_df.dtypes

Rank                      int64
Game                     object
Month                     int64
Year                      int64
Hours Watched             int64
Hours Streamed            int64
Peak Viewers              int64
Peak Channels             int64
Streamers                 int64
Average Viewers           int64
Average Channels          int64
Average Viewer Ratio    float64
dtype: object

In [5]:
# Rename the "Game" column to "Title" so that it matches GAME DATE.csv.
# rename() returns a new frame, which is bound back to the same name.
twitch_df = twitch_df.rename(columns={'Game': "Title"})
twitch_df.head()

Unnamed: 0,Rank,Title,Month,Year,Hours Watched,Hours Streamed,Peak Viewers,Peak Channels,Streamers,Average Viewers,Average Channels,Average Viewer Ratio
0,1,League of Legends,1,2016,94377226,1362044,530270,2903,129172,127021,1833,69.29
1,2,Counter-Strike: Global Offensive,1,2016,47832863,830105,372654,2197,120849,64378,1117,57.62
2,3,Dota 2,1,2016,45185893,433397,315083,1100,44074,60815,583,104.26
3,4,Hearthstone,1,2016,39936159,235903,131357,517,36170,53749,317,169.29
4,5,Call of Duty: Black Ops III,1,2016,16153057,1151578,71639,3620,214054,21740,1549,14.03


In [6]:
# Take the 'Title' column and count its distinct game titles (missing values excluded).
unique_title = len(twitch_df['Title'].dropna().unique())
print(f'The number of unqiue game titles that are listed in this csv file: {unique_title}')



The number of unqiue game titles that are listed in this csv file: 2160


In [7]:
# Order the rows by game title first, then by year within each title
sort_keys = ['Title', 'Year']
twitch_df_sorted = twitch_df.sort_values(sort_keys)
twitch_df_sorted.head(10)

Unnamed: 0,Rank,Title,Month,Year,Hours Watched,Hours Streamed,Peak Viewers,Peak Channels,Streamers,Average Viewers,Average Channels,Average Viewer Ratio
4598,199,.hack//G.U. Last Recode,11,2017,145350,35258,1222,258,3174,202,49,4.12
15558,159,20 Minutes Till Dawn,6,2022,911356,12253,29743,80,3616,1267,17,74.38
6508,109,60 Parsecs!,9,2018,529688,1867,31960,27,606,736,2,283.71
1325,126,60 Seconds!,7,2016,268754,597,32505,10,275,361,0,450.17
1453,54,60 Seconds!,8,2016,772786,2065,56904,30,628,1040,2,374.23
2198,199,60 Seconds!,11,2016,109473,503,5297,5,193,152,0,217.64
2918,119,60 Seconds!,3,2017,263482,679,31311,10,240,354,0,388.04
4581,182,60 Seconds!,11,2017,179250,584,10458,8,228,249,0,306.93
4785,186,60 Seconds!,12,2017,180788,669,23476,9,260,243,0,270.24
6393,194,60 Seconds!,8,2018,222209,871,18245,8,414,299,1,255.12


In [8]:
# NOTE(review): the column list is empty, so no column is removed here.
# Confirm which columns (if any) were meant to be dropped, or delete this cell.
twitch_df = twitch_df.drop(columns=[])

In [9]:
# Group by 'Title' and average the relevant metrics for each game.
# Named aggregation produces columns already in the snake_case header format
# of 'clean_steam_dtypes.csv', so the metrics need no separate rename step.
twitch_df_grouped = (
    twitch_df
    .groupby('Title')
    .agg(
        avg_hours_watched=('Hours Watched', 'mean'),      # Average hours watched for each title
        avg_hours_streamed=('Hours Streamed', 'mean'),    # Average hours streamed for each title
        avg_streamers=('Streamers', 'mean'),              # Average number of streamers for each title
        avg_viewers=('Average Viewers', 'mean'),          # Average number of average viewers for each title
        avg_channels=('Average Channels', 'mean'),        # Average number of average channels for each title
    )
    # 'Title' is still the index at this point, so every column is a numeric
    # average: round them all down in one step rather than one line per column.
    .pipe(np.floor)
    # Explicit 64-bit integers; plain `int` can resolve to a 32-bit type on
    # some platform / NumPy combinations.
    .astype('int64')
    .reset_index()
    .rename(columns={'Title': 'title'})
)

twitch_df_grouped.head(10)

Unnamed: 0,title,avg_hours_watched,avg_hours_streamed,avg_streamers,avg_viewers,avg_channels
0,.hack//G.U. Last Recode,145350,35258,3174,202,49
1,20 Minutes Till Dawn,911356,12253,3616,1267,17
2,60 Parsecs!,529688,1867,606,736,2
3,60 Seconds!,314900,1023,420,424,0
4,7 Days to Die,1023033,89486,8880,1397,122
5,A Dance of Fire and Ice,452164,7002,2833,645,9
6,A Hat in Time,331964,13813,2510,450,18
7,A Plague Tale: Innocence,1700679,36870,5592,2305,49
8,A Plague Tale: Requiem,2512917,96420,15296,3399,130
9,A Total War Saga: Troy,1019391,23357,3543,1371,31


In [10]:
# Load 'clean_steam_dtypes.csv' into a DataFrame and preview its first rows
steam_csv_path = 'Resources/clean_steam_dtypes.csv'
game_data_csv = pd.read_csv(steam_csv_path)
game_data_csv.head()

Unnamed: 0,app_id,title,release_date,reviews_total,review_avg_percent,launch_price_cents,dataset_est_rev_cents
0,730,Counter-Strike: Global Offensive,2012-08-21,7382695,88.0,1499,11066659805
1,578080,PUBG: BATTLEGROUNDS,2017-12-21,2201296,57.0,2999,6601686704
2,570,Dota 2,2013-07-09,2017009,82.0,2999,6049009991
3,271590,Grand Theft Auto V,2015-04-13,1322782,89.85,2999,3967023218
4,359550,Tom Clancy's Rainbow Six® Siege,2015-12-01,978762,86.0,5999,5871593238


In [11]:
# Normalise the headers of both frames first, in case any carry stray whitespace
game_data_csv.columns = game_data_csv.columns.str.strip()
twitch_df_grouped.columns = twitch_df_grouped.columns.str.strip()

# Inner join on the shared 'title' column: only titles found in both frames are kept
merged_df = twitch_df_grouped.merge(game_data_csv, how='inner', on='title')
merged_df.head()

Unnamed: 0,title,avg_hours_watched,avg_hours_streamed,avg_streamers,avg_viewers,avg_channels,app_id,release_date,reviews_total,review_avg_percent,launch_price_cents,dataset_est_rev_cents
0,.hack//G.U. Last Recode,145350,35258,3174,202,49,525480,2017-11-03,2689,87.0,4999,13442311
1,20 Minutes Till Dawn,911356,12253,3616,1267,17,1966900,2023-06-08,20942,92.0,499,10450058
2,60 Parsecs!,529688,1867,606,736,2,646270,2018-09-18,2315,89.0,999,2312685
3,60 Seconds!,314900,1023,420,424,0,368360,2015-05-25,9979,84.0,899,8971121
4,7 Days to Die,1023033,89486,8880,1397,122,251570,2013-12-13,185945,88.0,2499,464676555


In [12]:
# Write the merged DataFrame to a new CSV in the 'Resources' folder, without the index column
merged_output_path = 'Resources/merge_output.csv'
merged_df.to_csv(merged_output_path, index=False)
print("Merged file saved successfully in 'Resources' folder.")

Merged file saved successfully in 'Resources' folder.


In [13]:
# Preview the first rows of merged_df (head() defaults to five rows)
merged_df.head()

Unnamed: 0,title,avg_hours_watched,avg_hours_streamed,avg_streamers,avg_viewers,avg_channels,app_id,release_date,reviews_total,review_avg_percent,launch_price_cents,dataset_est_rev_cents
0,.hack//G.U. Last Recode,145350,35258,3174,202,49,525480,2017-11-03,2689,87.0,4999,13442311
1,20 Minutes Till Dawn,911356,12253,3616,1267,17,1966900,2023-06-08,20942,92.0,499,10450058
2,60 Parsecs!,529688,1867,606,736,2,646270,2018-09-18,2315,89.0,999,2312685
3,60 Seconds!,314900,1023,420,424,0,368360,2015-05-25,9979,84.0,899,8971121
4,7 Days to Die,1023033,89486,8880,1397,122,251570,2013-12-13,185945,88.0,2499,464676555
