In [85]:
#Import libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestRegressor
# Load and inspect the data
games = pd.read_csv('GeneralEsportData.csv')
earning = pd.read_csv('HistoricalEsportData.csv')

print(games.head())
print(earning.head())

                    Game  ReleaseDate     Genre  TotalEarnings  \
0         Age of Empires         1997  Strategy      510716.46   
1      Age of Empires II         1999  Strategy     3253232.59   
2     Age of Empires III         2005  Strategy      115843.65   
3      Age of Empires IV         2021  Strategy      994675.03   
4  Age of Empires Online         2011  Strategy        9806.56   

   OfflineEarnings  PercentOffline  TotalPlayers  TotalTournaments  
0        375857.63        0.735942           473               265  
1       1139384.57        0.350232          1994              1613  
2         41800.00        0.360831           166               163  
3        409117.93        0.411308           542               327  
4           775.00        0.079029            44                50  
         Date                          Game  Earnings  Players  Tournaments
0  1998-01-01  Command & Conquer: Red Alert   15000.0        8            1
1  1998-01-01                    Quak

In [86]:
print(games.describe())

       ReleaseDate  TotalEarnings  OfflineEarnings  PercentOffline  \
count   650.000000   6.500000e+02     6.500000e+02      583.000000   
mean   2012.576923   2.527759e+06     1.774991e+06        0.659145   
std       7.436703   1.758171e+07     1.377120e+07        0.412233   
min    1981.000000   0.000000e+00     0.000000e+00        0.000000   
25%    2007.000000   1.000000e+03     0.000000e+00        0.219244   
50%    2015.000000   3.636129e+04     1.500000e+04        0.906896   
75%    2019.000000   2.563600e+05     1.715892e+05        1.000000   
max    2023.000000   3.371080e+08     2.982906e+08        1.000000   

       TotalPlayers  TotalTournaments  
count    650.000000        650.000000  
mean     226.541538         94.078462  
std      964.342887        479.807202  
min        0.000000          0.000000  
25%        4.000000          1.000000  
50%       22.000000          5.000000  
75%       99.500000         27.750000  
max    16272.000000       7245.000000  


In [87]:
games.isnull().sum()

Game                 0
ReleaseDate          0
Genre                0
TotalEarnings        0
OfflineEarnings      0
PercentOffline      67
TotalPlayers         0
TotalTournaments     0
dtype: int64

In [88]:
# Adding a year column to the DataFrame
earning["Year"] = pd.DatetimeIndex(earning["Date"]).year
print(earning.head())

         Date                          Game  Earnings  Players  Tournaments  \
0  1998-01-01  Command & Conquer: Red Alert   15000.0        8            1   
1  1998-01-01                    QuakeWorld   15000.0        8            1   
2  1998-05-01                      Quake II   15000.0        4            1   
3  1998-05-01            Total Annihilation   15000.0        2            1   
4  1998-05-01                    QuakeWorld     300.0        3            1   

   Year  
0  1998  
1  1998  
2  1998  
3  1998  
4  1998  


In [553]:
# Checking the different Genres
print("The number of genres: " + str(games['Genre'].nunique()))
print(games['Genre'].unique())
print("The number of games: " + str(games['Game'].nunique()))
print(games['Game'].unique())
# Changing the names so it won't be too long 
games = games.apply(lambda x: x.replace({'Multiplayer Online Battle Arena':'MOBA', 'Third-Person Shooter':'TPS', "Role-Playing": 'RPG', "First-Person Shooter": "FPS"}, regex=True))

The number of genres: 12
['Strategy' 'FPS' 'MOBA' 'RPG Game' 'Fighting Game' 'Racing' 'Sports'
 'Collectible Card Game' 'Puzzle Game' 'Battle Royale' 'TPS'
 'Music / Rhythm Game']
The number of games: 650
['Age of Empires' 'Age of Empires II' 'Age of Empires III'
 'Age of Empires IV' 'Age of Empires Online' 'Age of Mythology' 'Among Us'
 'Auto Chess' 'Brawl Stars' 'Chess.com' 'chess24' 'chessarena.com'
 'Clash of Clans' 'Clash Royale' 'Clash Royale x Chess'
 'Command & Conquer 3' 'Command & Conquer: Red Alert'
 'Company of Heroes 2' 'ComPet' 'Dead By Daylight'
 'Dead by Daylight Mobile' 'Dota Underlords' 'Endgods'
 'Farming Simulator 19' 'Farming Simulator 22' 'Grey Goo'
 'Heroes of Might and Magic III' 'Identity V' 'Internet Chess Club'
 'Iron Harvest' 'Legion TD 2' 'Lichess' 'Loco Dojo Unleashed'
 'Men of War: Assault Squad 2' 'Multigaming' 'Playchess.com' 'Pokémon GO'
 'Pokémon VGC' 'Prophecy' 'Rise of Nations: Rise of Legends'
 'Sea of Thieves' 'Star Wars: Squadrons' 'StarCraft II'

In [554]:
games_earning = earning.merge(games[['Game','Genre','ReleaseDate','TotalPlayers',]], how = 'left', on = 'Game')
print(games_earning.head())

         Date                          Game  Earnings  Players  Tournaments  \
0  1998-01-01  Command & Conquer: Red Alert   15000.0        8            1   
1  1998-01-01                    QuakeWorld   15000.0        8            1   
2  1998-05-01                      Quake II   15000.0        4            1   
3  1998-05-01            Total Annihilation   15000.0        2            1   
4  1998-05-01                    QuakeWorld     300.0        3            1   

   Year     Genre  ReleaseDate  TotalPlayers  
0  1998  Strategy         1996             8  
1  1998       FPS         1996            59  
2  1998       FPS         1997            37  
3  1998  Strategy         1997             2  
4  1998       FPS         1996            59  


In [555]:
games_earning = games_earning.loc[(games_earning["Earnings"]>0) & (games_earning["Players"]>0) ]
replace_names_dict = {
    "PLAYERUNKNOWN'S BATTLEGROUNDS Mobile": 'PUBG mobile',
    "PLAYERUNKNOWN’S BATTLEGROUNDS": 'PUBG',
    "Counter-Strike: Global Offensive": 'Counter-Strike'
}
games_earning['Game'] = games_earning['Game'].replace(replace_names_dict)
val = ['Overwatch', 'Rainbow Six Siege', 'StarCraft II', "PUBG", 'PUBG mobile', 'Arena of Valor', 'League of Legends', 'Fortnite', 'Counter-Strike', 'Dota 2']
games_earning = games_earning[games_earning['Game'].isin(val)]
games_earning.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1161 entries, 52 to 9237
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Date          1161 non-null   object 
 1   Game          1161 non-null   object 
 2   Earnings      1161 non-null   float64
 3   Players       1161 non-null   int64  
 4   Tournaments   1161 non-null   int64  
 5   Year          1161 non-null   int32  
 6   Genre         1161 non-null   object 
 7   ReleaseDate   1161 non-null   int64  
 8   TotalPlayers  1161 non-null   int64  
dtypes: float64(1), int32(1), int64(4), object(3)
memory usage: 86.2+ KB


In [556]:
games_earning_pivot = pd.DataFrame()
games_earning_pivot = pd.pivot_table(games_earning, values = ['Game','Genre', 'Players','Tournaments', 'Earnings','TotalPlayers'],\
                                     index = ['Game','Year','Players', 'Earnings','Tournaments','TotalPlayers'], aggfunc = {'Genre': 'first',})
games_earning_pivot.reset_index(inplace=True)
games_earning_pivot = games_earning_pivot.loc[(games_earning_pivot["Earnings"]>100000) & (games_earning_pivot["Players"]>10)]
print(games_earning_pivot.head(10))
games_earning_pivot.info()

              Game  Year  Players    Earnings  Tournaments  TotalPlayers Genre
1   Arena of Valor  2016       25   268305.50            1          1986  MOBA
2   Arena of Valor  2017       40   116610.00            1          1986  MOBA
4   Arena of Valor  2017       66   403146.24            1          1986  MOBA
5   Arena of Valor  2017      100   530785.05            6          1986  MOBA
6   Arena of Valor  2017      129   722847.80            3          1986  MOBA
7   Arena of Valor  2017      133   411824.23            3          1986  MOBA
11  Arena of Valor  2018       61   137891.56            2          1986  MOBA
12  Arena of Valor  2018      100   222562.09            3          1986  MOBA
13  Arena of Valor  2018      112   295192.40            3          1986  MOBA
14  Arena of Valor  2018      116  2329160.00            3          1986  MOBA
<class 'pandas.core.frame.DataFrame'>
Index: 846 entries, 1 to 1159
Data columns (total 7 columns):
 #   Column        Non-Null Cou

In [557]:
features = ['Game','Year','Earnings', 'Genre','ReleaseDate','Tournaments','TotalPlayers']
X = games_earning_pivot.drop(['Players'], axis=1)
y = games_earning_pivot['Players']
X_train, X_test, y_train, y_test = train_test_split (X, y, test_size=0.25, random_state=37)
X.head()



Unnamed: 0,Game,Year,Earnings,Tournaments,TotalPlayers,Genre
1,Arena of Valor,2016,268305.5,1,1986,MOBA
2,Arena of Valor,2017,116610.0,1,1986,MOBA
4,Arena of Valor,2017,403146.24,1,1986,MOBA
5,Arena of Valor,2017,530785.05,6,1986,MOBA
6,Arena of Valor,2017,722847.8,3,1986,MOBA


In [558]:
non_numeric_cols = X_train.select_dtypes(include=['object']).columns
X_train = pd.get_dummies(X_train, columns=non_numeric_cols)

In [559]:
non_numeric_cols_test = X_test.select_dtypes(include=['object']).columns
X_test = pd.get_dummies(X_test, columns=non_numeric_cols)
X_test.describe()
X_test.info()


<class 'pandas.core.frame.DataFrame'>
Index: 212 entries, 279 to 483
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Year                    212 non-null    int32  
 1   Earnings                212 non-null    float64
 2   Tournaments             212 non-null    int64  
 3   TotalPlayers            212 non-null    int64  
 4   Game_Arena of Valor     212 non-null    bool   
 5   Game_Counter-Strike     212 non-null    bool   
 6   Game_Dota 2             212 non-null    bool   
 7   Game_Fortnite           212 non-null    bool   
 8   Game_League of Legends  212 non-null    bool   
 9   Game_Overwatch          212 non-null    bool   
 10  Game_PUBG               212 non-null    bool   
 11  Game_PUBG mobile        212 non-null    bool   
 12  Game_Rainbow Six Siege  212 non-null    bool   
 13  Game_StarCraft II       212 non-null    bool   
 14  Genre_Battle Royale     212 non-null    bool 

In [560]:
# Tune hyperparameters, find the best set of hyperparameters

# Random Forest
# Crossvalidating to find the best hyperparameters
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf = RandomForestClassifier(random_state=37)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs = -1)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_



In [561]:
best_params

{'bootstrap': True,
 'max_depth': 10,
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'n_estimators': 100}

In [562]:
# Create and train the Random Forest model
zero_division = 1
rf_model = RandomForestClassifier(bootstrap=True,max_depth=None,min_samples_leaf=4,min_samples_split=10,n_estimators=200)
rf_model.fit(X_train, y_train.values.ravel())

# Make predictions on the test data
rf_predictions = rf_model.predict(X_test)

# Evaluate the Random Forest model
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_classification_report = classification_report(y_test, rf_predictions)
print(f"Accuracy: {rf_accuracy}")
print("Classification Report:\n", rf_classification_report)

Accuracy: 0.018867924528301886
Classification Report:
               precision    recall  f1-score   support

          25       0.00      0.00      0.00         1
          34       0.00      0.00      0.00         1
          40       0.00      0.00      0.00         1
          41       0.00      0.00      0.00         1
          43       0.00      0.00      0.00         1
          49       0.00      0.00      0.00         1
          55       0.00      0.00      0.00         0
          56       0.00      0.00      0.00         0
          60       1.00      0.50      0.67         2
          62       0.00      0.00      0.00         1
          65       0.00      0.00      0.00         0
          66       0.00      0.00      0.00         1
          69       0.00      0.00      0.00         0
          72       0.00      0.00      0.00         1
          76       0.00      0.00      0.00         2
          77       0.17      1.00      0.29         1
          82       0.00   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [563]:
best_rf_model.fit(X_train, y_train)
y_pred = best_rf_model.predict(X_test)
accuracy = accuracy_score( y_test, y_pred)
print(f'Accuracy: {accuracy}')


Accuracy: 0.009433962264150943


idk :( 
best i could get is 0.05 
i tried changing target variables, prediction methods, but it didnt work
whatever i try, i assume it might be the dataset problem as the model can't determine true and false positives, therefore f1 score is 0 etc. 
I will visualize some graphs for the presentation tomorrow.