In [60]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords  
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier,LogisticRegression
from sklearn import model_selection, naive_bayes 
from sklearn.svm import SVC
from sklearn.tree import ExtraTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

## Section 1: Prediction using player ratings

### Section 1.1. Data Read

In [61]:
df_PlayerRatings = pd.read_csv(r'dataPlayerRatings.csv')
df_PlayerRatings.head(10)

Unnamed: 0,Team_0 (started DFD 1st H),Team_1 (started ATK 1st H),Bind,Haven,Split,Ascent,Icebox,Breeze,Team_0_P1,Team_0_P2,Team_0_P3,Team_0_P4,Team_0_P5,Team_1_P1,Team_1_P2,Team_1_P3,Team_1_P4,Team_1_P5,Winner
0,Kansas City Pioneers,Virtuoso,0,0,0,1,0,0,1.09,1.09,1.01,1.15,1.04,0.99,0.95,1.04,0.97,1.14,0
1,Virtuoso,Kansas City Pioneers,1,0,0,0,0,0,0.99,0.95,1.04,0.97,1.14,1.09,1.09,1.01,1.15,1.04,1
2,Rise,Cloud9 Blue,0,0,1,0,0,0,1.09,1.1,1.16,1.04,1.07,1.22,0.85,1.1,1.09,1.07,0
3,Cloud9 Blue,Rise,0,0,0,1,0,0,1.22,0.85,1.1,1.09,1.07,1.09,1.1,1.16,1.04,1.07,1
4,Luminosity Gaming,Version1,0,1,0,0,0,0,1.15,1.05,1.09,1.07,1.17,1.19,1.03,1.11,1.04,1.01,1
5,Version1,Luminosity Gaming,0,0,1,0,0,0,1.19,1.03,1.11,1.04,1.01,1.15,1.05,1.09,1.07,1.17,0
6,Andbox,GenG Esports,0,0,0,0,0,1,1.27,0.98,1.15,1.02,0.98,1.11,0.96,1.24,0.95,0.99,1
7,Andbox,GenG Esports,0,0,0,0,1,0,1.27,0.98,1.15,1.02,0.98,1.11,0.96,1.24,0.95,0.99,0
8,GenG Esports,Andbox,0,0,1,0,0,0,1.11,0.96,1.24,0.95,0.99,1.27,0.98,1.15,1.02,0.98,0
9,Noble,100 Thieves,0,1,0,0,0,0,1.19,1.09,0.98,1.06,0.9,1.26,1.08,1.08,1.05,0.91,1


### Section 1.2. Data Preprocessing using train_test_split()

In [62]:
X_train, X_test, y_train, y_test = train_test_split(df_PlayerRatings.drop(['Team_0 (started DFD 1st H)','Team_1 (started ATK 1st H)','Winner'],axis=1), df_PlayerRatings['Winner'], test_size=0.20, random_state=42)
y_train=y_train.astype('int')
y_test=y_test.astype('int')

### Section 1.3. Prediction results using train_test_split()

#### Logistic Regression

In [63]:
pipeline_LogReg = make_pipeline(StandardScaler(), LogisticRegression(max_iter=100)) 
pipeline_LogReg.fit(X_train, y_train)
pred_LogReg = pipeline_LogReg.predict(X_test)
print(classification_report(y_test,pred_LogReg))

              precision    recall  f1-score   support

           0       0.53      0.62      0.57        13
           1       0.71      0.63      0.67        19

    accuracy                           0.62        32
   macro avg       0.62      0.62      0.62        32
weighted avg       0.64      0.62      0.63        32



#### Stochastic Gradient Descent Classifier

In [64]:
pipeline_SGD = make_pipeline(StandardScaler(), SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, max_iter=1000))
pipeline_SGD.fit(X_train, y_train)
pred_SGD = pipeline_SGD.predict(X_test)
print(classification_report(y_test,pred_SGD))

              precision    recall  f1-score   support

           0       0.60      0.69      0.64        13
           1       0.76      0.68      0.72        19

    accuracy                           0.69        32
   macro avg       0.68      0.69      0.68        32
weighted avg       0.70      0.69      0.69        32



#### Support Vector Classifier (linear kernel)

In [65]:
pipeline_SVC = make_pipeline(StandardScaler(), SVC(C=1.0, kernel='linear', degree=3, gamma='scale'))
pipeline_SVC.fit(X_train, y_train)
pred_SVC = pipeline_SVC.predict(X_test)
print(classification_report(y_test,pred_SVC))

              precision    recall  f1-score   support

           0       0.46      0.46      0.46        13
           1       0.63      0.63      0.63        19

    accuracy                           0.56        32
   macro avg       0.55      0.55      0.55        32
weighted avg       0.56      0.56      0.56        32



#### Random Forest Classifier

In [66]:
pipeline_RF = make_pipeline(StandardScaler(), RandomForestClassifier(max_depth=None))
pipeline_RF.fit(X_train, y_train)
pred_RF = pipeline_RF.predict(X_test)
print(classification_report(y_test,pred_RF))

              precision    recall  f1-score   support

           0       0.54      0.54      0.54        13
           1       0.68      0.68      0.68        19

    accuracy                           0.62        32
   macro avg       0.61      0.61      0.61        32
weighted avg       0.62      0.62      0.62        32



#### Extra Tree Classifier

In [67]:
pipeline_ET = make_pipeline(StandardScaler(), ExtraTreeClassifier())
pipeline_ET.fit(X_train, y_train)
pred_ET = pipeline_ET.predict(X_test)
print(classification_report(y_test,pred_ET))

              precision    recall  f1-score   support

           0       0.67      0.62      0.64        13
           1       0.75      0.79      0.77        19

    accuracy                           0.72        32
   macro avg       0.71      0.70      0.70        32
weighted avg       0.72      0.72      0.72        32



### Section 1.4. Data Preprocessing using normal split (first 128 rows as train, rest 32 as test)

In [68]:
X_train = df_PlayerRatings.drop(['Team_0 (started DFD 1st H)','Team_1 (started ATK 1st H)','Winner'],axis=1)
X_train = X_train.iloc[0:128]
X_test = df_PlayerRatings.drop(['Team_0 (started DFD 1st H)','Team_1 (started ATK 1st H)','Winner'],axis=1)
X_test = X_test.iloc[128:160]
y_train = df_PlayerRatings.loc[0:127,'Winner']
y_test = df_PlayerRatings.loc[128:159,'Winner']
y_train=y_train.astype('int')
y_test=y_test.astype('int')

### Section 1.5. Prediction results using  normal split

#### Logistic Regression

In [69]:
pipeline_LogReg = make_pipeline(StandardScaler(), LogisticRegression(max_iter=100))
pipeline_LogReg.fit(X_train, y_train)
pred_LogReg = pipeline_LogReg.predict(X_test)
print(classification_report(y_test,pred_LogReg))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85        16
           1       0.87      0.81      0.84        16

    accuracy                           0.84        32
   macro avg       0.85      0.84      0.84        32
weighted avg       0.85      0.84      0.84        32



#### Stochastic Gradient Descent Classifier

In [70]:
pipeline_SGD = make_pipeline(StandardScaler(), SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=1000))
pipeline_SGD.fit(X_train, y_train)
pred_SGD = pipeline_SGD.predict(X_test)
print(classification_report(y_test,pred_SGD))

              precision    recall  f1-score   support

           0       0.68      0.81      0.74        16
           1       0.77      0.62      0.69        16

    accuracy                           0.72        32
   macro avg       0.73      0.72      0.72        32
weighted avg       0.73      0.72      0.72        32



#### Support Vector Classifier (linear kernel)

In [71]:
pipeline_SVC = make_pipeline(StandardScaler(), SVC(C=1.0, kernel='linear', degree=3, gamma='scale'))
pipeline_SVC.fit(X_train, y_train)
pred_SVC = pipeline_SVC.predict(X_test)
print(classification_report(y_test,pred_SVC))

              precision    recall  f1-score   support

           0       0.81      0.81      0.81        16
           1       0.81      0.81      0.81        16

    accuracy                           0.81        32
   macro avg       0.81      0.81      0.81        32
weighted avg       0.81      0.81      0.81        32



#### Random Forest Classifier

In [72]:
pipeline_RF = make_pipeline(StandardScaler(), RandomForestClassifier(max_depth=None))
pipeline_RF.fit(X_train, y_train)
pred_RF = pipeline_RF.predict(X_test)
print(classification_report(y_test,pred_RF))

              precision    recall  f1-score   support

           0       0.71      0.94      0.81        16
           1       0.91      0.62      0.74        16

    accuracy                           0.78        32
   macro avg       0.81      0.78      0.78        32
weighted avg       0.81      0.78      0.78        32



#### Extra Tree Classifier

In [73]:
pipeline_ET = make_pipeline(StandardScaler(), ExtraTreeClassifier())
pipeline_ET.fit(X_train, y_train)
pred_ET = pipeline_ET.predict(X_test)
print(classification_report(y_test,pred_ET))

              precision    recall  f1-score   support

           0       0.50      0.62      0.56        16
           1       0.50      0.38      0.43        16

    accuracy                           0.50        32
   macro avg       0.50      0.50      0.49        32
weighted avg       0.50      0.50      0.49        32



## Section 2: Prediction using match statistics data

### Section 2.1. Data Read

In [74]:
df_MatchData = pd.read_csv(r'dataMatchStats.csv')
df_MatchData.head(10)

Unnamed: 0,Team_0 (started DFD 1st H),Team_1 (started ATK 1st H),Bind,Haven,Split,Ascent,Icebox,Breeze,Team_0 ACS,Team_0 K,...,Team_1 K,Team_1 D,Team_1 A,Team_1 KD,Team_1 ADR,Team_1 HS,Team_1 ESR,Team_1 FB,Team_1 FD,Winner
0,Kansas City Pioneers,Virtuoso,0,0,0,1,0,0,240.8,13.8,...,8.2,13.8,2.8,0.594,89.42,22.6,31.0,1.2,2.0,0
1,Virtuoso,Kansas City Pioneers,1,0,0,0,0,0,146.2,8.6,...,16.0,8.6,3.6,1.86,133.76,29.4,73.4,2.8,0.8,1
2,Rise,Cloud9 Blue,0,0,1,0,0,0,237.4,17.0,...,12.8,17.0,5.8,0.753,114.64,18.8,48.8,1.6,2.4,0
3,Cloud9 Blue,Rise,0,0,0,1,0,0,196.0,16.0,...,18.2,16.0,6.4,1.137,142.28,24.4,62.6,2.8,2.0,1
4,Luminosity Gaming,Version1,0,1,0,0,0,0,205.0,15.0,...,16.0,15.0,7.4,1.067,134.58,20.2,28.8,1.8,2.4,1
5,Version1,Luminosity Gaming,0,0,1,0,0,0,223.2,16.6,...,13.4,16.6,5.2,0.807,135.24,25.8,49.0,2.6,1.6,0
6,Andbox,GenG Esports,0,0,0,0,0,1,169.2,10.0,...,15.8,10.0,5.6,1.58,151.42,28.4,56.0,2.6,1.0,1
7,Andbox,GenG Esports,0,0,0,0,1,0,206.6,18.4,...,18.6,18.4,8.8,1.011,138.84,17.4,59.2,2.8,2.4,0
8,GenG Esports,Andbox,0,0,1,0,0,0,231.4,15.8,...,11.0,15.8,3.0,0.696,117.74,16.6,24.8,1.0,2.8,0
9,Noble,100 Thieves,0,1,0,0,0,0,80.4,14.6,...,16.4,14.6,5.4,1.123,52.04,15.8,58.0,1.0,0.8,1


### Section 2.2. Data Preprocessing using train_test_split()

In [75]:
X_train, X_test, y_train, y_test = train_test_split(df_MatchData.drop(['Team_0 (started DFD 1st H)','Team_1 (started ATK 1st H)','Winner'],axis=1), df_MatchData['Winner'], test_size=0.20, random_state=42)
y_train=y_train.astype('int')
y_test=y_test.astype('int')

### Section 2.3. Prediction results using train_test_split()

#### Logistic Regression

In [76]:
pipeline_LogReg = make_pipeline(StandardScaler(), LogisticRegression(max_iter=100)) 
pipeline_LogReg.fit(X_train, y_train)
pred_LogReg = pipeline_LogReg.predict(X_test)
print(classification_report(y_test,pred_LogReg))

              precision    recall  f1-score   support

           0       0.81      1.00      0.90        13
           1       1.00      0.84      0.91        19

    accuracy                           0.91        32
   macro avg       0.91      0.92      0.91        32
weighted avg       0.92      0.91      0.91        32



#### Stochastic Gradient Descent Classifier

In [77]:
pipeline_SGD = make_pipeline(StandardScaler(), SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=1000))
pipeline_SGD.fit(X_train, y_train)
pred_SGD = pipeline_SGD.predict(X_test)
print(classification_report(y_test,pred_SGD))

              precision    recall  f1-score   support

           0       0.80      0.92      0.86        13
           1       0.94      0.84      0.89        19

    accuracy                           0.88        32
   macro avg       0.87      0.88      0.87        32
weighted avg       0.88      0.88      0.88        32



#### Support Vector Classifier (linear kernel)

In [78]:
pipeline_SVC = make_pipeline(StandardScaler(), SVC(C=1.0, kernel='linear', degree=3, gamma='scale'))
pipeline_SVC.fit(X_train, y_train)
pred_SVC = pipeline_SVC.predict(X_test)
print(classification_report(y_test,pred_SVC))

              precision    recall  f1-score   support

           0       0.75      0.92      0.83        13
           1       0.94      0.79      0.86        19

    accuracy                           0.84        32
   macro avg       0.84      0.86      0.84        32
weighted avg       0.86      0.84      0.85        32



#### Random Forest Classifier

In [79]:
pipeline_RF = make_pipeline(StandardScaler(), RandomForestClassifier(max_depth=None))
pipeline_RF.fit(X_train, y_train)
pred_RF = pipeline_RF.predict(X_test)
print(classification_report(y_test,pred_RF))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96        13
           1       1.00      0.95      0.97        19

    accuracy                           0.97        32
   macro avg       0.96      0.97      0.97        32
weighted avg       0.97      0.97      0.97        32



#### Extra Tree Classifier

In [80]:
pipeline_ET = make_pipeline(StandardScaler(), ExtraTreeClassifier())
pipeline_ET.fit(X_train, y_train)
pred_ET = pipeline_ET.predict(X_test)
print(classification_report(y_test,pred_ET))

              precision    recall  f1-score   support

           0       0.86      0.92      0.89        13
           1       0.94      0.89      0.92        19

    accuracy                           0.91        32
   macro avg       0.90      0.91      0.90        32
weighted avg       0.91      0.91      0.91        32



### Section 2.4. Data Preprocessing using normal split (first 128 rows as train, rest 32 as test)

In [81]:
X_train = df_MatchData.drop(['Team_0 (started DFD 1st H)','Team_1 (started ATK 1st H)','Winner'],axis=1)
X_train = X_train.iloc[0:128]
X_test = df_MatchData.drop(['Team_0 (started DFD 1st H)','Team_1 (started ATK 1st H)','Winner'],axis=1)
X_test = X_test.iloc[128:160]
y_train = df_MatchData.loc[0:127,'Winner']
y_test = df_MatchData.loc[128:159,'Winner']
y_train=y_train.astype('int')
y_test=y_test.astype('int')

### Section 2.5. Prediction results using normal split

#### Logistic Regression

In [82]:
pipeline_LogReg = make_pipeline(StandardScaler(), LogisticRegression(max_iter=100)) 
pipeline_LogReg.fit(X_train, y_train)
pred_LogReg = pipeline_LogReg.predict(X_test)
print(classification_report(y_test,pred_LogReg))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97        16
           1       1.00      0.94      0.97        16

    accuracy                           0.97        32
   macro avg       0.97      0.97      0.97        32
weighted avg       0.97      0.97      0.97        32



#### Stochastic Gradient Descent Classifier

In [83]:
pipeline_SGD = make_pipeline(StandardScaler(), SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=1000))
pipeline_SGD.fit(X_train, y_train)
pred_SGD = pipeline_SGD.predict(X_test)
print(classification_report(y_test,pred_SGD))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85        16
           1       0.87      0.81      0.84        16

    accuracy                           0.84        32
   macro avg       0.85      0.84      0.84        32
weighted avg       0.85      0.84      0.84        32



#### Support Vector Classifier (linear kernel)

In [84]:
pipeline_SVC = make_pipeline(StandardScaler(), SVC(C=1.0, kernel='linear', degree=3, gamma='scale'))
pipeline_SVC.fit(X_train, y_train)
pred_SVC = pipeline_SVC.predict(X_test)
print(classification_report(y_test,pred_SVC))

              precision    recall  f1-score   support

           0       0.89      1.00      0.94        16
           1       1.00      0.88      0.93        16

    accuracy                           0.94        32
   macro avg       0.94      0.94      0.94        32
weighted avg       0.94      0.94      0.94        32



#### Random Forest Classifier

In [85]:
pipeline_RF = make_pipeline(StandardScaler(), RandomForestClassifier(max_depth=None))
pipeline_RF.fit(X_train, y_train)
pred_RF = pipeline_RF.predict(X_test)
print(classification_report(y_test,pred_RF))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97        16
           1       1.00      0.94      0.97        16

    accuracy                           0.97        32
   macro avg       0.97      0.97      0.97        32
weighted avg       0.97      0.97      0.97        32



#### Extra Tree Classifier

In [86]:
pipeline_ET = make_pipeline(StandardScaler(), ExtraTreeClassifier())
pipeline_ET.fit(X_train, y_train)
pred_ET = pipeline_ET.predict(X_test)
print(classification_report(y_test,pred_ET))

              precision    recall  f1-score   support

           0       0.88      0.94      0.91        16
           1       0.93      0.88      0.90        16

    accuracy                           0.91        32
   macro avg       0.91      0.91      0.91        32
weighted avg       0.91      0.91      0.91        32



## Section 3: Comparative Analysis

In [90]:
data = {
    "Algorithm": ["Logistic Regression", "SGD Classifier", "SVC Linear", "Random Forest", "Extra Tree"],
    "Player Ratings (train_test_split)": [0.62, 0.69, 0.56, 0.62, 0.72],
    "Player Ratings (normal_split)": [0.84, 0.72, 0.81, 0.78, 0.50],
    "Match Data (train_test_split)": [0.91, 0.88, 0.84, 0.97, 0.91],
    "Match Data (normal_split)": [0.97, 0.84, 0.94, 0.97, 0.91]
}

df = pd.DataFrame(data)
df.head()

Unnamed: 0,Algorithm,Player Ratings (train_test_split),Player Ratings (normal_split),Match Data (train_test_split),Match Data (normal_split)
0,Logistic Regression,0.62,0.84,0.91,0.97
1,SGD Classifier,0.69,0.72,0.88,0.84
2,SVC Linear,0.56,0.81,0.84,0.94
3,Random Forest,0.62,0.78,0.97,0.97
4,Extra Tree,0.72,0.5,0.91,0.91
