<a href="https://colab.research.google.com/github/samueleallen/Scraping-Val-Data/blob/main/ValorantMatchPredictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [40]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Step 1: Cleaning/Prepping data for Machine Learning
Task List:
 * Aggregate player data to one row for each team  
 * Convert column types appropriately (e.g., object type to float for R2.0 feature)
 * Investigate missing data

In [41]:
matches_df = pd.read_csv("/content/drive/My Drive/Colab CSV Files/overall_game_stats.csv")
matches_df.head()

Unnamed: 0,Player,R2.0,ACS,K,D,A,+/- K/D,KAST,ADR,HS%,FK,FD,+/- FK/FD,date,Team,vs Team,result,final_score
0,jawgemo,1.18,243.0,103,89,44,14,78%,160.0,28%,12.0,20.0,-8.0,3/2/2025,G2 Esports,T1,Loss,'2 : 3'
1,trent,1.11,224.0,92,79,40,13,75%,145.0,29%,14.0,9.0,5.0,3/2/2025,G2 Esports,T1,Loss,'2 : 3'
2,valyn,1.02,210.0,94,82,51,12,77%,138.0,25%,10.0,10.0,0.0,3/2/2025,G2 Esports,T1,Loss,'2 : 3'
3,leaf,0.81,190.0,79,86,31,-7,75%,119.0,21%,13.0,15.0,-2.0,3/2/2025,G2 Esports,T1,Loss,'2 : 3'
4,JonahP,0.92,162.0,74,87,42,-13,75%,105.0,28%,7.0,11.0,-4.0,3/2/2025,G2 Esports,T1,Loss,'2 : 3'


In [42]:
# Define which columns should be aggregated using the mean
numeric_columns = ['R2.0', 'ACS', 'K', 'D', 'A', '+/- K/D', 'KAST', 'ADR', 'HS%', 'FK', 'FD', '+/- FK/FD']

# Convert date column from object to datetime
matches_df["date"] = pd.to_datetime(matches_df["date"])

# Convert percentage columns to numeric values (remove "%" and convert to float)
matches_df["KAST"] = matches_df["KAST"].str.rstrip('%').astype(float)
matches_df["HS%"] = matches_df["HS%"].str.rstrip('%').astype(float)

# Group each team match into one row
matches_df = matches_df.groupby(['date', 'Team', 'vs Team', 'result', 'final_score']).agg({col: 'mean' for col in numeric_columns}).reset_index()
matches_df.sort_values(by='date')
# Save to CSV file
# matches_df.to_csv('aggregated_matches.csv', index=False)
matches_df.head()


Unnamed: 0,date,Team,vs Team,result,final_score,R2.0,ACS,K,D,A,+/- K/D,KAST,ADR,HS%,FK,FD,+/- FK/FD
0,2023-02-13,DetonatioN FocusMe,Giants Gaming,Loss,'0 : 2',0.768333,166.5,17.333333,24.833333,6.833333,-7.5,60.0,107.333333,33.0,2.5,3.666667,-1.166667
1,2023-02-13,Giants Gaming,DetonatioN FocusMe,Win,'2 : 0',1.248,218.0,29.8,20.8,11.0,9.0,83.6,148.8,30.4,4.4,3.0,1.4
2,2023-02-13,KOI,NRG Esports,Loss,'0 : 2',1.034,170.6,27.0,30.4,12.6,-3.4,68.0,113.0,25.6,3.2,6.0,-2.8
3,2023-02-13,NRG Esports,KOI,Win,'2 : 0',1.198,196.4,30.4,27.0,13.8,3.4,77.2,126.8,24.0,6.0,3.2,2.8
4,2023-02-14,BBL Esports,DRX,Loss,'1 : 2',0.876,175.8,37.6,45.8,15.2,-8.2,67.2,117.0,28.4,5.2,6.8,-1.6


In [43]:
matches_df.shape

(1536, 17)

In [44]:
matches_df.dtypes

Unnamed: 0,0
date,datetime64[ns]
Team,object
vs Team,object
result,object
final_score,object
R2.0,float64
ACS,float64
K,float64
D,float64
A,float64


In [45]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Handle missing values
matches_df.fillna(0, inplace=True)

# Convert percentage columns like 'HS%' and 'KAST' to numeric
matches_df['HS%'] = pd.to_numeric(matches_df['HS%'], errors='coerce')  # Convert to numeric, handle errors by coercing to NaN
matches_df['KAST'] = pd.to_numeric(matches_df['KAST'], errors='coerce')

# Encode categorical columns like 'Player', 'Team', 'vs Team'
label_encoder = LabelEncoder()

# Convert 'final_score' to two separate columns for individual scores
matches_df[['final_score_winner', 'final_score_loser']] = matches_df['final_score'].str.split(':', expand=True)

# Convert the split columns to numeric
matches_df['final_score_winner'] = pd.to_numeric(matches_df['final_score_winner'], errors='coerce')
matches_df['final_score_loser'] = pd.to_numeric(matches_df['final_score_loser'], errors='coerce')

# Drop 'final_score' column as it is now split into 'final_score_winner' and 'final_score_loser'
matches_df.drop(columns=['final_score'], inplace=True)

# Convert 'result' to integers (0 for Loss, 1 for Win)
matches_df['result'] = matches_df['result'].replace({'Loss': 0, 'Win': 1})

# Check the result
matches_df.dtypes

  matches_df['result'] = matches_df['result'].replace({'Loss': 0, 'Win': 1})


Unnamed: 0,0
date,datetime64[ns]
Team,object
vs Team,object
result,int64
R2.0,float64
ACS,float64
K,float64
D,float64
A,float64
+/- K/D,float64


## Step 2: Creating Initial Machine Learning Model

### 1. Creating Predictors for Machine Learning
 * Select relevant pre-match features
 * Calculate past performance stats (e.g., rolling averages)
 * Ensure no data leakage from test set

### 2. Creating Initial Machine Learning model
 * Split data into training and test sets
 * Initialize machine learning model
 * Train the model using `train` data

In [46]:
from sklearn.ensemble import RandomForestClassifier
# maybe experiment with these values
rf = RandomForestClassifier(n_estimators=50, min_samples_split = 10, random_state = 1)

In [47]:
# Filter the dataset based on date
train = matches_df[matches_df["date"] < '2025-02-19']
test = matches_df[matches_df["date"] >= '2025-02-19']

# Convert 'date' column to numeric (days since the first date)
train['date'] = (train['date'] - train['date'].min()).dt.days
test['date'] = (test['date'] - test['date'].min()).dt.days


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['date'] = (train['date'] - train['date'].min()).dt.days
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['date'] = (test['date'] - test['date'].min()).dt.days


In [48]:
predictors = ['K', 'D', 'A', '+/- K/D', 'ADR', 'HS%', 'FK', 'FD', 'KAST', 'date']
# removed ['Team', 'vs Team'] for

In [49]:
print(test[predictors].dtypes)


K          float64
D          float64
A          float64
+/- K/D    float64
ADR        float64
HS%        float64
FK         float64
FD         float64
KAST       float64
date         int64
dtype: object


In [50]:
rf.fit(train[predictors], train["result"])

In [51]:
# Now fit the model and predict
rf.fit(train[predictors], train["result"])
preds = rf.predict(test[predictors])

In [52]:
from sklearn.metrics import accuracy_score

In [53]:
acc = accuracy_score(test["result"], preds)

In [54]:
acc

0.96875

In [55]:
# Lets try to implement rolling averages!

In [56]:
combined = pd.DataFrame(dict(actual=test["result"], predicted=preds))
pd.crosstab(index=combined["actual"], columns=combined["predicted"])

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,16,0
1,1,15


In [57]:
from sklearn.metrics import precision_score

In [58]:
precision_score(test["result"], preds)

1.0

In [59]:
grouped_matches = matches_df.groupby("Team")

In [60]:
# Each team has a unique integer ID rather than a string for team name
group = grouped_matches.get_group("G2 Esports")

In [61]:
group

Unnamed: 0,date,Team,vs Team,result,R2.0,ACS,K,D,A,+/- K/D,KAST,ADR,HS%,FK,FD,+/- FK/FD,final_score_winner,final_score_loser
542,2024-02-18,G2 Esports,KRÜ Esports,1,1.104,203.0,47.4,45.8,19.2,1.6,71.8,132.2,28.2,5.8,7.8,-2.0,,
611,2024-02-24,G2 Esports,Evil Geniuses,0,0.908,196.2,35.8,37.0,13.8,-1.2,72.2,131.8,31.2,4.4,6.0,-1.6,,
623,2024-02-25,G2 Esports,KRÜ Esports,1,1.29,211.8,31.0,26.0,15.8,5.0,79.0,144.2,25.2,4.2,4.2,0.0,,
639,2024-02-26,G2 Esports,MIBR,0,0.892,190.0,48.2,52.6,15.8,-4.4,69.8,128.2,33.2,7.4,7.2,0.2,,
640,2024-02-26,G2 Esports,Sentinels,1,0.848,195.0,45.0,44.0,17.6,1.0,71.8,126.4,27.2,7.0,6.0,1.0,,
744,2024-04-07,G2 Esports,Evil Geniuses,1,1.426,227.6,30.8,22.0,15.0,8.8,78.8,145.6,24.2,4.8,2.6,2.2,,
782,2024-04-13,G2 Esports,100 Thieves,0,0.882,187.2,27.0,30.6,15.0,-3.6,71.4,122.0,25.2,4.0,4.6,-0.6,,
855,2024-04-22,G2 Esports,FURIA,1,1.354,226.8,27.8,15.0,9.8,12.8,88.4,141.2,24.2,4.8,1.6,3.2,,
868,2024-04-24,G2 Esports,LEVIATÁN,0,0.812,181.8,38.6,45.8,10.8,-7.2,65.0,121.4,29.0,5.0,7.4,-2.4,,
892,2024-04-27,G2 Esports,MIBR,1,1.18,225.2,30.8,24.2,11.2,6.6,77.4,154.2,30.2,3.6,4.0,-0.4,,


In [62]:
# Now you can compute rolling averages
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")  # Sort group data by data
    numeric_cols = group[cols].select_dtypes(include=['number'])  # Select numeric columns, ignore dates
    rolling_stats = numeric_cols.rolling(3, closed='left').mean()  # Compute rolling mean
    # Assign the rolling stats back to the group, ensuring the new columns are added correctly
    for i, col in enumerate(rolling_stats.columns):
        group[new_cols[i]] = rolling_stats[col]
    # group = group.dropna(subset=new_cols)  # Drop rows with NaN in the new columns
    return group

In [63]:
cols = ['K', 'D', 'A', '+/- K/D', 'ADR', 'HS%', 'FK', 'FD', 'KAST', 'date']
new_cols = [f"{c}_rolling" for c in cols]

In [64]:
new_cols

['K_rolling',
 'D_rolling',
 'A_rolling',
 '+/- K/D_rolling',
 'ADR_rolling',
 'HS%_rolling',
 'FK_rolling',
 'FD_rolling',
 'KAST_rolling',
 'date_rolling']

In [65]:
rolling_averages(group, cols, new_cols)

Unnamed: 0,date,Team,vs Team,result,R2.0,ACS,K,D,A,+/- K/D,...,final_score_loser,K_rolling,D_rolling,A_rolling,+/- K/D_rolling,ADR_rolling,HS%_rolling,FK_rolling,FD_rolling,KAST_rolling
542,2024-02-18,G2 Esports,KRÜ Esports,1,1.104,203.0,47.4,45.8,19.2,1.6,...,,,,,,,,,,
611,2024-02-24,G2 Esports,Evil Geniuses,0,0.908,196.2,35.8,37.0,13.8,-1.2,...,,,,,,,,,,
623,2024-02-25,G2 Esports,KRÜ Esports,1,1.29,211.8,31.0,26.0,15.8,5.0,...,,,,,,,,,,
639,2024-02-26,G2 Esports,MIBR,0,0.892,190.0,48.2,52.6,15.8,-4.4,...,,38.066667,36.266667,16.266667,1.8,136.066667,28.2,4.8,6.0,74.333333
640,2024-02-26,G2 Esports,Sentinels,1,0.848,195.0,45.0,44.0,17.6,1.0,...,,38.333333,38.533333,15.133333,-0.2,134.733333,29.866667,5.333333,5.8,73.666667
744,2024-04-07,G2 Esports,Evil Geniuses,1,1.426,227.6,30.8,22.0,15.0,8.8,...,,41.4,40.866667,16.4,0.533333,132.933333,28.533333,6.2,5.8,73.533333
782,2024-04-13,G2 Esports,100 Thieves,0,0.882,187.2,27.0,30.6,15.0,-3.6,...,,41.333333,39.533333,16.133333,1.8,133.4,28.2,6.4,5.266667,73.466667
855,2024-04-22,G2 Esports,FURIA,1,1.354,226.8,27.8,15.0,9.8,12.8,...,,34.266667,32.2,15.866667,2.066667,131.333333,25.533333,5.266667,4.4,74.0
868,2024-04-24,G2 Esports,LEVIATÁN,0,0.812,181.8,38.6,45.8,10.8,-7.2,...,,28.533333,22.533333,13.266667,6.0,136.266667,24.533333,4.533333,2.933333,79.533333
892,2024-04-27,G2 Esports,MIBR,1,1.18,225.2,30.8,24.2,11.2,6.6,...,,31.133333,30.466667,11.866667,0.666667,128.2,26.133333,4.6,4.533333,74.933333


In [66]:
matches_rolling = matches_df.groupby("Team").apply(lambda x: rolling_averages(x, cols, new_cols))

  matches_rolling = matches_df.groupby("Team").apply(lambda x: rolling_averages(x, cols, new_cols))


In [67]:
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,date,Team,vs Team,result,R2.0,ACS,K,D,A,+/- K/D,...,final_score_loser,K_rolling,D_rolling,A_rolling,+/- K/D_rolling,ADR_rolling,HS%_rolling,FK_rolling,FD_rolling,KAST_rolling
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
100 Thieves,34,2023-02-23,100 Thieves,EDward Gaming,1,1.142,203.0,50.6,50.4,22.2,0.2,...,,,,,,,,,,
100 Thieves,48,2023-02-26,100 Thieves,FUT Esports,1,1.146,216.2,50.4,47.0,28.4,3.4,...,,,,,,,,,,
100 Thieves,52,2023-02-27,100 Thieves,FNATIC,0,0.574,167.6,21.6,31.8,7.4,-10.2,...,,,,,,,,,,
100 Thieves,92,2023-04-01,100 Thieves,Sentinels,0,0.848,185.4,48.8,56.0,23.6,-7.2,...,,40.866667,43.066667,19.333333,-2.200000,132.000000,26.133333,5.133333,6.533333,70.533333
100 Thieves,138,2023-04-10,100 Thieves,Evil Geniuses,1,1.220,224.0,32.2,23.6,14.4,8.6,...,,40.266667,44.933333,19.800000,-4.666667,126.466667,24.866667,6.333333,5.666667,70.133333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZETA DIVISION,1163,2024-06-29,ZETA DIVISION,Rex Regum Qeon,0,1.088,182.4,27.4,31.0,13.2,-3.6,...,,41.266667,46.733333,21.133333,-5.466667,126.266667,30.600000,5.466667,7.066667,69.933333
ZETA DIVISION,1227,2024-07-06,ZETA DIVISION,TALON,1,0.972,208.2,44.4,38.8,20.8,5.6,...,,34.800000,40.266667,17.800000,-5.466667,123.266667,31.600000,5.133333,5.733333,70.533333
ZETA DIVISION,1239,2024-07-07,ZETA DIVISION,Paper Rex,0,1.056,185.8,38.6,48.0,14.8,-9.4,...,,38.800000,38.933333,18.666667,-0.133333,129.866667,31.933333,5.600000,5.733333,72.266667
ZETA DIVISION,1375,2025-01-18,ZETA DIVISION,Nongshim RedForce,0,0.848,189.8,28.6,33.2,11.0,-4.6,...,,36.800000,39.266667,16.266667,-2.466667,127.200000,31.600000,5.266667,5.733333,69.733333


In [68]:
matches_rolling = matches_rolling.droplevel('Team')

In [69]:
matches_rolling

Unnamed: 0,date,Team,vs Team,result,R2.0,ACS,K,D,A,+/- K/D,...,final_score_loser,K_rolling,D_rolling,A_rolling,+/- K/D_rolling,ADR_rolling,HS%_rolling,FK_rolling,FD_rolling,KAST_rolling
34,2023-02-23,100 Thieves,EDward Gaming,1,1.142,203.0,50.6,50.4,22.2,0.2,...,,,,,,,,,,
48,2023-02-26,100 Thieves,FUT Esports,1,1.146,216.2,50.4,47.0,28.4,3.4,...,,,,,,,,,,
52,2023-02-27,100 Thieves,FNATIC,0,0.574,167.6,21.6,31.8,7.4,-10.2,...,,,,,,,,,,
92,2023-04-01,100 Thieves,Sentinels,0,0.848,185.4,48.8,56.0,23.6,-7.2,...,,40.866667,43.066667,19.333333,-2.200000,132.000000,26.133333,5.133333,6.533333,70.533333
138,2023-04-10,100 Thieves,Evil Geniuses,1,1.220,224.0,32.2,23.6,14.4,8.6,...,,40.266667,44.933333,19.800000,-4.666667,126.466667,24.866667,6.333333,5.666667,70.133333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1163,2024-06-29,ZETA DIVISION,Rex Regum Qeon,0,1.088,182.4,27.4,31.0,13.2,-3.6,...,,41.266667,46.733333,21.133333,-5.466667,126.266667,30.600000,5.466667,7.066667,69.933333
1227,2024-07-06,ZETA DIVISION,TALON,1,0.972,208.2,44.4,38.8,20.8,5.6,...,,34.800000,40.266667,17.800000,-5.466667,123.266667,31.600000,5.133333,5.733333,70.533333
1239,2024-07-07,ZETA DIVISION,Paper Rex,0,1.056,185.8,38.6,48.0,14.8,-9.4,...,,38.800000,38.933333,18.666667,-0.133333,129.866667,31.933333,5.600000,5.733333,72.266667
1375,2025-01-18,ZETA DIVISION,Nongshim RedForce,0,0.848,189.8,28.6,33.2,11.0,-4.6,...,,36.800000,39.266667,16.266667,-2.466667,127.200000,31.600000,5.266667,5.733333,69.733333


In [70]:
matches_rolling.index = range(matches_rolling.shape[0])

In [71]:
matches_rolling['date_rolling'] = matches_rolling['date']
matches_rolling

Unnamed: 0,date,Team,vs Team,result,R2.0,ACS,K,D,A,+/- K/D,...,K_rolling,D_rolling,A_rolling,+/- K/D_rolling,ADR_rolling,HS%_rolling,FK_rolling,FD_rolling,KAST_rolling,date_rolling
0,2023-02-23,100 Thieves,EDward Gaming,1,1.142,203.0,50.6,50.4,22.2,0.2,...,,,,,,,,,,2023-02-23
1,2023-02-26,100 Thieves,FUT Esports,1,1.146,216.2,50.4,47.0,28.4,3.4,...,,,,,,,,,,2023-02-26
2,2023-02-27,100 Thieves,FNATIC,0,0.574,167.6,21.6,31.8,7.4,-10.2,...,,,,,,,,,,2023-02-27
3,2023-04-01,100 Thieves,Sentinels,0,0.848,185.4,48.8,56.0,23.6,-7.2,...,40.866667,43.066667,19.333333,-2.200000,132.000000,26.133333,5.133333,6.533333,70.533333,2023-04-01
4,2023-04-10,100 Thieves,Evil Geniuses,1,1.220,224.0,32.2,23.6,14.4,8.6,...,40.266667,44.933333,19.800000,-4.666667,126.466667,24.866667,6.333333,5.666667,70.133333,2023-04-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1531,2024-06-29,ZETA DIVISION,Rex Regum Qeon,0,1.088,182.4,27.4,31.0,13.2,-3.6,...,41.266667,46.733333,21.133333,-5.466667,126.266667,30.600000,5.466667,7.066667,69.933333,2024-06-29
1532,2024-07-06,ZETA DIVISION,TALON,1,0.972,208.2,44.4,38.8,20.8,5.6,...,34.800000,40.266667,17.800000,-5.466667,123.266667,31.600000,5.133333,5.733333,70.533333,2024-07-06
1533,2024-07-07,ZETA DIVISION,Paper Rex,0,1.056,185.8,38.6,48.0,14.8,-9.4,...,38.800000,38.933333,18.666667,-0.133333,129.866667,31.933333,5.600000,5.733333,72.266667,2024-07-07
1534,2025-01-18,ZETA DIVISION,Nongshim RedForce,0,0.848,189.8,28.6,33.2,11.0,-4.6,...,36.800000,39.266667,16.266667,-2.466667,127.200000,31.600000,5.266667,5.733333,69.733333,2025-01-18


In [72]:
def make_predictions(data, predictors):
  train = data[data["date"] < '2025-01-01']
  test = data[data["date"] >= '2025-01-01']
  preds = rf.predict(test[predictors])
  combined = pd.DataFrame(dict(actual=test["result"], predicted=preds), index=test.index)
  precision = precision_score(test["result"], preds)
  return combined, precision

In [73]:
print(matches_rolling.columns)

Index(['date', 'Team', 'vs Team', 'result', 'R2.0', 'ACS', 'K', 'D', 'A',
       '+/- K/D', 'KAST', 'ADR', 'HS%', 'FK', 'FD', '+/- FK/FD',
       'final_score_winner', 'final_score_loser', 'K_rolling', 'D_rolling',
       'A_rolling', '+/- K/D_rolling', 'ADR_rolling', 'HS%_rolling',
       'FK_rolling', 'FD_rolling', 'KAST_rolling', 'date_rolling'],
      dtype='object')


In [74]:
print(predictors + new_cols)

['K', 'D', 'A', '+/- K/D', 'ADR', 'HS%', 'FK', 'FD', 'KAST', 'date', 'K_rolling', 'D_rolling', 'A_rolling', '+/- K/D_rolling', 'ADR_rolling', 'HS%_rolling', 'FK_rolling', 'FD_rolling', 'KAST_rolling', 'date_rolling']


In [75]:
# predictors_and_new_cols = [col for col in predictors + new_cols if col != 'date']
# combine, precision = make_predictions(matches_rolling, predictors_and_new_cols)