Predict trophies based on player data such as cards, achievements, badges, total games, total donations, etc.

In [1]:
import pandas as pd
from google.colab import files

# Open Colab's upload dialog; the value it returns is kept in `uploaded`.
uploaded = files.upload()

# Read the player table from disk by its fixed file name. The read does not
# consult `uploaded`, so the uploaded file must carry exactly this name.
player_data_df = pd.read_csv(filepath_or_buffer='combined_player_tags_master.csv')

Saving combined_player_tags_master.csv to combined_player_tags_master.csv


In [13]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# Build the model inputs.
#   X         -> every column of player_data_df except the target
#                (cards owned, card levels, achievements, badges, total games,
#                total donations, etc.)
#   y         -> the target (number of trophies) as an (n_samples, 1) column
#   X_encoded -> X after pd.get_dummies
#   y_ravel   -> the target flattened to 1-D, the form passed to the models below
#
# drop() returns a new frame, so player_data_df itself is left untouched and
# this cell can be re-run safely.
X = player_data_df.drop(columns="/data/trophies")
y = player_data_df["/data/trophies"].values.reshape(-1, 1)

# NOTE(review): if the CSV carries an identifier-like text column (e.g. a
# player tag), get_dummies will create one column per distinct value — confirm
# the column list before training.
X_encoded = pd.get_dummies(X)
y_ravel = y.ravel()

In [4]:
# Baseline: a random forest with library defaults (only the seed is set).

# Split Data: hold out 30% of the rows for testing; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_ravel, test_size=0.3, random_state=42)

# Model Selection
model = RandomForestRegressor(random_state=42)

# Training
model.fit(X_train, y_train)

# Evaluation: mean squared error on the held-out rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 9850.06293191338


In [6]:
# Compare two forests on the same 70/30 split (seeded, so it is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y_ravel,
    test_size=0.3,
    random_state=42,
)

# Model 1: only the seed is specified.
model_1 = RandomForestRegressor(random_state=42)
# Model 2: same seed, 150 trees.
model_2 = RandomForestRegressor(n_estimators=150, random_state=42)

# Fit, predict and score Model 1 on the held-out rows.
model_1.fit(X_train, y_train)
y_pred_1 = model_1.predict(X_test)
mse_1 = mean_squared_error(y_test, y_pred_1)

# Fit, predict and score Model 2 on the held-out rows.
model_2.fit(X_train, y_train)
y_pred_2 = model_2.predict(X_test)
mse_2 = mean_squared_error(y_test, y_pred_2)

# Report both errors together.
print("Mean Squared Error (Model 1):", mse_1)
print("Mean Squared Error (Model 2):", mse_2)

Mean Squared Error (Model 1): 9850.06293191338
Mean Squared Error (Model 2): 9910.422386021377


In [8]:
# Third candidate: a smaller forest (50 trees) on the same seeded 70/30 split.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y_ravel,
    test_size=0.3,
    random_state=42,
)

# Build and train in one step (fit returns the fitted estimator).
model_3 = RandomForestRegressor(n_estimators=50, random_state=42).fit(X_train, y_train)

# Score on the held-out rows.
y_pred_3 = model_3.predict(X_test)
mse_3 = mean_squared_error(y_test, y_pred_3)
print("Mean Squared Error (Model 3):", mse_3)

Mean Squared Error (Model 3): 10152.493593042243


In [14]:
# R-squared of each model's held-out predictions against y_test.
# Relies on y_test and y_pred_1..3 from the training cells above.
r2_1, r2_2, r2_3 = (
    r2_score(y_test, predictions)
    for predictions in (y_pred_1, y_pred_2, y_pred_3)
)

print("R-squared (Model 1):", r2_1)
print("R-squared (Model 2):", r2_2)
print("R-squared (Model 3):", r2_3)

R-squared (Model 1): 0.9978581520252665
R-squared (Model 2): 0.997845027157392
R-squared (Model 3): 0.9977923899582103


The following cells took too long to complete. Next steps would be to:
* Increase processing power and time.
* Perform cross-validation to estimate the model's performance and predictive power.
* Examine feature importance scores to identify the most influential features and underlying relationships in the data.
* Run the model using RandomizedSearchCV with parameters.
* Run the model using GridSearchCV and even more parameters.

In [None]:
# 5-fold cross-validation of each forest on the full encoded dataset,
# scored with R-squared. All three runs finish before anything is printed.
cv_scores_1, cv_scores_2, cv_scores_3 = (
    cross_val_score(estimator, X_encoded, y_ravel, cv=5, scoring='r2')
    for estimator in (model_1, model_2, model_3)
)

# Per-fold scores, then their mean and spread, one model at a time.
print("Cross-validation R-squared scores (Model 1):", cv_scores_1)
print("Mean R-squared (Model 1):", cv_scores_1.mean())
print("Standard deviation of R-squared (Model 1):", cv_scores_1.std())

print("Cross-validation R-squared scores (Model 2):", cv_scores_2)
print("Mean R-squared (Model 2):", cv_scores_2.mean())
print("Standard deviation of R-squared (Model 2):", cv_scores_2.std())

print("Cross-validation R-squared scores (Model 3):", cv_scores_3)
print("Mean R-squared (Model 3):", cv_scores_3.mean())
print("Standard deviation of R-squared (Model 3):", cv_scores_3.std())

In [None]:
# Bar chart of Model 1's impurity-based feature importances, one bar per
# encoded feature column.
feature_names = X_encoded.columns
importances_1 = model_1.feature_importances_
forest_importances_1 = pd.Series(importances_1, index=feature_names)

# No error bars are drawn: yerr is passed as None.
std = None

fig, ax = plt.subplots()
forest_importances_1.plot.bar(yerr=std, ax=ax)
ax.set(
    title="Feature importances using MDI (Model 1)",
    ylabel="Mean decrease in impurity",
)
fig.tight_layout()

In [None]:
# Use RandomizedSearchCV with parameters
# Hyperparameter tuning by random sampling: fit a baseline forest for
# reference, then search 10 random candidates with 5-fold cross-validation.

# Split Data: seeded 70/30 hold-out.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_ravel, test_size=0.3, random_state=42)

# Model Selection
model = RandomForestRegressor(random_state=42)

# Training (baseline, untuned)
model.fit(X_train, y_train)

# Evaluation (baseline)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Candidate values to sample from: 2 x 2 x 2 x 2 = 16 possible combinations.
param_distributions = {
    'n_estimators': [100, 300],
    'max_depth': [None, 20],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 4]
}

# Randomized Search
n_iter_search = 10  # You can adjust this value
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_distributions,
    n_iter=n_iter_search,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    # Seed the candidate sampling too; without it a different subset of the
    # combinations is tried on every run and the reported best parameters
    # are not repeatable.
    random_state=42,
)
random_search.fit(X_train, y_train)

# Get the best model
best_model = random_search.best_estimator_

# Evaluate the best model on the held-out rows.
y_pred_best = best_model.predict(X_test)
mse_best = mean_squared_error(y_test, y_pred_best)
print("Best Mean Squared Error:", mse_best)
print("Best Parameters:", random_search.best_params_)


In [None]:
# Use GridSearchCV with additional parameters
# Exhaustive hyperparameter tuning: fit a baseline forest for reference, then
# evaluate every combination in param_grid with 5-fold cross-validation.

# Split Data: seeded 70/30 hold-out.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_ravel, test_size=0.3, random_state=42)

# Model Selection
model = RandomForestRegressor(random_state=42)

# Training (baseline, untuned)
model.fit(X_train, y_train)

# Evaluation (baseline)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Define a grid of hyperparameters to search:
# 3 x 3 x 3 x 3 = 81 combinations, i.e. 405 fits with cv=5.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model. With the default refit=True, GridSearchCV has already
# retrained this estimator on all of X_train, so it is not fitted again here.
best_model = grid_search.best_estimator_

# Evaluate the best model on the held-out rows.
y_pred_best = best_model.predict(X_test)
mse_best = mean_squared_error(y_test, y_pred_best)
print("Best Mean Squared Error:", mse_best)
print("Best Parameters:", grid_search.best_params_)