<h1 style='color:blue' align='center'>Data Science Regression Project: Predicting Tetra Rating(TR) in Tetra League </h1>

In [1]:
import requests
import json
import pandas as pd
import numpy as np
import math
import os
from datetime import datetime, timedelta
from sklearn.model_selection import ShuffleSplit, cross_val_score, GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
import warnings
warnings.filterwarnings("ignore")

<h2 style='color:blue'>Data Load: Load Tetra League Leaderboard dataframe</h2>

In [2]:
# Use tetr.io api to get tetra league leaderboard as of 4/10/2023
# https://tetr.io/about/api/ for more information about the api
response = requests.get("https://ch.tetr.io/api/users/lists/league/all")
data = response.json()
df = pd.DataFrame(data)
df1 = pd.json_normalize(df['data']).T
df2 = pd.json_normalize(df1[0],sep =',')
df2.head()

Unnamed: 0,_id,username,role,xp,supporter,verified,country,league.gamesplayed,league.gameswon,league.rating,league.glicko,league.rd,league.rank,league.bestrank,league.apm,league.pps,league.vs,league.decaying
0,615fb20fe17beeef45104302,blaarg,user,5250399.0,False,True,US,1699,1467,24999.224889,4236.64732,86.675643,x,x,183.28,3.64,375.26,False
1,5e844b0868270e617d52c990,czsmall0402,user,15560898.5,True,True,,420,395,24999.016611,4190.174026,98.23822,x,x,177.49,3.06,358.75,False
2,5f5dbcc4454e942b4fdfc5fa,vincehd,user,12532852.0,True,True,PH,1166,791,24998.585144,4068.907981,72.285643,x,x,167.85,3.58,350.71,False
3,5e88d0ead351fa71316ba29e,promooooooo,user,9432395.0,True,True,AQ,3602,2249,24998.415815,4036.927128,69.403356,x,x,190.62,4.02,384.17,False
4,5e47696db7c60f23a497ee6c,caboozled_pie,user,17162375.5,True,True,US,6467,4012,24998.115121,3987.803568,64.644785,x,x,180.78,3.56,358.68,False


**Create a folder for the different versions of the leaderboard used**

In [3]:
dataset_version=max([int(i) for i in os.listdir("dataset") + [0]])+1
df2.to_csv(f"dataset/{dataset_version}.csv",index=False)

**Drop features that are not necessary to build our model**

In [4]:
df3 = df2.drop([
        '_id','username','role','country',
        'league.rank','league.decaying','verified','league.rd'
], axis='columns') 
df3.head()

Unnamed: 0,xp,supporter,league.gamesplayed,league.gameswon,league.rating,league.glicko,league.bestrank,league.apm,league.pps,league.vs
0,5250399.0,False,1699,1467,24999.224889,4236.64732,x,183.28,3.64,375.26
1,15560898.5,True,420,395,24999.016611,4190.174026,x,177.49,3.06,358.75
2,12532852.0,True,1166,791,24998.585144,4068.907981,x,167.85,3.58,350.71
3,9432395.0,True,3602,2249,24998.415815,4036.927128,x,190.62,4.02,384.17
4,17162375.5,True,6467,4012,24998.115121,3987.803568,x,180.78,3.56,358.68


<h2 style='color:blue'>Feature Engineering</h2>

**Add new feature for TL win %**

In [5]:
df3['league.winper']  = df3['league.gameswon']/df3['league.gamesplayed']
df3.head()

Unnamed: 0,xp,supporter,league.gamesplayed,league.gameswon,league.rating,league.glicko,league.bestrank,league.apm,league.pps,league.vs,league.winper
0,5250399.0,False,1699,1467,24999.224889,4236.64732,x,183.28,3.64,375.26,0.863449
1,15560898.5,True,420,395,24999.016611,4190.174026,x,177.49,3.06,358.75,0.940476
2,12532852.0,True,1166,791,24998.585144,4068.907981,x,167.85,3.58,350.71,0.678388
3,9432395.0,True,3602,2249,24998.415815,4036.927128,x,190.62,4.02,384.17,0.624375
4,17162375.5,True,6467,4012,24998.115121,3987.803568,x,180.78,3.56,358.68,0.62038


**Replace XP for Levels**

In [6]:
#Formula for levels found here https://tetrio.team2xh.net/?t=faq#xp

def calculate_level(xp):
    level = (xp/500)**0.6 + xp / (5000 + max(0, xp - 4*(10**6)) / 5000) + 1
    return math.floor(level)  
df3['level'] = df3['xp'].apply(calculate_level)
df3.drop('xp',axis='columns',inplace=True)
df3.head()

Unnamed: 0,supporter,league.gamesplayed,league.gameswon,league.rating,league.glicko,league.bestrank,league.apm,league.pps,league.vs,league.winper,level
0,False,1699,1467,24999.224889,4236.64732,x,183.28,3.64,375.26,0.863449,1259
1,True,420,395,24999.016611,4190.174026,x,177.49,3.06,358.75,0.940476,2625
2,True,1166,791,24998.585144,4068.907981,x,167.85,3.58,350.71,0.678388,2305
3,True,3602,2249,24998.415815,4036.927128,x,190.62,4.02,384.17,0.624375,1918
4,True,6467,4012,24998.115121,3987.803568,x,180.78,3.56,358.68,0.62038,2776


**Drop the the bottom 5% of people in games played. This is done to avoid players who have not played many games and may not be ranked as accurately**

In [7]:
df3['league.gamesplayed'].quantile(0.05,interpolation='linear')

25.0

In [8]:
df4 = df3[~(df3['league.gamesplayed']<=25)]
df4.shape

(44071, 11)

**Convert Support column to binary**

In [9]:
#Convert True/False columns to binary columns so model can interpret them
df4['supporter'] = df4['supporter'].astype(int)
df4.head()

Unnamed: 0,supporter,league.gamesplayed,league.gameswon,league.rating,league.glicko,league.bestrank,league.apm,league.pps,league.vs,league.winper,level
0,0,1699,1467,24999.224889,4236.64732,x,183.28,3.64,375.26,0.863449,1259
1,1,420,395,24999.016611,4190.174026,x,177.49,3.06,358.75,0.940476,2625
2,1,1166,791,24998.585144,4068.907981,x,167.85,3.58,350.71,0.678388,2305
3,1,3602,2249,24998.415815,4036.927128,x,190.62,4.02,384.17,0.624375,1918
4,1,6467,4012,24998.115121,3987.803568,x,180.78,3.56,358.68,0.62038,2776


<h2 style='color:blue'>Use One Hot Encoding For a Player's Best Rank</h2>

In [10]:
dummies = pd.get_dummies(df4['league.bestrank'])
df5 = pd.concat([df4.drop('league.bestrank',axis='columns'),dummies.drop('x',axis='columns')],axis='columns')
df5.columns

Index(['supporter', 'league.gamesplayed', 'league.gameswon', 'league.rating',
       'league.glicko', 'league.apm', 'league.pps', 'league.vs',
       'league.winper', 'level', 'a', 'a+', 'a-', 'b', 'b+', 'b-', 'c', 'c+',
       'c-', 'd', 'd+', 's', 's+', 's-', 'ss', 'u'],
      dtype='object')

<h2 style='color:blue'>Building the Prediction Model</h2>

**Creating the target variable (Y) and the input features (X)**

In [11]:
X = df5.drop(['league.rating','league.glicko'],axis='columns')
y = df5['league.glicko']
X.head()

Unnamed: 0,supporter,league.gamesplayed,league.gameswon,league.apm,league.pps,league.vs,league.winper,level,a,a+,...,c,c+,c-,d,d+,s,s+,s-,ss,u
0,0,1699,1467,183.28,3.64,375.26,0.863449,1259,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,420,395,177.49,3.06,358.75,0.940476,2625,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1166,791,167.85,3.58,350.71,0.678388,2305,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,3602,2249,190.62,4.02,384.17,0.624375,1918,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,6467,4012,180.78,3.56,358.68,0.62038,2776,0,0,...,0,0,0,0,0,0,0,0,0,0


**Split the variables for 80% of the data be used for training and 20% be used for testing**

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [13]:
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)

0.9792015036610403

**Linear Regression gave a score over 97.5%, but I'd like to see if we can get any other ML model or other parameters that lead to higher accuracy. We will use GridSearchCV and K-fold cross validation to to do so.**

In [14]:
model_params =  {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                'normalize': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : ['mse','friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
def find_best_model_using_gridsearchcv(X,y):
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in model_params.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_,
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best_model_using_gridsearchcv(X,y)

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.978731,{'normalize': True}
1,lasso,0.971265,"{'alpha': 1, 'selection': 'cyclic'}"
2,decision_tree,0.967394,"{'criterion': 'mse', 'splitter': 'best'}"


**All 3 criteria have around the same score, so I will test with outliers to see which of the three handle them better**

In [15]:
lr_clf = LinearRegression(normalize=True)
lr_clf.fit(X,y)

LinearRegression(normalize=True)

In [16]:
lasso_clf = Lasso(alpha=1,selection='random')
lasso_clf.fit(X,y)

Lasso(alpha=1, selection='random')

In [17]:
dt_clf = DecisionTreeRegressor(criterion='mse',splitter='best')
dt_clf.fit(X,y)

DecisionTreeRegressor(criterion='mse')

<h2 style='color:blue'>Test the models with Outliers</h2>

In [18]:
def predict_TR(user):
    response = requests.get(f"https://ch.tetr.io/api/users/{user}")
    data_predict = response.json()
    df_predict1 = pd.DataFrame(data_predict)
    df_predict2 = pd.json_normalize(df_predict1['data'])
    
    x = pd.DataFrame(data=np.zeros((1, len(X_train.columns))), columns=X_train.columns)
    
    if (df_predict2['supporter_tier'].iloc[0]>0):
        x['supporter'] = 1
    else:
        x['supporter'] = 0
    
    x['league.gamesplayed'] = df_predict2['league.gamesplayed'].iloc[0]
    x['league.gameswon'] = df_predict2['league.gameswon'].iloc[0]
    x['league.apm'] = df_predict2['league.apm'].iloc[0]
    x['league.pps'] = df_predict2['league.pps'].iloc[0]
    x['league.vs'] = df_predict2['league.vs'].iloc[0]
    x['league.winper'] = df_predict2['league.gameswon'].iloc[0]/df_predict2['league.gamesplayed'].iloc[0]
    x['level'] = calculate_level(df_predict2['xp'].iloc[0])
    
    rank = df_predict2['league.bestrank'].iloc[0]
    if ((rank != 'x')):
        x[rank]=1
    rd = get_RD(df_predict2['_id'][0],df_predict2['league.rd'][0])
    lr_predicted = calculate_TR(lr_clf.predict(x)[0],rd)
    lasso_predicted = calculate_TR(lasso_clf.predict(x)[0],rd)
    dt_predicted = calculate_TR(dt_clf.predict(x)[0],rd)
    actual = round(df_predict2['league.rating'].iloc[0],2)
    compare = calculate_TR(df_predict2['league.glicko'][0],rd)

    print(f"{'Actual TR:':<22} {actual} \n{'Estimated TR:':<22} {compare}\n{'Linear Regression TR:':<22} {lr_predicted}\n{'Lasso TR:':<22} {lasso_predicted}\n{'Decision Tree TR:':<22} {dt_predicted}")

def calculate_TR(glicko,rd):
    TR = round(25000/(1+np.power(10, (1500-glicko)*np.pi/(np.sqrt(3*np.log(10)*np.log(10)*rd*rd+2500*(64*np.pi*np.pi+147*np.log(10)*np.log(10)))))), 2)
    return TR

def get_RD(user_id, rd):
    response = requests.get(f"https://ch.tetr.io/api/streams/league_userrecent_{user_id}")
    user_data = response.json()
    df_user = pd.DataFrame(user_data)
    df_user1 = pd.json_normalize(df_user['data']).T
    df_user2 = pd.json_normalize(df_user1[0],sep =',')
    try: 
        date = datetime.strptime(df_user2['ts'][0], '%Y-%m-%dT%H:%M:%S.%fZ')
    except:
        return (df2['league.rd'].median())

    now = datetime.utcnow()
    week_ago = now - timedelta(weeks=1)

    if date < week_ago:
        day_difference = now - date
        rd_change = day_difference.days
        return rd-rd_change
    else:
        return rd


**Using players who are unrank and not part of the dataset**

In [19]:
predict_TR('icly')

Actual TR:             24999.33 
Estimated TR:          24999.31
Linear Regression TR:  24994.19
Lasso TR:              24997.33
Decision Tree TR:      24998.79


In [20]:
predict_TR('quickandsmart')

Actual TR:             24948.11 
Estimated TR:          24948.94
Linear Regression TR:  24847.18
Lasso TR:              24903.57
Decision Tree TR:      24931.38


In [21]:
predict_TR('kiken')

Actual TR:             267.08 
Estimated TR:          256.73
Linear Regression TR:  795.47
Lasso TR:              896.89
Decision Tree TR:      894.74


In [22]:
predict_TR('atombolders')

Actual TR:             408.87 
Estimated TR:          392.15
Linear Regression TR:  227.56
Lasso TR:              1192.32
Decision Tree TR:      153.94


**While all models seem to be around the same in accuracy, Decision Tree seems to handle outliers slightly better so that model will be used**

<h2 style='color:blue'>Export the model to a pickle file</h2>

In [23]:
import pickle
with open('tr_prediction_model.pickle','wb') as f:
    pickle.dump(dt_clf,f)

<h2 style='color:blue'>Export location and column information to a file that will be useful later on in prediction application</h2>


In [24]:
import json
columns = {
    'data_columns' : [col.lower() for col in X.columns]
}
with open("columns.json","w") as f:
    f.write(json.dumps(columns))