<h1 style='color:blue' align='center'>Data Science Regression Project: Predicting Tetra Rating(TR) in Tetra League </h1>

In [1]:
import requests
import json
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import ShuffleSplit, cross_val_score, GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
import warnings
warnings.filterwarnings("ignore")

<h2 style='color:blue'>Data Load: Load Tetra League Leaderboard dataframe</h2>

In [2]:
# Use tetr.io api to get tetra league leaderboard as of 3/18/2023
# https://tetr.io/about/api/ for more information about the api
response = requests.get("https://ch.tetr.io/api/users/lists/league/all")
data = response.json()
df = pd.DataFrame(data)
df1 = pd.json_normalize(df['data']).T
df2 = pd.json_normalize(df1[0],sep =',')
df2.head()

Unnamed: 0,_id,username,role,xp,supporter,verified,country,league.gamesplayed,league.gameswon,league.rating,league.glicko,league.rd,league.rank,league.bestrank,league.apm,league.pps,league.vs,league.decaying
0,615fb20fe17beeef45104302,blaarg,user,5236410.0,True,True,US,1692,1460,24999.120525,4203.071256,82.396491,x,x,187.0,3.69,384.42,False
1,5e844b0868270e617d52c990,czsmall0402,user,15397682.5,True,True,,416,391,24999.008863,4186.77139,96.946823,x,x,167.03,2.95,343.59,False
2,5f5dbcc4454e942b4fdfc5fa,vincehd,user,12230792.0,True,True,PH,1153,784,24998.923556,4137.975153,68.946382,x,x,193.84,3.83,399.41,False
3,5f55e925ead0072b56cd42e2,ix1iv,user,1274742.5,True,False,JP,298,259,24998.088133,3995.38386,79.06562,x,x,180.76,3.32,353.58,False
4,61990d780bf7c9b332b2bcb5,togaiats,user,2244814.5,False,False,BR,550,403,24997.863237,3960.417953,89.045073,x,x,169.96,3.89,352.08,True


**Drop features that are not necessary to build our model**

In [3]:
df3 = df2.drop([
        '_id','username','role','country',
        'league.glicko','league.rank','league.decaying','verified','league.rd'
], axis='columns') 
df3.head()

Unnamed: 0,xp,supporter,league.gamesplayed,league.gameswon,league.rating,league.bestrank,league.apm,league.pps,league.vs
0,5236410.0,True,1692,1460,24999.120525,x,187.0,3.69,384.42
1,15397682.5,True,416,391,24999.008863,x,167.03,2.95,343.59
2,12230792.0,True,1153,784,24998.923556,x,193.84,3.83,399.41
3,1274742.5,True,298,259,24998.088133,x,180.76,3.32,353.58
4,2244814.5,False,550,403,24997.863237,x,169.96,3.89,352.08


<h2 style='color:blue'>Feature Engineering</h2>

**Add new feature for TL win %**

In [4]:
df3['league.winper']  = df3['league.gameswon']/df3['league.gamesplayed']
df3.head()

Unnamed: 0,xp,supporter,league.gamesplayed,league.gameswon,league.rating,league.bestrank,league.apm,league.pps,league.vs,league.winper
0,5236410.0,True,1692,1460,24999.120525,x,187.0,3.69,384.42,0.862884
1,15397682.5,True,416,391,24999.008863,x,167.03,2.95,343.59,0.939904
2,12230792.0,True,1153,784,24998.923556,x,193.84,3.83,399.41,0.679965
3,1274742.5,True,298,259,24998.088133,x,180.76,3.32,353.58,0.869128
4,2244814.5,False,550,403,24997.863237,x,169.96,3.89,352.08,0.732727


**Replace XP for Levels**

In [5]:
#Formula for levels found here https://tetrio.team2xh.net/?t=faq#xp

def calculate_level(xp):
    level = (xp/500)**0.6 + xp / (5000 + max(0, xp - 4*(10**6)) / 5000) + 1
    return math.floor(level)  
df3['level'] = df3['xp'].apply(calculate_level)
df3.drop('xp',axis='columns',inplace=True)
df3.head()

Unnamed: 0,supporter,league.gamesplayed,league.gameswon,league.rating,league.bestrank,league.apm,league.pps,league.vs,league.winper,level
0,True,1692,1460,24999.120525,x,187.0,3.69,384.42,0.862884,1257
1,True,416,391,24999.008863,x,167.03,2.95,343.59,0.939904,2609
2,True,1153,784,24998.923556,x,193.84,3.83,399.41,0.679965,2270
3,True,298,259,24998.088133,x,180.76,3.32,353.58,0.869128,366
4,False,550,403,24997.863237,x,169.96,3.89,352.08,0.732727,605


**Drop the the bottom 5% of people in games played. This is done to avoid players who have not played many games and may not be ranked as accurately**

In [6]:
df3['league.gamesplayed'].quantile(0.05,interpolation='linear')

25.0

In [7]:
df4 = df3[~(df3['league.gamesplayed']<=25)]
df4.shape

(40920, 10)

**Convert Support column to binary**

In [8]:
#Convert True/False columns to binary columns so model can interpret them
df4['supporter'] = df4['supporter'].astype(int)
df4.head()

Unnamed: 0,supporter,league.gamesplayed,league.gameswon,league.rating,league.bestrank,league.apm,league.pps,league.vs,league.winper,level
0,1,1692,1460,24999.120525,x,187.0,3.69,384.42,0.862884,1257
1,1,416,391,24999.008863,x,167.03,2.95,343.59,0.939904,2609
2,1,1153,784,24998.923556,x,193.84,3.83,399.41,0.679965,2270
3,1,298,259,24998.088133,x,180.76,3.32,353.58,0.869128,366
4,0,550,403,24997.863237,x,169.96,3.89,352.08,0.732727,605


<h2 style='color:blue'>Use One Hot Encoding For a Player's Best Rank</h2>

In [9]:
dummies = pd.get_dummies(df4['league.bestrank'])
df5 = pd.concat([df4.drop('league.bestrank',axis='columns'),dummies.drop('x',axis='columns')],axis='columns')
df5.columns

Index(['supporter', 'league.gamesplayed', 'league.gameswon', 'league.rating',
       'league.apm', 'league.pps', 'league.vs', 'league.winper', 'level', 'a',
       'a+', 'a-', 'b', 'b+', 'b-', 'c', 'c+', 'c-', 'd', 'd+', 's', 's+',
       's-', 'ss', 'u'],
      dtype='object')

<h2 style='color:blue'>Building the Prediction Model</h2>

**Creating the target variable (Y) and the input features (X)**

In [10]:
X = df5.drop(['league.rating'],axis='columns')
y = df5['league.rating']
X.head()

Unnamed: 0,supporter,league.gamesplayed,league.gameswon,league.apm,league.pps,league.vs,league.winper,level,a,a+,...,c,c+,c-,d,d+,s,s+,s-,ss,u
0,1,1692,1460,187.0,3.69,384.42,0.862884,1257,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,416,391,167.03,2.95,343.59,0.939904,2609,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1153,784,193.84,3.83,399.41,0.679965,2270,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,298,259,180.76,3.32,353.58,0.869128,366,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,550,403,169.96,3.89,352.08,0.732727,605,0,0,...,0,0,0,0,0,0,0,0,0,0


**Split the variables for 80% of the data be used for training and 20% be used for testing**

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [12]:
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)

0.9762170133274732

**Linear Regression gave a score over 97.5%, but I'd like to see if we can get any other ML model or other parameters that lead to higher accuracy. We will use GridSearchCV and K-fold cross validation to to do so.**

In [13]:
model_params =  {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                'normalize': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : ['mse','friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
def find_best_model_using_gridsearchcv(X,y):
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in model_params.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_,
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best_model_using_gridsearchcv(X,y)

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.975186,{'normalize': True}
1,lasso,0.974985,"{'alpha': 1, 'selection': 'random'}"
2,decision_tree,0.968495,"{'criterion': 'mse', 'splitter': 'best'}"


**All 3 criteria have around the same score, so I will test with outliers to see which of the three handle them better**

In [14]:
lr_clf = LinearRegression(normalize=True)
lr_clf.fit(X,y)

LinearRegression(normalize=True)

In [15]:
lasso_clf = Lasso(alpha=1,selection='random')
lasso_clf.fit(X,y)

Lasso(alpha=1, selection='random')

In [16]:
dt_clf = DecisionTreeRegressor(criterion='mse',splitter='best')
dt_clf.fit(X,y)

DecisionTreeRegressor(criterion='mse')

<h2 style='color:blue'>Test the models with Outliers</h2>

In [17]:
def predict_TR(user):
    response = requests.get(f"https://ch.tetr.io/api/users/{user}")
    data_predict = response.json()
    df_predict1 = pd.DataFrame(data_predict)
    df_predict2 = pd.json_normalize(df_predict1['data'])
    
    x = pd.DataFrame(data=np.zeros((1, len(X_train.columns))), columns=X_train.columns)
    #print(x.columns)
    
    if (df_predict2['supporter_tier'].iloc[0]>0):
        x['supporter'] = 1
    else:
        x['supporter'] = 0
    
    x['league.gamesplayed'] = df_predict2['league.gamesplayed'].iloc[0]
    x['league.gameswon'] = df_predict2['league.gameswon'].iloc[0]
    x['league.apm'] = df_predict2['league.apm'].iloc[0]
    x['league.pps'] = df_predict2['league.pps'].iloc[0]
    x['league.vs'] = df_predict2['league.vs'].iloc[0]
    x['league.winper'] = df_predict2['league.gameswon'].iloc[0]/df_predict2['league.gamesplayed'].iloc[0]
    x['level'] = calculate_level(df_predict2['xp'].iloc[0])
    
    rank = df_predict2['league.bestrank'].iloc[0]
    if ((rank != 'x')):
        x[rank]=1
        
    lr_predicted = round(lr_clf.predict(x)[0],2)
    lasso_predicted = round(lasso_clf.predict(x)[0],2)
    dt_predicted = round(dt_clf.predict(x)[0],2)
    actual = round(df_predict2['league.rating'].iloc[0],2)
    # Assuming actual, lr_predicted, lasso_predicted, and dt_predicted are variables containing the values you want to print

    print(f"{'Actual TR:':<22} {actual}\n{'Linear Regression TR:':<22} {lr_predicted}\n{'Lasso TR:':<22} {lasso_predicted}\n{'Decision Tree TR:':<22} {dt_predicted}")


**Using players who are unrank and not part of the dataset**

In [18]:
predict_TR('icly')

Actual TR:             24999.33
Linear Regression TR:  34657.67
Lasso TR:              35409.42
Decision Tree TR:      24999.12


In [19]:
predict_TR('quickandsmart')

Actual TR:             24948.11
Linear Regression TR:  26133.71
Lasso TR:              26074.58
Decision Tree TR:      24918.61


In [20]:
predict_TR('kiken')

Actual TR:             267.08
Linear Regression TR:  -210.17
Lasso TR:              -178.76
Decision Tree TR:      518.96


In [21]:
predict_TR('atombolders')

Actual TR:             408.87
Linear Regression TR:  -859.76
Lasso TR:              -440.43
Decision Tree TR:      411.08


**Outlier predictions for Linear Regression and Lasso go outside the TR range (0-25,000), so Decision Tree TR will be used**

<h2 style='color:blue'>Export the model to a pickle file</h2>

In [22]:
import pickle
with open('tr_prediction_model.pickle','wb') as f:
    pickle.dump(dt_clf,f)

<h2 style='color:blue'>Export location and column information to a file that will be useful later on in prediction application</h2>


In [23]:
import json
columns = {
    'data_columns' : [col.lower() for col in X.columns]
}
with open("columns.json","w") as f:
    f.write(json.dumps(columns))