# Rating and Rank Predictor

- Rating is a number given to each user based on how they perform in recent contests. This is a regression problem. We used linear regression, XGboost and random forest.
- Rank is a class given to each user like "Grand Master", "Expert", "Pupil" etc. This is a classification problem. We used logistic regression, XGboost and random forest.

In [1]:
import pandas as pd, numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from xgboost import XGBRegressor,XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LogisticRegression

In [2]:
data = pd.read_csv('./data/final.csv')

In [3]:
data.describe()

Unnamed: 0,contribution,contest_count,rating,max_rating,friends_count,duration,problem_count,avg_difficulty,avg_difficulty20,median,...,difficulty_bucket3,difficulty_bucket4,difficulty_bucket5,difficulty_bucket6,difficulty_bucket7,difficulty_bucket8,difficulty_bucket9,difficulty_bucket10,difficulty_bucket11,difficulty_bucket12
count,6441.0,6441.0,6441.0,6441.0,6441.0,6441.0,6441.0,6441.0,6441.0,6441.0,...,6441.0,6441.0,6441.0,6441.0,6441.0,6441.0,6441.0,6441.0,6441.0,6441.0
mean,0.648968,20.576619,1440.101848,1556.414687,28.782487,1073.935569,127.123428,1206.016621,1629.946308,1181.183046,...,18.19143,15.219531,7.868964,4.616053,2.162552,1.292656,0.574755,0.15215,0.0,0.022667
std,5.745607,20.917052,232.426607,182.51725,70.549313,657.743632,154.928163,167.440872,282.403082,177.788077,...,24.797218,24.712344,16.140394,13.114247,7.917106,5.984573,2.908497,1.113085,0.0,0.259805
min,-43.0,1.0,603.0,1366.0,0.0,15.0,1.0,500.0,500.0,500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,5.0,1287.0,1413.0,4.0,547.0,25.0,1092.592593,1443.589744,1100.0,...,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,14.0,1414.0,1488.0,11.0,1004.0,71.0,1192.391304,1607.407407,1200.0,...,8.0,5.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,28.0,1581.0,1659.0,29.0,1547.0,171.0,1301.111111,1794.444444,1300.0,...,26.0,19.0,7.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0
max,151.0,163.0,2500.0,2578.0,1657.0,17853.0,1611.0,2013.186813,2716.666667,2000.0,...,233.0,300.0,232.0,201.0,162.0,132.0,81.0,26.0,0.0,8.0


In [4]:
data.head()

Unnamed: 0,country,handle,contribution,contest_count,organization,rating,max_rating,friends_count,rank,duration,...,difficulty_bucket3,difficulty_bucket4,difficulty_bucket5,difficulty_bucket6,difficulty_bucket7,difficulty_bucket8,difficulty_bucket9,difficulty_bucket10,difficulty_bucket11,difficulty_bucket12
0,India,Sumeet.Varma,1,46,DA-IICT,2500,2515,1657,grandmaster,1416,...,89,106,113,82,63,45,13,10,0,0
1,India,akshay_miterani,1,72,DA-IICT,2052,2052,390,candidate master,1291,...,96,79,34,15,14,4,3,1,0,0
2,India,kuldeeppatel,0,37,DA-IICT,2030,2030,208,candidate master,1577,...,37,26,18,6,4,1,0,0,0,0
3,India,tanmay273,0,90,DA-IICT,2006,2006,290,candidate master,1539,...,114,128,86,65,49,21,7,1,0,1
4,India,Hiren.Vaghela,0,32,DA-IICT,1997,1997,170,candidate master,543,...,92,117,89,70,34,24,11,2,0,0


In [5]:
class Predictor:
    def __init__(self,data):
        self.data=data
        
        # One hot encoding or organization data
        self.data = pd.concat([self.data, pd.get_dummies(self.data["organization"],prefix='organization')], axis = 1)
        self.data.drop(["organization"], axis = 1, inplace = True)
        
        self.data = shuffle(self.data) # Shuffling data
        
        # Selecting features
        data_columns = list(self.data.columns)
        data_columns.remove("rating") # We want to predict this!
        data_columns.remove("country") # Because we are working only with Indian users 
        data_columns.remove("handle") # Identifier
        data_columns.remove("max_rating") # We want to predict this!
        data_columns.remove("rank") # We want to predict this!
        
        train, test = train_test_split(self.data, test_size = 0.3)
        
        self.x_train = train[data_columns]
        self.x_test = test[data_columns]
        
        self.y_train_rating = train["max_rating"]
        self.y_test_rating = test["max_rating"]
        self.y_train_rank = train["rank"]
        self.y_test_rank = test["rank"]
    
    def predictRatings(self):
        
        print("Rating Predictions (Rating)")
        
        self._linearRegressorRating()
        self._XGBRating()
        self._RandomForestRegressorRating()
    
    def predictRanks(self):
        print("Rank Predictions (Classification)")
        
        self._logisticRegressorRank()
        self._XGBRank()
        self._RandomForestClassifierRank()
        
    
    def _linearRegressorRating(self):
        print("<-- Linear Regression Rating Predictor -->") 
        m = LinearRegression().fit(self.x_train, self.y_train_rating)
        predictions = m.predict(self.x_test).reshape(-1, 1)
        print("R2 Score:",m.score(self.x_test, self.y_test_rating))
        print("RMS Error:",mean_squared_error(predictions,self.y_test_rating)**0.5)
        print()
    
    def _logisticRegressorRank(self):
        print("<-- Logistic Regression Rank Predictor -->") 
        m = LogisticRegression(C=5.5,max_iter=1000).fit(self.x_test, self.y_test_rank)
        predictions = m.predict(self.x_test).reshape(-1, 1)
        print("Mean Accuracy:",m.score(self.x_test, self.y_test_rank))
        print()
    
    def _RandomForestClassifierRank(self):
        print("<-- Random Forest Rank Predictor -->") 
        m = RandomForestClassifier(n_estimators = 1200, min_samples_leaf = 6, n_jobs = -1, verbose = 0).fit(self.x_train, self.y_train_rank)
        predictions = m.predict(self.x_test).reshape(-1, 1)
        print("Mean Accuracy:",m.score(self.x_test, self.y_test_rank))
        print()
    
    def _RandomForestRegressorRating(self):
        print("<-- Random Forest Rating Predictor -->") 
        m = RandomForestRegressor(n_estimators = 1200, min_samples_leaf = 6, n_jobs = -1, verbose = 0).fit(self.x_train, self.y_train_rating)
        predictions = m.predict(self.x_test).reshape(-1, 1)
        print("R2 Score:",m.score(self.x_test, self.y_test_rating))
        print("RMS Error:",mean_squared_error(predictions,self.y_test_rating)**0.5)
        print()
    
    def _XGBRating(self):
        print("<-- XG Boost Rating Predictor -->") 
        m = XGBRegressor(n_estimators=200).fit(self.x_train, self.y_train_rating)
        predictions = m.predict(self.x_test).reshape(-1, 1)
        print("R2 Score:",r2_score(self.y_test_rating,predictions))
        print("RMS Error:",mean_squared_error(predictions,self.y_test_rating)**0.5)
        print()
    
    def _XGBRank(self):
        print("<-- XG Boost Rank Predictor -->") 
        m = XGBClassifier(n_estimators=200).fit(self.x_train, self.y_train_rank)
        predictions = m.predict(self.x_test).reshape(-1, 1)
        print("Mean Score:",accuracy_score(self.y_test_rank,predictions))
        print()
        
        

In [6]:
model = Predictor(data)

model.predictRatings()
model.predictRanks()


Rating Predictions (Rating)
<-- Linear Regression Rating Predictor -->
R2 Score: 0.7383121398361376
RMS Error: 92.45847099044312

<-- XG Boost Rating Predictor -->
R2 Score: 0.8272569205703069
RMS Error: 75.11994267008765

<-- Random Forest Rating Predictor -->
R2 Score: 0.8030317835635492
RMS Error: 80.21452115865286

Rank Predictions (Classification)
<-- Logistic Regression Rank Predictor -->
Mean Accuracy: 0.6047594412829799

<-- XG Boost Rank Predictor -->


  if diff:


Mean Score: 0.5757889291257113

<-- Random Forest Rank Predictor -->
Mean Accuracy: 0.55406104500776

