In [7]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from xgboost import plot_importance
from catboost import Pool, CatBoostClassifier

In [24]:
train = pd.read_csv('dataset/development_dataset.csv')
test = pd.read_csv('dataset/leaderboard_dataset.csv')

In [9]:
train['VAR14'] = train['VAR14'].replace('.',1)
train['VAR21'] = train['VAR21'].replace({'Low','Medium','High'},(0,1,2))

In [10]:
train['VARS1'] =((train['VAR7'] - train['VAR16']).abs())
test['VARS1'] = ((test['VAR7'] - test['VAR16']).abs())

In [11]:
columns = ['VAR15','VAR18','VAR7','VAR1']
train = train.drop(columns, axis=1)
test = test.drop(columns,axis=1)

In [12]:
train['VAR14']=train['VAR14'].astype(str).astype(int)
train['VAR21']=train['VAR21'].astype(str).astype(int)

In [13]:
X = train.drop(['VAR21'],axis=1)
Y = train['VAR21']

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=25)

In [15]:
from sklearn.base import clone


class OrdinalClassifier():
    
    def __init__(self, clf):
        self.clf = clf
        self.clfs = {}
    
    def fit(self, X, y):
        self.unique_class = np.sort(np.unique(y))
        if self.unique_class.shape[0] > 2:
            for i in range(self.unique_class.shape[0]-1):
                # for each k - 1 ordinal value we fit a binary classification problem
                binary_y = (y > self.unique_class[i]).astype(np.uint8)
                clf = clone(self.clf)
                clf.fit(X, binary_y)
                self.clfs[i] = clf
    
    def predict_proba(self, X):
        clfs_predict = {k:self.clfs[k].predict_proba(X) for k in self.clfs}
        predicted = []
        for i,y in enumerate(self.unique_class):
            if i == 0:
                # V1 = 1 - Pr(y > V1)
                predicted.append(1 - clfs_predict[y][:,1])
            elif y in clfs_predict:
                # Vi = Pr(y > Vi-1) - Pr(y > Vi)
                 predicted.append(clfs_predict[y-1][:,1] - clfs_predict[y][:,1])
            else:
                # Vk = Pr(y > Vk-1)
                predicted.append(clfs_predict[y-1][:,1])
        return np.vstack(predicted).T
    
    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)


In [16]:
model = OrdinalClassifier(XGBClassifier(max_depth=9,learning_rate=0.1, gamma=5, min_child_weight=7))

In [17]:
model.fit(X,Y)

In [18]:
preds = model.predict(X_test)

In [19]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_test, preds, labels=[0, 1, 2])

array([[3267,  325,    8],
       [1152, 1257,    7],
       [ 467,  183,  134]], dtype=int64)

In [20]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, preds)

0.685

In [21]:
test['VAR14'] = test['VAR14'].replace('.',1)
test['VAR14']=test['VAR14'].astype(str).astype(int)

In [22]:
prediction = model.predict(test)

In [25]:
output = pd.DataFrame()
output['VAR1'] = test['VAR1']
output['VAR21']=prediction

In [26]:
output['VAR21'] = output['VAR21'].replace((0,1,2),{'Low','Medium','High'})

In [27]:
output.to_csv(r'Neil_hanna_IITKGP_1.csv', header = None, index=False)