In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold, LeavePOut
from scipy.stats import spearmanr
from collections import Counter
from sklearn.model_selection import train_test_split

In [2]:
# Load the precomputed feature table.
# `DataFrame.from_csv` was deprecated in pandas 0.21 and removed in 1.0;
# `read_csv` with `index_col=0, parse_dates=True` is the documented drop-in
# replacement (first CSV column becomes the index, dates in it are parsed).
df = pd.read_csv("all_features.csv", index_col=0, parse_dates=True)

In [3]:
# Booster settings for the pairwise-ranking objective (passed to xgb.train in `cv`).
param = dict(
    eta=0.1,
    max_depth=3,
    min_child_weight=1,
    gamma=0.1,
    silent=0,
    subsample=0.9,
    colsample_bytree=0.8,
    objective='rank:pairwise',
)

In [4]:
# Booster settings for the binary-logistic objective (passed to xgb.train in `fit`).
# Identical to the ranking settings except for the objective.
param_bin = dict(
    eta=0.1,
    max_depth=3,
    min_child_weight=1,
    gamma=0.1,
    silent=0,
    subsample=0.9,
    colsample_bytree=0.8,
    objective='binary:logistic',
)

In [5]:
def cv(df, folds=20):
    """Cross-validate the ranking booster and return the mean Spearman score.

    The rows of `df` are partitioned with `KFold(folds)`. For every partition
    a booster is trained for 150 rounds with the module-level `param`
    settings on the training rows, predictions are made for the held-out
    rows, and the Spearman rank correlation between predictions and held-out
    labels is recorded and printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "label", "dialogId" and "user". Every other
        column is used as a model feature.
    folds : int, optional
        Number of K-fold partitions (default 20).

    Returns
    -------
    float
        Mean of the per-fold Spearman correlations.
    """
    scores = []
    kf = KFold(folds)
    features = df.drop(["label", "dialogId", "user"], axis=1)

    # Convert to arrays once; the conversion does not depend on the fold.
    feature_values = features.values
    label_values = df["label"].values

    for train_index, test_index in kf.split(df):
        feat_train, feat_test = feature_values[train_index], feature_values[test_index]
        label_train, label_test = label_values[train_index], label_values[test_index]

        dtrain = xgb.DMatrix(feat_train, label_train, feature_names=features.columns)
        bst = xgb.train(param, dtrain, num_boost_round=150)

        dtest = xgb.DMatrix(feat_test, feature_names=features.columns)
        preds = bst.predict(dtest)

        scores.append(spearmanr(preds, label_test).correlation)
        # Parenthesised call: prints the same under Python 2, and unlike the
        # bare print statement it is valid syntax on Python 3.
        print(scores[-1])

    return np.mean(scores)

In [6]:
def reg_to_binary(df):
    """Expand a frame into all ordered row pairs with a boolean comparison label.

    Every row is paired with every row (itself included) by joining the frame
    to itself on a constant helper column named "merge". Columns taken from
    the first row of a pair carry the suffix "_x", those from the second row
    the suffix "_y". A new "label" column is True where the first row's label
    is strictly greater than the second's; the two per-row label columns are
    removed. The helper "merge" column stays in the returned frame. The input
    frame is not modified.
    """
    keyed = df.assign(merge=1)
    pairs = keyed.merge(keyed, on="merge")
    pairs["label"] = pairs["label_x"].gt(pairs["label_y"])
    return pairs.drop(["label_x", "label_y"], axis=1)

In [7]:
def fit(df_train):
    """Train a booster on a pairwise frame using the `param_bin` settings.

    `df_train` must contain "label" plus the pair-identifier columns
    "dialogId_x", "dialogId_y", "user_x" and "user_y". Those five columns are
    excluded from the feature matrix; all remaining columns are used as
    features. Training runs for 150 boosting rounds and the trained booster
    is returned.
    """
    targets = df_train["label"].values
    non_feature_cols = ["label", "dialogId_x", "dialogId_y", "user_x", "user_y"]
    feature_frame = df_train.drop(non_feature_cols, axis=1)

    train_matrix = xgb.DMatrix(
        feature_frame.values,
        targets,
        feature_names=feature_frame.columns,
    )
    return xgb.train(param_bin, train_matrix, num_boost_round=150)

In [15]:
def to_order(df, bst):
    """Rank (dialogId_x, user_x) keys by their summed pairwise predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        Pairwise frame containing "label", "dialogId_x", "dialogId_y",
        "user_x" and "user_y"; all other columns are fed to the model as
        features. The frame is not modified.
    bst : booster
        Trained model whose `predict` returns one value per row of `df`.

    Returns
    -------
    list of tuple
        One `(dialogId_x, user_x)` tuple per distinct key, ordered from the
        largest to the smallest sum of predictions over that key's rows.
    """
    features = df.drop(["label","dialogId_x","dialogId_y","user_x","user_y"], axis=1)
    dtest = xgb.DMatrix(features.values, feature_names=features.columns)

    # Keep the predictions in a local array instead of writing a "predict"
    # column into the caller's frame: that column would be picked up as an
    # extra feature if the same frame were scored again.
    preds = bst.predict(dtest)

    counter = Counter()
    # Walk the two key columns and the predictions in lockstep. Unlike
    # building a Series per row with df.iloc[i], this keeps the key values in
    # their own column dtype and avoids per-row object construction.
    for dialog_id, user, pred in zip(df["dialogId_x"].tolist(), df["user_x"].tolist(), preds):
        counter[(dialog_id, user)] += float(pred)

    return [key for key, _ in counter.most_common()]

In [16]:
def order_to_scores(pairs):
    """Turn an ordered sequence of (dialogId, user) pairs into a score table.

    Each pair receives its zero-based position in `pairs` as its "score", so
    pairs that come earlier in the sequence get smaller scores. Returns a
    DataFrame with the columns "dialogId", "user" and "score".
    """
    ranked = [
        (dialog_id, user, position)
        for position, (dialog_id, user) in enumerate(pairs)
    ]
    return pd.DataFrame.from_records(ranked, columns=["dialogId","user","score"])

In [17]:
# Hold out a test split using train_test_split's default proportions.
# A fixed random_state makes the shuffle reproducible, so re-running the
# notebook from a fresh kernel evaluates on the same held-out rows.
train_df, test_df = train_test_split(df, random_state=42)

In [None]:
# Train the pairwise classifier on the first 600 training rows only.
# reg_to_binary pairs every row with every row, so 600 rows become
# 600 * 600 = 360,000 training pairs (presumably the reason for the slice).
bst = fit(reg_to_binary(train_df[:600]))

In [None]:
# Build all pairs from the first 200 test rows (200 * 200 = 40,000 pairs) and
# rank the (dialogId, user) keys by their summed pairwise predictions.
order = to_order(reg_to_binary(test_df[:200]), bst)

In [None]:
# Convert the ranking into a table with one row per (dialogId, user) and its
# zero-based rank position as "score".
res = order_to_scores(order)

In [None]:
# Join the predicted rank positions with the test rows on the
# (dialogId, user) key so each score sits next to its true label.
mg = pd.merge(res,test_df, on=["dialogId","user"])

In [None]:
# Spearman rank correlation between the true labels and the predicted rank
# positions.
# NOTE(review): "score" is the position in to_order's output, where 0 is the
# key with the largest summed prediction, so a model that ranks well should
# produce a NEGATIVE correlation here — confirm this sign convention is intended.
spearmanr(mg["label"],mg["score"])

In [29]:
# Number of rows in the held-out test split.
len(test_df)

429