In [112]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold, LeavePOut
from scipy.stats import spearmanr
from collections import Counter
from sklearn.model_selection import train_test_split

In [121]:
df = pd.DataFrame.from_csv("all_feats_27.csv").reset_index()

In [123]:
param = {'eta':0.1, 'max_depth':3, 'min_child_weight':1, 'gamma':0.1,
         'silent':0, 'subsample':0.9, 'colsample_bytree': 0.8,  
         'objective':'rank:pairwise'}

In [124]:
param_bin = {'eta':0.1, 'max_depth':3, 'min_child_weight':1, 'gamma':0.1,
         'silent':0, 'subsample':0.9, 'colsample_bytree': 0.8,  
         'objective':'binary:logistic'}

In [129]:
def cv(df, folds=20):
    scores = []
    kf = KFold(folds)
    label_column = df["label"]
    features = df.drop(["label", "dialogId", "user"], axis=1)
    
    allpreds = []
    
    for train_index, test_index in kf.split(df):
        feat_train, feat_test = features.values[train_index], features.values[test_index]
        label_train, label_test = label_column.values[train_index], label_column.values[test_index]
        
        dtrain = xgb.DMatrix(feat_train, label_train, feature_names=features.columns)
        bst = xgb.train(param, dtrain, num_boost_round=150)
        
        dtest = xgb.DMatrix(feat_test, feature_names=features.columns)
        preds = bst.predict(dtest)
        
        allpreds.extend(preds)
        
        scores.append(spearmanr(preds,label_test).correlation)
        print scores[-1]
    
    df["prediction"] = allpreds
    
    df.to_csv("all_predictions_lesha.csv",index=False)
    print df
    
    return np.mean(scores)

In [130]:
cv(df, folds=5)

0.565510657057
0.77977909682
0.716644255645
0.698580765357
0.769466055222
        dialogId  label  other_avg_index_lemma  other_avg_words  \
0     -315877751      1               9.000000              2.0   
1     -315877751      4                    NaN              0.0   
2    -1063204396      1                    NaN              0.0   
3    -1063204396      1               0.000000              2.0   
4     -155769874      0               4.800000              2.0   
5     -155769874      1               1.500000              2.0   
6     1327080259      0               5.800000              2.0   
7     1327080259      3               0.500000              2.0   
8    -1232588906      1                    NaN              0.0   
9    -1232588906      0                    NaN              0.0   
10    1479159704      1                    NaN              0.0   
11    1479159704      0                    NaN              0.0   
12    1220033538      1               5.000000         

0.70599616602003468

In [91]:
def reg_to_binary(df):
    dfa = df.copy()
    dfa["merge"] = 1
    large_df = pd.merge(dfa,dfa,on="merge")
    large_df["label"] = large_df["label_x"] > large_df["label_y"]
    large_df = large_df.drop(["label_x","label_y"], axis=1)
    
    return large_df

In [92]:
def fit(df_train):
    label_column = df_train["label"]
    features = df_train.drop(["label","dialogId_x","dialogId_y","user_x","user_y"], axis=1)
    
    dtrain = xgb.DMatrix(features.values, label_column.values, feature_names=features.columns)
    bst = xgb.train(param_bin, dtrain, num_boost_round=150)
    
    return bst

In [108]:
def to_order(df, bst):
    features = df.drop(["label","dialogId_x","dialogId_y","user_x","user_y"], axis=1)
    dtest = xgb.DMatrix(features.values, feature_names=features.columns)
    df["predict"] = bst.predict(dtest)
    counter = Counter()
    print df
    for tuples in df.itertuples():
        print tuples
#         for i in range(0,len(tuples),3):
#             print (tuples[i],tuples[i+1],tuples[i+2])
#             dialog_id,user,predict = (tuples[i],tuples[i+1],tuples[i+2])
#             counter[(dialog_id,user)] += int(predict)
    
    return [x[0] for x in counter.most_common(len(counter))]

In [109]:
def order_to_scores(pairs):
    records = []
    for i,(d,u) in enumerate(pairs):
        records.append((d,u,i))
    
    return pd.DataFrame.from_records(records, columns=["dialogId","user","score"])

In [95]:
train_df, test_df = train_test_split(df)

In [99]:
bst = fit(reg_to_binary(train_df[:30]))

In [110]:
order = to_order(reg_to_binary(test_df[:10]), bst)

    dialogId_x  other_avg_index_lemma_x  other_avg_words_x  \
0   2012662278                      NaN                0.0   
1   2012662278                      NaN                0.0   
2   2012662278                      NaN                0.0   
3   2012662278                      NaN                0.0   
4   2012662278                      NaN                0.0   
5   2012662278                      NaN                0.0   
6   2012662278                      NaN                0.0   
7   2012662278                      NaN                0.0   
8   2012662278                      NaN                0.0   
9   2012662278                      NaN                0.0   
10 -1307873641                 1.090909                2.0   
11 -1307873641                 1.090909                2.0   
12 -1307873641                 1.090909                2.0   
13 -1307873641                 1.090909                2.0   
14 -1307873641                 1.090909                2.0   
15 -1307

(15, -1307873641, 1.0909090909100001, 2.0, 0.0, -10.327094000000001, 209.04332500000001, 14.090909090899999, 11.0, 1.0909090909100001, -0.079871103167499999, 0.27669459581400002, -0.26422491669699999, -0.104628667235, 0.128043368459, -0.019678801298099999, -0.095828875899300003, 0.16791348159299999, -0.36183345317800003, 0.072167515754699998, -0.104208216071, -0.13553492724899999, 0.12329145520899999, 0.099823385477100013, 0.042936690151699992, -0.32094264030499997, 0.24846439063499998, -0.16860184073399997, 0.073478341102600001, 0.069877609610600008, -0.021814176812800002, -0.11254690587500001, -0.06801911443469999, -0.081619508564499996, 0.143074184656, 0.10316387563899999, -0.118234731257, 0.062131226062799999, -0.036382745951400002, 0.085331246256799997, 0.12589207291599999, -0.0061596860177799998, -0.25533106923099996, 0.13343065977099999, -0.14473047852499998, -0.00903566926718, 0.050561886280800002, -0.07463023811580001, -0.10211473703400001, 0.10386242717499999, 0.160839572549,

In [None]:
res = order_to_scores(order)

In [None]:
mg = pd.merge(res,test_df, on=["dialogId","user"])

In [15]:
spearmanr(mg["label"],mg["score"])

SpearmanrResult(correlation=-0.32943739708057396, pvalue=0.35260521591058591)

In [29]:
len(test_df)

429

In [105]:
test_df[:10]

Unnamed: 0,dialogId,label,other_avg_index_lemma,other_avg_words,other_context_cosine,other_ngram_logprob,other_ngram_ppl,other_no_vocab_tokens,other_phrases_cnt,other_topN_count,...,self_w2v_92,self_w2v_93,self_w2v_94,self_w2v_95,self_w2v_96,self_w2v_97,self_w2v_98,self_w2v_99,self_words_cnt,user
354,2012662278,0,,0.0,,,,,0.0,,...,,,,,,,,,2.0,Bob
975,-1307873641,0,1.090909,2.0,0.0,-10.327094,209.043325,14.090909,11.0,1.090909,...,-0.061068,0.010745,0.128771,-0.016783,-0.135848,-0.100278,0.028226,-0.008747,20.0,Alice
1599,1429171428,0,0.0,3.0,0.378163,-2.934429,859.8623,18.666667,3.0,0.0,...,-0.106024,0.15431,0.001265,-0.022759,-0.054278,-0.127895,0.037324,-0.017102,9.0,Alice
546,-1959800315,1,4.0,2.0,0.853302,-34.68815,2943.165,14.0,2.0,4.0,...,-0.016944,0.02312,0.132518,-0.041844,-0.059754,-0.11939,0.012231,0.012817,4.0,Bob
390,-138413681,1,6.333333,2.0,0.64889,-14.444387,527.73453,14.333333,3.0,6.333333,...,0.016515,0.241768,0.030048,-0.012269,-0.363328,-0.242339,-0.086872,-0.006948,2.0,Bob
85,165128883,3,6.285714,2.0,0.801944,-19.403645,11021.595376,14.142857,7.0,6.285714,...,0.024973,0.1088,0.100289,-0.088209,0.016817,0.011519,-0.088833,-0.037486,18.0,Alice
1274,1090064881,1,2.2,2.0,,-12.83737,12690.42515,13.6,5.0,2.2,...,,,,,,,,,0.0,Bob
1439,-358065840,2,3.6,3.0,0.871206,-12.139264,701.8522,18.4,5.0,3.6,...,-0.05896,0.053519,0.053068,-0.024408,-0.09762,-0.142954,0.008776,0.080267,15.0,Alice
256,-1170545435,3,3.545455,2.0,0.82039,-13.025816,2194.713433,14.363636,11.0,3.545455,...,-0.04187,-0.005089,0.118866,0.037522,-0.025495,-0.098351,-0.07956,0.072137,16.0,Bob
691,939751164,0,,0.0,,,,,0.0,,...,,,,,,,,,6.0,Alice
