In [34]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import KFold
import quadratic_weighted_kappa
from sklearn.metrics import make_scorer
from sklearn.cross_validation import train_test_split
import feature_generator
from scipy.stats import pearsonr

In [3]:
# Load the train and test tables in one step; both files are read from the
# current working directory.
dfTrain, dfTest = map(pd.read_csv, ('train.csv', 'test.csv'))

In [31]:
def GetBestModel(modelsToUse, numFolds=3):
    """Fit a linear stacking model on per-fold base-model predictions.

    For every cross-validation split of the module-level ``dfTrain``, a
    ``LinearRegression`` is fit on the ``modelsToUse`` columns of all
    'fold<i>.csv' files except the held-out one, with the matching
    ``Response`` values of ``dfTrain`` as target. Predictions are rounded to
    the nearest integer and clipped to [1, 8] before being scored with
    quadratic weighted kappa. Per-fold scores are printed as they are computed.

    NOTE(review): this relies on the rows of 'fold<i>.csv' lining up, in
    order, with the i-th test split of ``KFold(len(dfTrain), numFolds)`` --
    confirm against the code that produced those files.

    Parameters
    ----------
    modelsToUse : list of str
        Column names selected from each fold file and used as features.
    numFolds : int, optional
        Number of folds / fold files ('fold1.csv' .. 'fold<numFolds>.csv').
        Defaults to 3, the value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(bestModel, meanTestQwk, meanTrainQwk)`` where ``bestModel`` is the
        fitted regressor from the split with the highest held-out QWK (it has
        only seen ``numFolds - 1`` folds), and the two means are averaged
        over all splits.
    """
    # Parse each fold file exactly once instead of re-reading every CSV on
    # every iteration of the cross-validation loop.
    foldFrames = dict(
        (foldNum, pd.read_csv('fold%s.csv' % str(foldNum)))
        for foldNum in range(1, numFolds + 1)
    )

    bestQwk = -1
    bestModel = None
    meanTestQwk = 0
    meanTrainQwk = 0
    kf = KFold(len(dfTrain), numFolds)

    # `num` is the 1-based number of the held-out fold file for this split.
    for num, (train_index, test_index) in enumerate(kf, 1):

        # Stack the base-model predictions of every fold except the held-out one.
        frames = [foldFrames[i] for i in range(1, numFolds + 1) if i != num]
        trainModelPredictions = pd.concat(frames)

        xTrain = trainModelPredictions[modelsToUse].values
        yTrain = dfTrain.iloc[train_index]['Response']

        model = LinearRegression()
        model.fit(xTrain, yTrain)

        xTest = foldFrames[num][modelsToUse].values
        yTest = dfTrain.iloc[test_index]['Response']

        # Turn the continuous regression output into integer labels in 1..8.
        predictions = np.clip(np.rint(model.predict(xTest)).astype(int), 1, 8)
        trainPredictions = np.clip(np.rint(model.predict(xTrain)).astype(int), 1, 8)

        testQwk = quadratic_weighted_kappa.quadratic_weighted_kappa(predictions, yTest)
        trainQwk = quadratic_weighted_kappa.quadratic_weighted_kappa(trainPredictions, yTrain)

        # Single-argument parenthesised form prints identically on Python 2 and 3.
        print("TestQWK: %s\n" % testQwk)
        print("Train QWK: %s\n" % trainQwk)

        # float() guards against integer division on Python 2.
        meanTestQwk += testQwk / float(numFolds)
        meanTrainQwk += trainQwk / float(numFolds)

        if testQwk > bestQwk:
            bestQwk = testQwk
            bestModel = model

    return bestModel, meanTestQwk, meanTrainQwk

In [44]:
# Prediction columns (one per base model) that the stacking regressor is fit on.
modelsToUse = [
    'LogisticRegression',
    'BaggingLinearRegression_n_estimators=10',
    'BaggingDescisionTres_n_estimators=20',
]

# Only the best fitted model is kept; both mean QWK values are discarded.
model, _, _ = GetBestModel(modelsToUse)

TestQWK: 0.587738269752

Train QWK: 0.590360710016

TestQWK: 0.58582486302

Train QWK: 0.591686995835

TestQWK: 0.595065609256

Train QWK: 0.586881242969



In [45]:
# Show the stacker's per-column weights followed by its intercept, one per line.
print("%s\n%s" % (model.coef_, model.intercept_))

[ 0.07809293  0.44807893  0.52070013]
-0.303474966696


In [39]:
fold1 = pd.read_csv('fold1.csv')

# Report the pearsonr result for every unordered pair of columns in fold 1,
# visiting pairs in column order (each column against those to its right).
columnNames = list(fold1.columns)
for firstPos, firstName in enumerate(columnNames):
    for secondName in columnNames[firstPos + 1:]:
        correlation = pearsonr(fold1[firstName], fold1[secondName])
        print("%s, %s, %s" % (firstName, secondName, correlation))

LogisticRegression, LinearRegression, (0.80610295078124594, 0.0)
LogisticRegression, BaggingLinearRegression_n_estimators=10, (0.80567055253887121, 0.0)
LogisticRegression, BaggingDescisionTres_n_estimators=20, (0.70481985974407857, 0.0)
LogisticRegression, SVC_C=20_g=0.05, (0.42792111204520739, 0.0)
LinearRegression, BaggingLinearRegression_n_estimators=10, (0.99958321215324886, 0.0)
LinearRegression, BaggingDescisionTres_n_estimators=20, (0.81975736915972996, 0.0)
LinearRegression, SVC_C=20_g=0.05, (0.50428200271180568, 0.0)
BaggingLinearRegression_n_estimators=10, BaggingDescisionTres_n_estimators=20, (0.81938633641483039, 0.0)
BaggingLinearRegression_n_estimators=10, SVC_C=20_g=0.05, (0.5031677152634354, 0.0)
BaggingDescisionTres_n_estimators=20, SVC_C=20_g=0.05, (0.46936555715234435, 0.0)
