In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import KFold
import quadratic_weighted_kappa
from sklearn.metrics import make_scorer
from sklearn.cross_validation import train_test_split
import feature_generator
from scipy.stats import pearsonr
from scipy import optimize

In [2]:
# Load the training and test tables from the working directory.
dfTrain, dfTest = map(pd.read_csv, ('train.csv', 'test.csv'))

In [11]:
def GetBestModel(modelsToUse):
    """Fit a linear stacking model on each cross-validation split and keep the best.

    For each of the K splits, a LinearRegression is trained on the base-model
    prediction columns named in ``modelsToUse`` (read from ``fold1.csv`` ..
    ``fold<K>.csv``) against the ``Response`` column of the module-level
    ``dfTrain``. The model with the highest held-out quadratic weighted kappa
    is retained.

    The fold files are presumably row-aligned with the KFold test split of the
    same number (fold1.csv <-> first split, ...) -- TODO confirm against the
    code that wrote them. Only the row counts are verified here.

    Args:
        modelsToUse: list of column names to select from the fold CSV files.

    Returns:
        Tuple ``(bestModel, bestCutPoints, meanTestQwk, meanTrainQwk)``.
        ``bestCutPoints`` is always ``None`` because cut-point optimisation is
        currently disabled in this function.

    Raises:
        ValueError: if the fold CSV row counts do not match the KFold split.
    """
    K = 3
    kf = KFold(len(dfTrain), K)

    # Read every fold file exactly once; each one is needed in all K iterations
    # (K - 1 times as training data, once as the held-out fold).
    foldFrames = [pd.read_csv('fold%s.csv' % foldNumber) for foldNumber in range(1, K + 1)]

    bestQwk = -1
    bestModel = None
    bestCutPoints = None  # never updated while cut-point optimisation is disabled

    meanTestQwk = 0
    meanTrainQwk = 0

    for heldOut, (train_index, test_index) in enumerate(kf):
        trainFrames = [frame for position, frame in enumerate(foldFrames) if position != heldOut]

        xTrain = pd.concat(trainFrames)[modelsToUse].values
        yTrain = dfTrain.iloc[train_index]['Response']

        xTest = foldFrames[heldOut][modelsToUse].values
        yTest = dfTrain.iloc[test_index]['Response']

        # Features come from files and labels from the KFold indices; fail
        # loudly if the two sources disagree on how many rows a fold has.
        if len(xTrain) != len(yTrain) or len(xTest) != len(yTest):
            raise ValueError(
                "fold CSV row counts do not match the KFold split for fold %d" % (heldOut + 1))

        stackingModel = LinearRegression()
        stackingModel.fit(xTrain, yTrain)

        trainPredictions = stackingModel.predict(xTrain)
        predictions = stackingModel.predict(xTest)

        # NOTE(review): cut-point optimisation (optimize.fmin over thresholds
        # applied with np.searchsorted(cutPoints, predictions) + 1) is disabled,
        # so the kappa below is computed on the raw continuous regression
        # outputs. Confirm quadratic_weighted_kappa handles non-integer ratings
        # as intended.
        testQwk = quadratic_weighted_kappa.quadratic_weighted_kappa(predictions, yTest)
        trainQwk = quadratic_weighted_kappa.quadratic_weighted_kappa(trainPredictions, yTrain)

        print("TestQWK: %s\n" % testQwk)
        print("Train QWK: %s\n" % trainQwk)

        meanTestQwk += (testQwk / K)
        meanTrainQwk += (trainQwk / K)

        if testQwk > bestQwk:
            bestQwk = testQwk
            bestModel = stackingModel

    return bestModel, bestCutPoints, meanTestQwk, meanTrainQwk

In [10]:
# Base-model prediction columns fed to the stacking model.
# 'BaggingLinearRegression_n_estimators=10' is currently left out.
modelsToUse = [
    'LogisticRegression',
    'Keras100/50Layers6Epochs',
    'XGBoost',
    'BaggingDescisionTrees_n_estimators=20',
]
# modelsToUse = ['XGBoost']

# Only the model and cut points are needed. Slicing (instead of unpacking the
# two mean-QWK values into `_`) avoids rebinding `_`, which IPython uses for
# the most recent cell output.
bestStackingModel, bestCutPoints = GetBestModel(modelsToUse)[:2]

No Stack: 0.646270626345

TestQWK: 0.585221211541

Train QWK: 0.590117036476

No Stack: 0.639844414905

TestQWK: 0.589867138885

Train QWK: 0.58775413105

No Stack: 0.649820597261

TestQWK: 0.562309941777

Train QWK: 0.562594123404



In [14]:
def WritePredictionsToFile(stackingModel, cutPoints, modelsToUse, dfTest, fileName):
    """Predict classes for the test set with the stacking model and write them to CSV.

    Reads the base-model predictions from ``testPredictions.csv``, feeds the
    ``modelsToUse`` columns to ``stackingModel.predict``, converts the
    continuous output to 1-based classes using ``cutPoints`` and writes an
    ``Id,Response`` CSV to ``fileName``.

    Args:
        stackingModel: object with a ``predict(X)`` method returning one
            continuous value per row.
        cutPoints: thresholds separating consecutive classes. They are sorted
            before use because ``np.searchsorted`` requires ascending input.
            ``None`` falls back to the half-integer thresholds 1.5 .. 7.5
            (the same values as ``initialCutPoints`` in this notebook), i.e.
            rounding to the nearest class on a 1..8 scale.
        modelsToUse: column names to select from ``testPredictions.csv``.
        dfTest: DataFrame providing the ``Id`` column, in the same row order as
            ``testPredictions.csv`` (presumably -- verify against the code that
            wrote that file).
        fileName: destination path of the CSV.
    """
    if cutPoints is None:
        # GetBestModel returns None while cut-point optimisation is disabled;
        # without a fallback np.searchsorted fails with an opaque error.
        thresholds = np.arange(1.5, 8.0, 1.0)
    else:
        # An optimiser is free to return thresholds out of order, and
        # searchsorted silently gives wrong bins for unsorted input.
        thresholds = np.sort(np.asarray(cutPoints, dtype=float))

    X = pd.read_csv('testPredictions.csv')[modelsToUse].values
    predictions = np.searchsorted(thresholds, stackingModel.predict(X)) + 1

    predDf = pd.DataFrame()
    predDf['Id'] = dfTest['Id']
    predDf['Response'] = predictions
    print(predictions)
    predDf.to_csv(path_or_buf=fileName, columns=['Id', 'Response'], index=False)

In [16]:
# Write the stacked test-set predictions using the model and cut points selected above.
WritePredictionsToFile(
    stackingModel=bestStackingModel,
    cutPoints=bestCutPoints,
    modelsToUse=modelsToUse,
    dfTest=dfTest,
    fileName='cutPointsStacking.csv',
)

[3 6 7 ..., 4 1 3]


In [12]:
# Print the Pearson correlation (coefficient, p-value) for every unordered pair
# of columns in fold1.csv, to see how similar the base models' predictions are.
fold1 = pd.read_csv('fold1.csv')
columnNames = list(fold1.columns)

for i, firstName in enumerate(columnNames):
    # Only pair with later columns so each pair is reported once.
    for secondName in columnNames[i + 1:]:
        correlation = pearsonr(fold1[firstName], fold1[secondName])
        print("%s, %s, %s" % (firstName, secondName, correlation))

BaggingDescisionTrees_n_estimators=20, BaggingLinearRegression_n_estimators=10, (0.83540238769065311, 0.0)
BaggingDescisionTrees_n_estimators=20, LogisticRegression, (0.71984046254126033, 0.0)
BaggingDescisionTrees_n_estimators=20, XGBoost, (0.90723913925647082, 0.0)
BaggingDescisionTrees_n_estimators=20, XGBoostLinear, (0.89283334232602007, 0.0)
BaggingDescisionTrees_n_estimators=20, Keras100/50Layers6Epochs, (0.82664006360441522, 0.0)
BaggingLinearRegression_n_estimators=10, LogisticRegression, (0.79807322775143286, 0.0)
BaggingLinearRegression_n_estimators=10, XGBoost, (0.91090304621084206, 0.0)
BaggingLinearRegression_n_estimators=10, XGBoostLinear, (0.88238511312863521, 0.0)
BaggingLinearRegression_n_estimators=10, Keras100/50Layers6Epochs, (0.93524853166482902, 0.0)
LogisticRegression, XGBoost, (0.78100656315711203, 0.0)
LogisticRegression, XGBoostLinear, (0.76372491674735887, 0.0)
LogisticRegression, Keras100/50Layers6Epochs, (0.84257605054523654, 0.0)
XGBoost, XGBoostLinear, (0

In [10]:


# Sketch of the objective intended for cut-point optimisation (not active):
#
# def qwk(cutPoints):
#     transformedPredictions = np.searchsorted(cutPoints, predicted) + 1
#     return -1 * quadratic_weighted_kappa.quadratic_weighted_kappa(transformedPredictions, actual)
#
# i.e. bin the continuous predictions with the thresholds and return the
# negated kappa so that a minimiser such as optimize.fmin maximises agreement.

# Starting thresholds: the half-integers between consecutive classes 1..8.
initialCutPoints = np.array([1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5])
print(initialCutPoints)

[ 1.5  2.5  3.5  4.5  5.5  6.5  7.5]
