In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import KFold
import quadratic_weighted_kappa
from sklearn.metrics import make_scorer
from sklearn.cross_validation import train_test_split
import feature_generator
from scipy.stats import pearsonr
from scipy import optimize
from XgBoost import XGBoostModel
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import Ridge

In [2]:
dfTrain = pd.read_csv('train.csv')
dfTest = pd.read_csv('test.csv')    

train, _, labels = feature_generator.GetFeatures(dfTrain, dfTest, 10, True)
metaFeatures = train.columns

#GetFeatures currently modifies dfTest, need to re-create them
dfTest = pd.read_csv('test.csv')    

Product_Info_2
Product_Info_3
Employment_Info_2
InsuredInfo_3
Medical_History_2
Medical_History_10
Scaling...


In [14]:
class StackingResults:
    
    def __init__(self, stackingModel, cutPoints, qwk):
        self.stackingModel = stackingModel
        self.cutPoints = cutPoints
        self.qwk = qwk

In [15]:
def calculateStackingResults(train_index, test_index, trainModelPredictions, testModelPredictions, featuresToUse):

    xTrain = trainModelPredictions[featuresToUse].values
    yTrain = labels.iloc[train_index]

    eta_list = [0.05] * 250 
    eta_list = eta_list + [0.02] * 450 
    stackingModel = LinearRegression()
    stackingModel.fit(xTrain, yTrain)

    xTest = testModelPredictions[featuresToUse].values
    yTest = labels.iloc[test_index]

    trainPredictions = stackingModel.predict(xTrain)
    predictions = stackingModel.predict(xTest)

    cutPoints = None
    cpo = CutPointOptimizer(trainPredictions, yTrain)
    cutPoints = optimize.fmin(cpo.qwk, initialCutPoints)

    trainPredictions = np.searchsorted(cutPoints, trainPredictions) + 1   
    predictions = np.searchsorted(cutPoints, predictions) + 1   

    testQwk = quadratic_weighted_kappa.quadratic_weighted_kappa(predictions, yTest)
    trainQwk = quadratic_weighted_kappa.quadratic_weighted_kappa(trainPredictions, yTrain)
 
    print "Test QWK: %s\n" % testQwk
    print "Train QWK: %s\n" % trainQwk
    
    return stackingModel, cutPoints, predictions, yTest

In [16]:
def GetBestModel(modelsToUse, metaFeatures):
    
    K = 3
    kf = KFold(len(train), K)
    
    bestQwk = -1
        
    meanTestQwk = 0
    meanTrainQwk = 0
    num = 1
    resultList = list()

    for train_index, test_index in kf:
        
        frames = [pd.read_csv('fold%s.csv' % str(i)) for i in range(1, K+1) if i != num  ]
        trainModelPredictions = pd.concat(frames)  
        testModelPredictions = pd.read_csv('fold%s.csv' % str(num))
    
        features = list(modelsToUse)
#         features = list()
#         for model in modelsToUse:
#             for metaFeature in metaFeatures:
#                 feature = 'Meta_%s_%s' % (model, metaFeature)
#                 features.append(feature)
#                 trainModelPredictions[feature] = trainModelPredictions[model].values * train.iloc[train_index][metaFeature].values
#                 testModelPredictions[feature] = testModelPredictions[model] * train.iloc[test_index][metaFeature].values
        
        stackingModel, cutPoints, predictions, YTest = calculateStackingResults(train_index, test_index, trainModelPredictions, testModelPredictions, features)        
        overallTestQwk = quadratic_weighted_kappa.quadratic_weighted_kappa(predictions, YTest)
        
        result = StackingResults(stackingModel, cutPoints, overallTestQwk) 
        resultList.append(result)    
            
        num += 1

    return resultList

In [17]:
class CutPointOptimizer:
    
    def __init__(self, predicted, actual):
        self.predicted = predicted
        self.actual = actual

    def qwk(self, cutPoints):
        transformedPredictions = np.searchsorted(cutPoints, self.predicted) + 1            
        return -1 * quadratic_weighted_kappa.quadratic_weighted_kappa(transformedPredictions, self.actual)

initialCutPoints = np.array([1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5])

In [18]:
modelsToUse = ['Keras', 'LogisticRegression', 'XGBoostRegLin', 'BaggingDescisionTrees_n_estimators=20', 'BaggingDescisionTreeClassifiers_n_estimators=20']
# modelsToUse = ['XGBoostRegLin']
#not using BaggingLinearRegression_n_estimators=10' currently
# modelsToUse = ['Keras']
# bestStackingModel, bestCutPoints, bestFold, _, _ = 
results = GetBestModel(modelsToUse, metaFeatures)

Optimization terminated successfully.
         Current function value: -0.658207
         Iterations: 137
         Function evaluations: 285
Test QWK: 0.646614922878

Train QWK: 0.658207145534

Optimization terminated successfully.
         Current function value: -0.660980
         Iterations: 175
         Function evaluations: 331
Test QWK: 0.656595602754

Train QWK: 0.660979586119

Optimization terminated successfully.
         Current function value: -0.651304
         Iterations: 138
         Function evaluations: 263
Test QWK: 0.660876911873

Train QWK: 0.651303726608



In [44]:
def WritePredictionsToFile(results, modelsToUse, dfTest, fileName):
               
        predictions = np.zeros(len(dfTest))
        
        for i in range(1, len(results)+1):
            X = pd.read_csv('testPredictions%s.csv' % str(i))[modelsToUse]
            result = results[i-1]
            print result.qwk
            predictions += = result.qwk * (np.searchsorted(result.cutPoints, result.stackingModel.predict(X)) + 1)
            
        totalWeight = np.sum([result.qwk for result in results])
        predictions /= totalWeight
        
        print predictions
            
        
        predDf = pd.DataFrame()
        predDf['Id'] = dfTest['Id']
        predDf['Response'] = predictions
        predDf['Response'] = predDf['Response'].astype(int)
        print predDf['Response'].values
        predDf.to_csv(path_or_buf=fileName, columns=['Id', 'Response'], index=False, header=['Id', 'Response'])

In [46]:
WritePredictionsToFile(results, modelsToUse, dfTest, 'cutPointsStacking.csv', bestFold)

KeyError: "['XGBoostLinear'] not in index"

In [11]:
fold1 = pd.read_csv('fold1.csv')

for i in range(len(fold1.columns)):
    for j in range(i + 1, len(fold1.columns)):        
        print "%s, %s, %s" % (fold1.columns[i], fold1.columns[j], pearsonr(fold1[fold1.columns[i]], fold1[fold1.columns[j]]))

BaggingDescisionTrees_n_estimators=20, BaggingLinearRegression_n_estimators=10, (0.82723223060018414, 0.0)
BaggingDescisionTrees_n_estimators=20, LogisticRegression, (0.71672697762552096, 0.0)
BaggingDescisionTrees_n_estimators=20, XGBoost, (0.90545269754757352, 0.0)
BaggingDescisionTrees_n_estimators=20, XGBoostLinear, (0.82816721211796251, 0.0)
BaggingDescisionTrees_n_estimators=20, Keras100/50Layers6Epochs, (0.81860775406944197, 0.0)
BaggingDescisionTrees_n_estimators=20, Keras, (0.81700780178490495, 0.0)
BaggingDescisionTrees_n_estimators=20, Index, (0.0040305074970553995, 0.57069839828140645)
BaggingDescisionTrees_n_estimators=20, IndexNum, (0.0040305074970553995, 0.57069839828140645)
BaggingDescisionTrees_n_estimators=20, LinXGBoost, (0.9073296629252775, 0.0)
BaggingDescisionTrees_n_estimators=20, XGBoostKappa, (0.67522197855644062, 0.0)
BaggingDescisionTrees_n_estimators=20, XGBoostRegLin, (0.91606803512950197, 0.0)
BaggingDescisionTrees_n_estimators=20, BaggingDescisionTreeClas

In [41]:
K=3
num = 1
frames = [pd.read_csv('fold%s.csv' % str(i)) for i in range(1, K+1) if i != num  ]
testModelPredictions = pd.read_csv('fold%s.csv' % str(num))

features = list()
for model in modelsToUse:
    for metaFeature in metaFeatures:
        feature = 'Meta_%s_%s' % (model, metaFeature)
        features.append(feature)
        testModelPredictions[feature] = testModelPredictions[model] * dfTrain[metaFeature]
        print foldDf[model][0]
        print dfTrain[metaFeature][0]
        print dfTrain[metaFeature][20000]
        print testModelPredictions[feature][0]
#         print testModelPredictions[feature][20000]
#         print foldDf[model]
        break
    break

7.85559797287
0.641791045
0.313432836
5.04165243211
