In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import KFold
import quadratic_weighted_kappa
from sklearn.metrics import make_scorer
from sklearn.cross_validation import train_test_split
import feature_generator
from scipy.stats import pearsonr
from scipy import optimize
from XgBoost import XGBoostModel
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import Ridge
from CutPoints import CutPointOptimizer
from NN import NN

Using Theano backend.


In [2]:
dfTrain = pd.read_csv('train.csv')
dfTest = pd.read_csv('test.csv')    

train, _, labels = feature_generator.GetFeatures(dfTrain, dfTest, 100, True)
metaFeatures = train.columns

#GetFeatures currently modifies dfTest, need to re-create them
dfTest = pd.read_csv('test.csv')    

Product_Info_2
Product_Info_3
Employment_Info_2
InsuredInfo_3
Medical_History_2
Medical_History_10
Scaling...


In [3]:
class StackingResults:
    
    def __init__(self, stackingModel, cutPoints, qwk):
        self.stackingModel = stackingModel
        self.cutPoints = cutPoints
        self.qwk = qwk

In [4]:
def calculateStackingResults(xTrain, yTrain, xTest, yTest):
    stackingModel = LinearRegression()
    print xTrain.shape
    stackingModel.fit(xTrain, yTrain)

    trainPredictions = stackingModel.predict(xTrain)
    predictions = stackingModel.predict(xTest)

    initialCutPoints = np.array([1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5])
    cpo = CutPointOptimizer(trainPredictions, yTrain)
    cutPoints = optimize.fmin(cpo.qwk, initialCutPoints)

    trainPredictions = np.searchsorted(cutPoints, trainPredictions) + 1   
    predictions = np.searchsorted(cutPoints, predictions) + 1   

    testQwk = quadratic_weighted_kappa.quadratic_weighted_kappa(predictions, yTest)
    trainQwk = quadratic_weighted_kappa.quadratic_weighted_kappa(trainPredictions, yTrain)
 
    print "Test QWK: %s\n" % testQwk
    print "Train QWK: %s\n" % trainQwk
    
    return stackingModel, cutPoints, predictions, yTest

In [5]:
def GetBestModel(modelsToUse, metaFeatures):
    
    K = 5
    kf = KFold(len(train), K)
    indices = [indices for indices in kf]
    train_indices = [index[0] for index in indices]
    test_indices = [index[1] for index in indices]
        
    meanTestQwk = 0
    meanTrainQwk = 0
    num = 1
    resultList = list()

    for i in range(1, K+1):
        
        trainFile = 'combinedTrainPredictions%s.csv' % str(num)
        testFile = 'combinedTestPredictions%s.csv' % str(num)

        trainDF = pd.read_csv(trainFile) if os.path.isfile(trainFile) else pd.DataFrame()  
        testDF = pd.read_csv(testFile) if os.path.isfile(testFile) else pd.DataFrame()  
    
        features = list(modelsToUse)
#         features = list()
#         for j in range(len(modelsToUse)):
#             for  k in range(len(modelsToUse)):
#                 model1 = modelsToUse[j]
#                 model2 = modelsToUse[k]
#                 if model1 == model2:
#                     continue
#                 feature = 'Meta_%s_%s' % (model1, model2)
#                 features.append(feature)
#                 fold[feature] = fold[model1].values * fold[model2].values
 
#         features = list()
#         for model in modelsToUse:
#             for metaFeature in metaFeatures:
#                 feature = 'Meta_%s_%s' % (model, metaFeature)
#                 features.append(feature)
#                 fold[feature] = fold[model].values * train.iloc[test_indices[i-1]][metaFeature].values

        stackingModel.fit(trainDF[features].values)

        X_train, X_test, y_train, y_test = train_test_split(fold[features].values, labels.iloc[test_indices[i-1]].values, test_size=0.20, random_state=0)
        stackingModel, cutPoints, predictions, YTest = calculateStackingResults(X_train, y_train, X_test, y_test)        
        overallTestQwk = quadratic_weighted_kappa.quadratic_weighted_kappa(predictions, YTest)
        
        result = StackingResults(stackingModel, cutPoints, overallTestQwk) 
        resultList.append(result)    
            
        num += 1

    return resultList

In [6]:
modelsToUse = ['Keras', 'BaggingLinearRegression_n_estimators=10', 'LogisticRegression', 'XGBoostRegLin', 'BaggingDescisionTrees_n_estimators=20', 'BaggingDescisionTreeClassifiers_n_estimators=20']
# modelsToUse = ['XGBoostRegLin']
#not using BaggingLinearRegression_n_estimators=10' currently
# modelsToUse = ['Keras']
# bestStackingModel, bestCutPoints, bestFold, _, _ = 
results = GetBestModel(modelsToUse, metaFeatures)

(15835, 2)
Optimization terminated successfully.
         Current function value: -0.648835
         Iterations: 122
         Function evaluations: 245
Test QWK: 0.654457238067

Train QWK: 0.648835025104

(15835, 2)
Optimization terminated successfully.
         Current function value: -0.645496
         Iterations: 94
         Function evaluations: 214
Test QWK: 0.627220698781

Train QWK: 0.645495595143

(15834, 2)
Optimization terminated successfully.
         Current function value: -0.643087
         Iterations: 105
         Function evaluations: 227
Test QWK: 0.666773629234

Train QWK: 0.64308708728



In [None]:
def WritePredictionsToFile(results, modelsToUse, dfTest, fileName):
               
        predictions = np.zeros(len(dfTest))
        
        for i in range(1, len(results)+1):
            X = pd.read_csv('testPredictions%s.csv' % str(i))[modelsToUse]
            result = results[i-1]
            print result.qwk
            predictions += result.qwk * (np.searchsorted(result.cutPoints, result.stackingModel.predict(X)) + 1)
            
        totalWeight = np.sum([result.qwk for result in results])
        predictions /= totalWeight
        print predictions
        predictions = np.round(predictions).astype(int)
            
        
        predDf = pd.DataFrame()
        predDf['Id'] = dfTest['Id']
        predDf['Response'] = predictions
        print predDf['Response'].values
        predDf.to_csv(path_or_buf=fileName, columns=['Id', 'Response'], index=False, header=['Id', 'Response'])

In [None]:
WritePredictionsToFile(results, modelsToUse, dfTest, 'cutPointsStacking.csv')

In [7]:
fold1 = pd.read_csv('fold1.csv')

for i in range(len(fold1.columns)):
    for j in range(i + 1, len(fold1.columns)):        
        print "%s, %s, %s" % (fold1.columns[i], fold1.columns[j], pearsonr(fold1[fold1.columns[i]], fold1[fold1.columns[j]]))

BaggingDescisionTrees_n_estimators=20, BaggingLinearRegression_n_estimators=10, (0.82723223060018414, 0.0)
BaggingDescisionTrees_n_estimators=20, LogisticRegression, (0.71672697762552096, 0.0)
BaggingDescisionTrees_n_estimators=20, XGBoost, (0.90545269754757352, 0.0)
BaggingDescisionTrees_n_estimators=20, XGBoostLinear, (0.82816721211796251, 0.0)
BaggingDescisionTrees_n_estimators=20, Keras100/50Layers6Epochs, (0.81860775406944197, 0.0)
BaggingDescisionTrees_n_estimators=20, Keras, (0.81871121498888999, 0.0)
BaggingDescisionTrees_n_estimators=20, Index, (0.0040305074970553995, 0.57069839828140645)
BaggingDescisionTrees_n_estimators=20, IndexNum, (0.0040305074970553995, 0.57069839828140645)
BaggingDescisionTrees_n_estimators=20, LinXGBoost, (0.9073296629252775, 0.0)
BaggingDescisionTrees_n_estimators=20, XGBoostKappa, (0.67522197855644062, 0.0)
BaggingDescisionTrees_n_estimators=20, XGBoostRegLin, (0.92200272332950317, 0.0)
BaggingDescisionTrees_n_estimators=20, BaggingDescisionTreeClas

In [None]:
K=3
num = 1
frames = [pd.read_csv('fold%s.csv' % str(i)) for i in range(1, K+1) if i != num  ]
testModelPredictions = pd.read_csv('fold%s.csv' % str(num))

features = list()
for model in modelsToUse:
    for metaFeature in metaFeatures:
        feature = 'Meta_%s_%s' % (model, metaFeature)
        features.append(feature)
        testModelPredictions[feature] = testModelPredictions[model] * dfTrain[metaFeature]
        print foldDf[model][0]
        print dfTrain[metaFeature][0]
        print dfTrain[metaFeature][20000]
        print testModelPredictions[feature][0]
#         print testModelPredictions[feature][20000]
#         print foldDf[model]
        break
    break