In [2]:
import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import Adadelta
from keras.layers.normalization import BatchNormalization
import quadratic_weighted_kappa
from sklearn.cross_validation import KFold
import feature_generator

Using Theano backend.


In [3]:
class NN:
    #I made a small wrapper for the Keras model to make it more scikit-learn like
    #I think they have something like this built in already, oh well
    #See http://keras.io/ for parameter options
    def __init__(self, inputShape, layers, dropout = [], activation = 'relu', init = 'uniform', loss = 'rmse', optimizer = 'adadelta', nb_epochs = 50, batch_size = 32, verbose = 1):

        model = Sequential()
        for i in range(len(layers)):
            if i == 0:
                print ("Input shape: " + str(inputShape))
                print ("Adding Layer " + str(i) + ": " + str(layers[i]))
                model.add(Dense(layers[i], input_dim = inputShape, init = init))
            else:
                print ("Adding Layer " + str(i) + ": " + str(layers[i]))
                model.add(Dense(layers[i], init = init))
            print ("Adding " + activation + " layer")
            model.add(Activation(activation))
            model.add(BatchNormalization())
            if len(dropout) > i:
                print ("Adding " + str(dropout[i]) + " dropout")
                model.add(Dropout(dropout[i]))
        model.add(Dense(1, init = init)) #End in a single output node for regression style output
        model.compile(loss=loss, optimizer=optimizer)
        
        self.model = model
        self.nb_epochs = nb_epochs
        self.batch_size = batch_size
        self.verbose = verbose

    def fit(self, X, y): 
        self.model.fit(X, y, nb_epoch=self.nb_epochs, batch_size=self.batch_size, verbose = self.verbose)
        
    def predict(self, X, batch_size = 128, verbose = 1):
        return self.model.predict(X, batch_size = batch_size, verbose = verbose)

In [4]:
class pdStandardScaler:
    #Applies the sklearn StandardScaler to pandas dataframes
    def __init__(self):
        from sklearn.preprocessing import StandardScaler
        self.StandardScaler = StandardScaler()
    def fit(self, df):
        self.StandardScaler.fit(df)
    def transform(self, df):
        df = pd.DataFrame(self.StandardScaler.transform(df), columns=df.columns)
        return df
    def fit_transform(self, df):
        df = pd.DataFrame(self.StandardScaler.fit_transform(df), columns=df.columns)
        return df
        
def getDummiesInplace(columnList, train, test = None):
    #Takes in a list of column names and one or two pandas dataframes
    #One-hot encodes all indicated columns inplace
    columns = []
    
    if test is not None:
        df = pd.concat([train,test], axis= 0)
    else:
        df = train
        
    for columnName in df.columns:
        index = df.columns.get_loc(columnName)
        if columnName in columnList:
            dummies = pd.get_dummies(df.ix[:,index], prefix = columnName, prefix_sep = ".")
            columns.append(dummies)
        else:
            columns.append(df.ix[:,index])
    df = pd.concat(columns, axis = 1)
    
    if test is not None:
        train = df[:train.shape[0]]
        test = df[train.shape[0]:]
        return train, test
    else:
        train = df
        return train
        
def pdFillNAN(df, strategy = "mean"):
    #Fills empty values with either the mean value of each feature, or an indicated number
    if strategy == "mean":
        return df.fillna(df.mean())
    elif type(strategy) == int:
        return df.fillna(strategy)

In [5]:
def make_dataset(useDummies, fillNANStrategy, useNormalization, train, test):
    data_dir = "./"
    
    labels = train["Response"]
    train.drop(labels = "Id", axis = 1, inplace = True)
    train.drop(labels = "Response", axis = 1, inplace = True)
    test.drop(labels = "Id", axis = 1, inplace = True)
    
    categoricalVariables = ["Product_Info_1", "Product_Info_2", "Product_Info_3", "Product_Info_5", "Product_Info_6", "Product_Info_7", "Employment_Info_2", "Employment_Info_3", "Employment_Info_5", "InsuredInfo_1", "InsuredInfo_2", "InsuredInfo_3", "InsuredInfo_4", "InsuredInfo_5", "InsuredInfo_6", "InsuredInfo_7", "Insurance_History_1", "Insurance_History_2", "Insurance_History_3", "Insurance_History_4", "Insurance_History_7", "Insurance_History_8", "Insurance_History_9", "Family_Hist_1", "Medical_History_2", "Medical_History_3", "Medical_History_4", "Medical_History_5", "Medical_History_6", "Medical_History_7", "Medical_History_8", "Medical_History_9", "Medical_History_10", "Medical_History_11", "Medical_History_12", "Medical_History_13", "Medical_History_14", "Medical_History_16", "Medical_History_17", "Medical_History_18", "Medical_History_19", "Medical_History_20", "Medical_History_21", "Medical_History_22", "Medical_History_23", "Medical_History_25", "Medical_History_26", "Medical_History_27", "Medical_History_28", "Medical_History_29", "Medical_History_30", "Medical_History_31", "Medical_History_33", "Medical_History_34", "Medical_History_35", "Medical_History_36", "Medical_History_37", "Medical_History_38", "Medical_History_39", "Medical_History_40", "Medical_History_41"]

    if useDummies == True:
        print ("Generating dummies...")
        train, test = getDummiesInplace(categoricalVariables, train, test)
    
    if fillNANStrategy is not None:
        print ("Filling in missing values...")
        train = pdFillNAN(train, fillNANStrategy)
        test = pdFillNAN(test, fillNANStrategy)

    if useNormalization == True:
        print ("Scaling...")
        scaler = pdStandardScaler()
        train = scaler.fit_transform(train)
        test = scaler.transform(test)
    
    return train, test, labels

In [6]:
dfTrain = pd.read_csv('train.csv')
dfTest = pd.read_csv('test.csv')

In [32]:
# features = feature_generator.GetFeatures(dfTrain, dfTest, 100)

Medical_History_2
Medical_History_10


In [8]:
print ("Creating dataset...") 
modelName = 'Keras100/50Layers6Epochs'
kf = KFold(len(dfTrain), 3)
num = 1
num_inputs = train.shape[1]

clf = NN(inputShape = num_inputs, layers = [100, 50], dropout = [0.5, 0.5], activation='sigmoid', loss='mae', optimizer = 'adadelta', init = 'glorot_normal', nb_epochs = 6)

for train_index, test_index in kf:

    predictionsDF = pd.read_csv('fold%s.csv' % str(num))    
#     xTrain = dfTrain.iloc[train_index][features].values
#     yTrain = dfTrain.iloc[train_index]['Response'].values 
    
    xTrain = train.iloc[train_index].values
    yTrain = labels.iloc[train_index].values 
    clf.fit(xTrain, yTrain)

    xValidate = train.iloc[test_index].values
    yValidate = labels.iloc[test_index]
    predictions = np.clip(clf.predict(xValidate), 1, 8)
#     predictions = np.clip(clf.predict(train.values), 1, 8)
    trainPredictions = np.clip(clf.predict(xTrain), 1, 8)

    print quadratic_weighted_kappa.quadratic_weighted_kappa(predictions, yValidate)
    print quadratic_weighted_kappa.quadratic_weighted_kappa(trainPredictions, yTrain)
    
    predictionsDF[modelName] = predictions
    predictionsDF.to_csv(path_or_buf='fold%s.csv' % str(num), index=False)
    num += 1

testDF = pd.read_csv('testPredictions.csv')            
xTest = test.values
testPredictions = np.clip(clf.predict(xTest), 1, 8)
testDF[modelName] = testPredictions
testDF.to_csv(path_or_buf='testPredictions.csv', index=False)
    
# print ("Training model...")
# clf.fit(train, labels)

# print ("Making predictions...")
# pred = clf.predict(test)
# predClipped = np.clip(np.round(pred), 1, 8).astype(int) #Make the submissions within the accepted range

# submission = pd.read_csv('../input/sample_submission.csv')
# submission["Response"] = predClipped
# submission.to_csv('NNSubmission.csv', index=False)

Creating dataset...
Input shape: 1077
Adding Layer 0: 100
Adding sigmoid layer
Adding 0.5 dropout
Adding Layer 1: 50
Adding sigmoid layer
Adding 0.5 dropout
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
0.587861528082
0.616101674688
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
0.587819393989
0.618205808878
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
0.622001120047
0.63897517134


In [7]:
train, test, labels = make_dataset(True, "mean", True, dfTrain, dfTest)

Generating dummies...
Filling in missing values...
Scaling...


In [50]:
print labels.iloc[train_index]

19794    8
19795    8
19796    8
19797    8
19798    1
19799    1
19800    2
19801    2
19802    7
19803    2
19804    8
19805    2
19806    7
19807    2
19808    6
19809    6
19810    1
19811    2
19812    1
19813    1
19814    6
19815    8
19816    8
19817    7
19818    7
19819    8
19820    3
19821    8
19822    7
19823    2
        ..
59351    5
59352    6
59353    6
59354    6
59355    6
59356    6
59357    6
59358    6
59359    6
59360    5
59361    2
59362    7
59363    6
59364    1
59365    5
59366    8
59367    6
59368    8
59369    2
59370    4
59371    6
59372    2
59373    8
59374    7
59375    8
59376    4
59377    7
59378    8
59379    8
59380    7
Name: Response, dtype: int64


[[ 8.      ]
 [ 3.859828]
 [ 8.      ]
 ..., 
 [ 8.      ]
 [ 8.      ]
 [ 8.      ]]
