In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import NearestNeighbors
import quadratic_weighted_kappa
from sklearn.linear_model import Ridge
from sklearn.metrics import make_scorer
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
import feature_generator
import xgboost as xgb
from scipy import optimize

In [2]:
# Load the training and test sets from CSV files in the current working directory.
dfTrain = pd.read_csv('train.csv')
dfTest = pd.read_csv('test.csv')    

In [3]:
# Column selector used by the modelling cells below (as dfTrain[features] / dfTest[features]).
# NOTE(review): GetFeatures comes from the project-local feature_generator module; whether it
# also modifies dfTrain/dfTest, and what the third argument (100) controls, is not visible
# here -- confirm against that module.
features = feature_generator.GetFeatures(dfTrain, dfTest, 100)

Medical_History_2
Medical_History_10


In [4]:
class XGBoostModel:
    """Minimal fit/predict wrapper around an xgboost booster.

    Provides the same two methods (``fit`` and ``predict``) that
    WritePredictionsToFile calls on the model it is given.
    """

    def __init__(self, max_depth, num_round=700):
        """Store the training configuration; no booster is trained yet.

        max_depth -- value used for the booster's 'max_depth' parameter.
        num_round -- number of boosting rounds handed to xgb.train
                     (defaults to 700, the previously hard-coded value).
        """
        # NOTE(review): 'early_stopping_rounds' is placed in the parameter dict
        # instead of being passed to xgb.train, and the only watchlist entry is
        # the training data itself -- early stopping is probably not taking
        # effect. Confirm against the xgboost documentation before relying on it.
        self.param = {'max_depth':max_depth, 'eta':10**-1, 'silent':1, 'min_child_weight':3, 'subsample' : 0.7 ,"early_stopping_rounds":10,
          "objective"   : "count:poisson",'eval_metric': 'rmse','colsample_bytree':0.65}

        self.num_round = num_round
        # Set by fit(); remains None until a booster has been trained.
        self.bst = None

    def fit(self, xTrain, yTrain):
        """Train a booster on (xTrain, yTrain) and keep it in self.bst."""
        dtrain = xgb.DMatrix(xTrain,label=yTrain)
        # The training matrix is also the only evaluation set passed to xgb.train.
        watchlist  = [(dtrain,'train')]
        self.bst = xgb.train(self.param, dtrain, self.num_round, watchlist)

    def predict(self, testData):
        """Return the trained booster's predictions for testData.

        Raises AttributeError if fit() has not been called yet (same
        exception type as before, but with an explicit message).
        """
        if self.bst is None:
            raise AttributeError('XGBoostModel.predict() called before fit()')
        dTest = xgb.DMatrix(testData)
        return self.bst.predict(dTest)

In [5]:
def WritePredictionsToFile(model, modelName):
    """Add a model's out-of-fold and test predictions to the prediction CSVs.

    dfTrain is split into 3 folds with KFold. For fold k (1..3) the model is
    fitted on the training rows and its predictions for the held-out rows,
    clipped to [1, 8], are stored as column `modelName` in 'fold<k>.csv'.
    Afterwards the model -- in the state left by the last fold's fit --
    predicts dfTest; the clipped result is stored as column `modelName` in
    'testPredictions.csv'.

    Every CSV is read first and rewritten in place, so the files must already
    exist. Relies on the notebook globals dfTrain, dfTest and features.
    """
    for foldNumber, (trainRows, heldOutRows) in enumerate(KFold(len(dfTrain), 3), 1):
        foldPath = 'fold%s.csv' % str(foldNumber)
        foldFrame = pd.read_csv(foldPath)

        # Fit on this fold's training rows.
        trainPart = dfTrain.iloc[trainRows]
        model.fit(trainPart[features].values, trainPart['Response'])

        # Predict the held-out rows and persist them next to the other models' columns.
        heldOutPart = dfTrain.iloc[heldOutRows]
        foldFrame[modelName] = np.clip(model.predict(heldOutPart[features].values), 1, 8)
        foldFrame.to_csv(path_or_buf=foldPath, index=False)

    # NOTE(review): the model is not refitted on all of dfTrain here; the test
    # predictions come from whatever the final fold's fit produced.
    testPath = 'testPredictions.csv'
    testFrame = pd.read_csv(testPath)
    testFrame[modelName] = np.clip(model.predict(dfTest[features].values), 1, 8)
    testFrame.to_csv(path_or_buf=testPath, index=False)

In [9]:
# Record predictions for one model per run; the commented-out calls are
# alternative models that can be swapped in. The second argument is the
# column name written to the prediction CSVs.
# WritePredictionsToFile(LogisticRegression(), 'LogisticRegression')
WritePredictionsToFile(BaggingRegressor(base_estimator=DecisionTreeRegressor(), n_estimators=20), 'BaggingDescisionTrees_n_estimators=20')
# WritePredictionsToFile(BaggingRegressor(base_estimator=LinearRegression(), n_estimators=10), 'BaggingLinearRegression_n_estimators=10')

In [None]:

# Wrap the project's quadratic weighted kappa function with scikit-learn's make_scorer.
scorer = make_scorer(quadratic_weighted_kappa.quadratic_weighted_kappa)
# Leftover debugging output (disabled):
# print len(features)
# print len(dummyVariables)

In [None]:
def qwk_scorer(estimator, X, Y):
    """Quadratic weighted kappa of `estimator` on (X, Y).

    The estimator's predictions for X are clipped to [1, 8] before being
    compared with Y. Takes (estimator, X, Y) so it can be handed to
    GridSearchCV as its `scoring` callable.
    """
    rawPredictions = estimator.predict(X)
    boundedPredictions = np.clip(rawPredictions, 1, 8)
    kappa = quadratic_weighted_kappa.quadratic_weighted_kappa(boundedPredictions, Y)
    return kappa

In [None]:
# Earlier experiments, disabled. NOTE(review): GetBestModel and GenerateNewCombinedModel
# are not defined in the visible part of this notebook.
# bestRandomForest = GetBestModel(lambda: RandomForestRegressor(n_estimators=10, max_depth=10), features) 
# bestRandomForest = GetBestModel(lambda: AdaBoostRegressor(LinearRegression()), features) 
# bestCombinedModel = GetBestModel(GenerateNewCombinedModel, features, 3)

# Hold out 25% of the training data; random_state=0 keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(dfTrain[features].values, dfTrain['Response'], test_size=0.25, random_state=0)

# Hyper-parameter grid searched with 3-fold cross-validation, scored by qwk_scorer.
tuned_parameters = [{'gamma': [0.25, 0.025], 'C': [0.5, 5]}]
# Previous variant using SVR with an RBF kernel (disabled):
# clf = GridSearchCV(SVR(kernel='rbf', max_iter=1000, epsilon=0.49, tol=0.01, verbose=True), tuned_parameters, cv=3, scoring=qwk_scorer)
clf = GridSearchCV(SVC(kernel='poly', degree=1, max_iter=1000, decision_function_shape='ovr', tol=0.01, verbose=True), tuned_parameters, cv=3, scoring=qwk_scorer)
clf.fit(X_train, y_train)
# Author's notes from earlier runs (g presumably stands for gamma):
# SVR(kernel='rbf', max_iter=1, , tol=0.01, verbose=True)  
#   C=5, g=0.1   -> TestQWK: 0.24807920235173586
#   C=2, g=0.05  -> qwk = 0.21
#   C=5, g=0.05  -> qwk = 0.25465662455597715
# TODO (author): do C 

In [None]:
# Show the best grid-search parameters, then predict the held-out split and
# clip the predictions to [1, 8].
print(clf.best_params_)
clfPredictions = np.clip(clf.predict(X_test), 1, 8)

# Disabled experiment: score GetBestModel for several fold counts.
# dataPoints = list()

# folds = (2, 5, 10, 20)
# for K in folds:
#     _, testQwk, trainQwk = GetBestModel(GenerateNewCombinedModel, features, K)
#     dataPoints.append((K, testQwk, trainQwk))

In [None]:
# Print the smallest and largest clipped prediction, followed by the predictions themselves.
for summary in (np.min(clfPredictions), np.max(clfPredictions), clfPredictions):
    print(summary)

In [None]:
# Quadratic weighted kappa of the clipped grid-search predictions against the held-out labels.
quadratic_weighted_kappa.quadratic_weighted_kappa(clfPredictions, y_test)

In [None]:
# import matplotlib.pyplot as plt
# from matplotlib.font_manager import FontProperties
# fontProps = FontProperties()
# fontProps.set_size('small')


# plt.figure(1)
# train_data = [point[2] for point in dataPoints]
# test_data = [point[1] for point in dataPoints]
# plt.plot(folds, train_data, label="Training Data")
# plt.plot(folds, test_data, label="Test Data")
# plt.title("Learning Curve for Logistic Regression")
# plt.xlabel("Training Set Size")
# plt.ylabel("Classification Accuracy")
# plt.legend(prop = fontProps)
# x1,x2,y1,y2 = plt.axis()
# plt.axis((x1,x2,y1,1.1))
# plt.show()

# print dataPoints

In [None]:
# Score the combined model on the full training set, with predictions rounded
# to the nearest integer.
# NOTE(review): in the visible notebook bestCombinedModel is only assigned in a
# commented-out line, so this cell fails on a fresh kernel unless that line is restored.
trainPredictions = np.rint(bestCombinedModel[0].predict(dfTrain[features].values)).astype(int)
print(quadratic_weighted_kappa.quadratic_weighted_kappa(trainPredictions, dfTrain.Response))

In [None]:
# Predict the test set with the combined model. Round to the nearest integer
# once (the previous second rounding pass was a no-op) and clip to [1, 8], the
# same bounds applied to every other prediction in this notebook, so that no
# out-of-range label is written to the submission.
# NOTE(review): in the visible notebook bestCombinedModel is only assigned in a
# commented-out line.
bestPredictions = np.rint(bestCombinedModel[0].predict(dfTest[features].values)).astype(int)
bestPredictions = np.clip(bestPredictions, 1, 8)
dfTest['Predictions'] = bestPredictions
print(np.max(bestPredictions))

In [None]:
# Write the submission file: the Id and Predictions columns, with the header
# row renaming Predictions to 'Response'.
dfTest.to_csv(path_or_buf='Combo.csv', columns=['Id', 'Predictions'], index=False, header=['Id', 'Response'])

In [None]:
# Narrow the feature list to two columns and print the row count of each data set.
# NOTE: this rebinds the notebook-wide `features`, so cells run after this one
# see only these two columns.
features = ['Ins_Age', 'BMI']
for frame in (dfTrain, dfTest):
    print(len(frame))

In [None]:
from scipy.stats import pearsonr
import matplotlib.pyplot as plt

# Column names Medical_Keyword_1 .. Medical_Keyword_48.
keywords = ['Medical_Keyword_' + str(i) for i in range(1, 49)]
# print dfTrain[keywords].sum(axis=1)

# for keyword in keywords:
#     print pearsonr(dfTrain[keyword], dfTrain.Response)
#     break

# uniqueValues = dfTest['InsuredInfo_7'].unique()
# for i in range(len(uniqueValues)):
#     arr = dfTrain['InsuredInfo_7'].apply(lambda x: x == uniqueValues[i])


# For the first listed column that contains missing values, print: its name,
# its Pearson correlation with Response over the non-missing rows, its median,
# the number of rows with Response == 8, and the total row count. Then stop.
candidateColumns = [
    'Product_Info_4', 'Ins_Age', 'Ht', 'Wt', 'BMI',
    'Employment_Info_1', 'Employment_Info_4', 'Employment_Info_6',
    'Insurance_History_5',
    'Family_Hist_2', 'Family_Hist_3', 'Family_Hist_4', 'Family_Hist_5',
]
for column in candidateColumns:
    values = dfTrain[column]
    if not values.isnull().any():
        continue

    known = dfTrain[values.notnull()]
    print(column)
    print(pearsonr(known[column], known.Response))
    print(values.median())
    print(len(dfTrain[dfTrain.Response == 8]))
    print(len(dfTrain))
#     print dfTrain[column]
#     plt.plot(dfTrain[dfTrain[column].notnull()][column], dfTrain[dfTrain[column].notnull()].Response)
    break
# plt.show()
# print dfTest['InsuredInfo_4'].unique()
# print dfTrain['InsuredInfo_6'].apply(lambda x: x == 1)

# Missing-value count in the test set, median in the training set, and the
# largest non-missing Family_Hist_2 value.
print(dfTest['Medical_History_32'].isnull().sum())

print(dfTrain['Medical_History_32'].median())
print(dfTrain['Family_Hist_2'].dropna().max())
# print dfTrain['Medical_History_32'].null().sum()
# print dfTest[dfTest['Medical_History_32'].notnull()]['Medical_History_32'].median()
# print len(pd.concat([dfTest[dfTest['Medical_History_32'].notnull()]['Medical_History_32'], dfTrain[dfTrain['Medical_History_32'].notnull()]['Medical_History_32']]))
# print pearsonr(dfTrain[dfTrain['Medical_History_32'].notnull()]['Medical_History_32'], dfTrain[dfTrain['Medical_History_32'].notnull()]['Response'])

In [None]:
# NOTE(review): GetBestModel is not defined in the visible part of this notebook.
# Other cells treat its result as a tuple (bestCombinedModel[0]; a three-way
# unpacking in a commented-out loop), whereas the next cell calls
# neighborModel.kneighbors(...) directly -- confirm whether [0] is needed here.
neighborModel = GetBestModel(lambda: NearestNeighbors(), features)

In [None]:
# Query the neighbours of every training row. The result is indexed below as
# nearestNeighbors[0] (used as distances) and nearestNeighbors[1] (used as row positions).
nearestNeighbors = neighborModel.kneighbors(dfTrain[features].values)

In [None]:
# Distance-weighted average of the neighbours' responses for every training row.
# Each neighbour is weighted by (distance of the farthest neighbour - its own
# distance), so closer neighbours count more and the farthest one gets weight 0.
#
# Changes from the previous per-row loop:
#   * the farthest neighbour is taken from the last column instead of the
#     hard-coded index 4, so any number of neighbours works;
#   * rows whose neighbours are all equally distant (total weight 0) fall back
#     to the plain mean of the neighbours' responses instead of producing NaN
#     through a 0/0 division;
#   * the computation is vectorised.
# NOTE(review): the neighbours are queried for the training rows themselves, so
# each row is presumably among its own neighbours -- confirm this is intended
# before reading the resulting score as a validation score.
neighborDistances = np.asarray(nearestNeighbors[0], dtype=float)
neighborPositions = np.asarray(nearestNeighbors[1])

# Positional lookup, equivalent to dfTrain.iloc[positions].Response per row.
neighborResponses = dfTrain['Response'].values[neighborPositions]

weights = neighborDistances[:, -1:] - neighborDistances
weightTotals = weights.sum(axis=1)
hasWeight = weightTotals > 0

# Start from the unweighted mean (the fallback), then overwrite the rows that
# have a usable weight total with the weighted average.
predictions = neighborResponses.mean(axis=1).astype(float)
weightedSums = (neighborResponses * weights).sum(axis=1)
predictions[hasWeight] = weightedSums[hasWeight] / weightTotals[hasWeight]

In [None]:
# Round the neighbour-based predictions to whole numbers, then print their
# quadratic weighted kappa against the training responses and the smallest
# predicted value.
predictions = np.rint(predictions)
print(quadratic_weighted_kappa.quadratic_weighted_kappa(predictions, dfTrain.Response))
print(np.min(predictions))

In [None]:
import scipy.stats as stats

# --- BMI band: one standard deviation either side of the mean -----------------
bmiSeries = dfTrain['BMI']
mean = np.mean(bmiSeries)
bmiStd = np.std(bmiSeries)
lowerBound = mean - bmiStd
upperBound = mean + bmiStd
# The test result is not stored or displayed (this is not the cell's last expression).
stats.normaltest(bmiSeries)

# Distance of each BMI value outside the [lowerBound, upperBound] band; 0 inside it.
bmiScore = bmiSeries.apply(lambda x: max(0, lowerBound - x) if x < mean else max(0, x - upperBound))

# --- Degree-2 polynomial regression on Ins_Age, BMI and Wt --------------------
model = LinearRegression(normalize=True)
features = ['Ins_Age', 'Wt', 'BMI']

insAge = dfTrain['Ins_Age'].values
bmi = dfTrain['BMI'].values
wt = dfTrain['Wt'].values

# The design matrix is built from the data, so its row count always matches
# dfTrain (the row count used to be hard-coded as 59381, and the matrix was
# allocated uninitialised). The last column used to be Wt * Ins_Age, an exact
# duplicate of column 3; it is now BMI * BMI, the one term that was missing
# from the full degree-2 expansion of the three inputs.
X = np.column_stack([
    insAge,            # 0
    bmi,               # 1
    wt,                # 2
    insAge * wt,       # 3
    insAge * bmi,      # 4
    insAge * insAge,   # 5
    wt * wt,           # 6
    wt * bmi,          # 7
    bmi * bmi,         # 8
]).astype(float)

model.fit(X, dfTrain['Response'])

In [None]:
class CutPointOptimizer:
    """Objective function for tuning the cut points that bucket predictions.

    Holds a set of predicted values and the matching actual labels. `qwk`
    turns the predictions into integer labels using a set of cut points and
    returns the negated quadratic weighted kappa (negated presumably so that a
    minimiser can be used to maximise kappa).
    """

    def __init__(self, predicted, actual):
        """Keep the predictions and the labels they are scored against."""
        self.predicted = predicted
        self.actual = actual

    def qwk(self, cutPoints):
        """Return -kappa for the labels obtained by bucketing with cutPoints.

        A prediction falling into bucket k (0-based, counted from the lowest
        cut point) is mapped to label k + 1.
        """
        # np.searchsorted requires its first argument in ascending order. Cut
        # points proposed by an optimiser may be out of order, so sort them;
        # already-sorted input is unaffected.
        orderedCutPoints = np.sort(np.asarray(cutPoints))
        transformedPredictions = np.searchsorted(orderedCutPoints, self.predicted) + 1
        return -1 * quadratic_weighted_kappa.quadratic_weighted_kappa(transformedPredictions, self.actual)

# Starting cut points: the midpoints between the consecutive integers 1..8.
initialCutPoints = np.array([1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5])

In [None]:

# Columns listed by the author (presumably the candidates to assign to `var`):
# Product_Info_2
# Product_Info_3
# Employment_Info_2
# InsuredInfo_3
# Medical_History_2
# Medical_History_10

# For each distinct value of the chosen column, print the value, the number of
# training rows with that value, and the mean Response of those rows.
var = 'Product_Info_3'
for level in dfTrain[var].unique():
    matching = dfTrain.loc[dfTrain[var] == level]
    print('%s, %s, %s' % (level, len(matching), np.mean(matching.Response)))