In [1]:
import numpy as np
import pandas as pd
import os
import scipy as sp
from functools import partial
from sklearn import metrics
from collections import Counter
import json


class OptimizedRounder:
    """Learn cut points that turn continuous regression outputs into classes 0-3.

    ``fit`` searches (Nelder-Mead) for three thresholds that maximise the
    quadratic weighted Cohen's kappa between the bucketed predictions and the
    true labels. ``predict`` applies a set of thresholds to new predictions.
    """

    def __init__(self):
        # Placeholder; ``fit`` replaces it with the scipy optimisation result.
        self.coef_ = 0

    @staticmethod
    def _apply_thresholds(X, coef):
        """Bucket the values of ``X`` into classes 0-3 using three cut points.

        The conditions are evaluated in order and the first match wins, which
        is exactly the original if/elif chain; this matters because the
        optimiser is free to propose thresholds that are not sorted.

        Args:
            X: 1-D array-like of continuous predictions. It is not modified.
            coef: sequence whose first three items are the cut points.

        Returns:
            A new ``numpy.ndarray`` with the dtype of ``np.asarray(X)`` whose
            entries are 0, 1, 2 or 3.
        """
        preds = np.asarray(X)
        conditions = [
            preds < coef[0],
            (preds >= coef[0]) & (preds < coef[1]),
            (preds >= coef[1]) & (preds < coef[2]),
        ]
        # Anything matching none of the conditions (including NaN) gets class 3.
        labels = np.select(conditions, [0, 1, 2], default=3)
        # Keep the input dtype, as the original copy-and-overwrite loop did.
        return labels.astype(preds.dtype)

    def _kappa_loss(self, coef, X, y):
        """Return the negated quadratic weighted kappa for thresholds ``coef``.

        Negated because the optimiser minimises while kappa should be maximised.
        """
        labels = self._apply_thresholds(X, coef)
        return -metrics.cohen_kappa_score(y, labels, weights='quadratic')

    def fit(self, X, y):
        """Search for the thresholds that maximise kappa of ``X`` against ``y``.

        Args:
            X: 1-D array-like of continuous predictions.
            y: true class labels aligned with ``X``.

        The optimisation result is stored in ``self.coef_``; nothing is returned.
        """
        # Import the submodule explicitly: ``import scipy as sp`` alone does not
        # guarantee that ``sp.optimize`` is loaded on every SciPy release.
        from scipy.optimize import minimize

        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5]
        self.coef_ = minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef=None):
        """Bucket ``X`` into classes 0-3.

        Args:
            X: 1-D array-like of continuous predictions.
            coef: three cut points. When omitted, the thresholds found by
                ``fit`` are used.

        Returns:
            A new ``numpy.ndarray`` of class values with the dtype of ``X``.
        """
        if coef is None:
            coef = self.coefficients()
        return self._apply_thresholds(X, coef)

    def coefficients(self):
        """Return the fitted thresholds (the ``'x'`` entry of the optimisation result)."""
        return self.coef_['x']

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

from data_science_bowl_2019_utility_script import *

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import cohen_kappa_score
from xgboost import XGBRegressor


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import warnings

# Print the full path of every file found under the Kaggle input directory.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# Silence all Python warnings for the remainder of the session.
warnings.filterwarnings('ignore')
# Any results you write to the current directory are saved as output.

/kaggle/input/data-science-bowl-2019-compiling-data/__output__.json
/kaggle/input/data-science-bowl-2019-compiling-data/test_df_compiled.csv
/kaggle/input/data-science-bowl-2019-compiling-data/train_df_compiled.csv
/kaggle/input/data-science-bowl-2019-compiling-data/custom.css
/kaggle/input/data-science-bowl-2019-compiling-data/__results__.html
/kaggle/input/data-science-bowl-2019-compiling-data/__notebook__.ipynb
/kaggle/input/data-science-bowl-2019/sample_submission.csv
/kaggle/input/data-science-bowl-2019/specs.csv
/kaggle/input/data-science-bowl-2019/train_labels.csv
/kaggle/input/data-science-bowl-2019/train.csv
/kaggle/input/data-science-bowl-2019/test.csv
/kaggle/input/data-science-bowl-2019-compiling-data/__output__.json
/kaggle/input/data-science-bowl-2019-compiling-data/test_df_compiled.csv
/kaggle/input/data-science-bowl-2019-compiling-data/train_df_compiled.csv
/kaggle/input/data-science-bowl-2019-compiling-data/custom.css
/kaggle/input/data-science-bowl-2019-compiling-data

# Reading Data

In [3]:
# Load the five tables (train, test, train labels, specs, sample submission) in one call.
# NOTE(review): read_data is not defined in this notebook; presumably it comes from the
# star-imported data_science_bowl_2019_utility_script — confirm.
train_df, test_df, train_labels_df, specs_df, sample_submission_df = read_data()

Read data
train shape: (11341042, 11)
test shape: (1156414, 11)
train labels shape: (17690, 7)
specs shape: (386, 3)
sample submission shape: (1000, 2)


# Compiling data

In [4]:
# Turn the raw train/test frames into the tables used for modeling.
# NOTE(review): compile_data is not defined in this notebook (presumably from the
# star-imported utility script); the second argument looks like a mode switch — confirm.
# The 'train' result carries the 'target_accuracy' and 'target_accuracy_group' columns
# that the following cell splits off.
X = compile_data(train_df, 'train')
X_test = compile_data(test_df, 'test')

In [5]:
# The model target is the accuracy group, not the raw accuracy.
y = X['target_accuracy_group']
# Remove both target-derived columns from the features so they cannot leak into the model.
# Rebinding X (instead of inplace=True) leaves the frame returned by compile_data untouched.
X = X.drop(columns=['target_accuracy', 'target_accuracy_group'])

# Splitting Data

In [6]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    random_state=1992,
    test_size=0.2,
)

# Modeling

In [7]:
# Gradient-boosted regressor; its continuous output is bucketed into accuracy groups later.
# Every value is pinned explicitly so results do not drift when xgboost changes its defaults.
# The deprecated/removed arguments that were passed as None (silent, nthread, seed, missing)
# are dropped: None was already their default on older releases, and newer releases warn
# about or reject them. n_jobs / random_state / verbosity are their replacements.
xgb_model = XGBRegressor(
    objective='reg:squarederror',
    booster='gbtree',
    n_estimators=75,
    learning_rate=0.1,
    max_depth=3,
    min_child_weight=1,
    max_delta_step=0,
    gamma=0,
    subsample=1,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=1,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    base_score=0.5,
    importance_type='gain',
    n_jobs=1,
    random_state=0,
    verbosity=1,
)

In [8]:
# Fit on the training split only, then predict the held-out validation rows.
# These continuous regression outputs are what the threshold search below is tuned on.
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_valid)

In [9]:
# Search for the three cut points that map the continuous validation predictions to
# classes 0-3 with the best quadratic weighted kappa against y_valid.
# NOTE(review): OptimizedRounder is defined in the first cell, before the star import;
# a same-named class in the utility script would shadow it — confirm.
xgb_optR = OptimizedRounder()
xgb_optR.fit(xgb_predictions, y_valid)
# Keep the fitted thresholds; they are passed back to predict() for the test set.
xgb_coefficients = xgb_optR.coefficients()

In [10]:
# Refit on all labelled rows (training + validation) before predicting the test set.
# NOTE: this overwrites xgb_predictions — from here on it holds test-set predictions,
# not the validation predictions the thresholds were tuned on.
xgb_model.fit(X, y)
xgb_predictions = xgb_model.predict(X_test)

In [11]:
# Bucket the continuous predictions with the fitted thresholds and cast to integer labels.
# NOTE: despite the name, xgb_valid_predictions holds *test-set* labels at this point,
# because xgb_predictions was overwritten with the X_test predictions.
xgb_valid_predictions = xgb_optR.predict(xgb_predictions, xgb_coefficients).astype(int)

# Submission

In [12]:
# Build the submission dataframe: one predicted accuracy group per entry of X_test's index.
# NOTE(review): assumes X_test is indexed by installation_id — confirm against compile_data.
output = pd.DataFrame(
    data={
        'installation_id': X_test.index,
        'accuracy_group': xgb_valid_predictions,
    },
)
# Write the dataframe to the output file, leaving out the positional row index.
output.to_csv(path_or_buf='submission.csv', index=False)