In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import tensorflow as tf
import gc
from operator import add
from tensorflow import feature_column
from sklearn.model_selection import train_test_split
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Dense, DenseFeatures, Input, BatchNormalization
from sklearn.model_selection import GroupKFold
from sklearn.metrics import confusion_matrix
from tensorflow.keras.metrics import Metric
import tensorflow.keras.backend as K

In [None]:
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [None]:
def prepare_train_data(train_dataframe: pd.DataFrame, labels: pd.DataFrame) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Collapse the raw event log into one labelled feature row per game session.

    Only events whose ``game_session`` appears in ``labels`` are used. For each
    such session the row holds the values of its last event, one count column
    per ``event_code`` (column label is the code itself), and the label columns.

    Args:
        train_dataframe: Raw event log. It is NOT modified; the filtered rows
            are taken as a new frame.
        labels: One row per labelled ``game_session``.

    Returns:
        A pair ``(features, install_ids)``:
        ``features`` is the merged table without the identifier, raw-event and
        label-leaking columns (``accuracy_group`` is kept as the target);
        ``install_ids`` is the same merged table with every column retained,
        row-aligned with ``features``.
    """
    labelled_sessions = labels['game_session'].unique()
    events = train_dataframe[train_dataframe['game_session'].isin(labelled_sessions)]

    last_event = events.groupby('game_session').last()
    # Do not pass as_index=False here: with it, newer pandas returns size() as a
    # flat DataFrame and unstack() no longer produces the session x code table.
    event_code_counts = (
        events.groupby(['game_session', 'event_code'])
        .size()
        .unstack(fill_value=0)
    )

    merged = pd.merge(last_event, event_code_counts, on='game_session', how='left')
    # Columns present on both sides (installation_id, title) get _x / _y suffixes.
    merged = pd.merge(merged, labels, on='game_session', how='left')

    features = merged.drop(
        columns=['game_session', 'event_id', 'timestamp', 'event_data',
                 'installation_id_x', 'installation_id_y', 'title_y',
                 'num_correct', 'num_incorrect', 'accuracy'])
    return features, merged

In [None]:
def prepare_features(dataframe: pd.DataFrame) -> []:
    """Build the TensorFlow feature-column definitions for ``dataframe``.

    Every numeric column becomes a numeric feature column. The ``title_x``,
    ``type`` and ``world`` columns become one-hot (indicator) columns whose
    vocabulary is the set of distinct values found in ``dataframe``.

    Returns:
        A list with the numeric columns first, followed by the three
        indicator columns in the order ``title_x``, ``type``, ``world``.
    """
    numeric_columns = [
        feature_column.numeric_column(column_name)
        for column_name in dataframe.select_dtypes('number')
    ]

    one_hot_columns = []
    for column_name in ('title_x', 'type', 'world'):
        vocabulary = dataframe[column_name].unique()
        categorical = feature_column.categorical_column_with_vocabulary_list(
            column_name, vocabulary)
        one_hot_columns.append(feature_column.indicator_column(categorical))

    return numeric_columns + one_hot_columns

In [None]:
def prepare_test_data(train_dataframe: pd.DataFrame) -> pd.DataFrame:
    """Collapse a raw event log into one feature row per game session.

    Each row holds the values of the session's last event plus one count
    column per ``event_code`` (column label is the code itself). The
    ``event_id``, ``timestamp`` and ``event_data`` columns are removed.

    Args:
        train_dataframe: Raw event log (despite the name, this is what the
            notebook passes the test events through). It is not modified.

    Returns:
        A frame indexed by ``game_session``.
    """
    last_event = train_dataframe.groupby('game_session').last()
    # Do not pass as_index=False here: with it, newer pandas returns size() as a
    # flat DataFrame and unstack() no longer produces the session x code table.
    event_code_counts = (
        train_dataframe.groupby(['game_session', 'event_code'])
        .size()
        .unstack(fill_value=0)
    )
    merged = pd.merge(last_event, event_code_counts, on='game_session', how='left')
    return merged.drop(columns=['event_id', 'timestamp', 'event_data'])



In [None]:
def create_sequential_model(feature_layer) -> Sequential:
    """Assemble the classifier: features -> 2 x (BatchNorm, Dense 64 relu) -> BatchNorm -> softmax over 4 classes.

    Args:
        feature_layer: Layer placed first in the stack; it receives the raw
            feature inputs.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    layer_stack = [feature_layer, BatchNormalization()]
    for units in (64, 64):
        layer_stack.append(Dense(units, activation='relu'))
        layer_stack.append(BatchNormalization())
    layer_stack.append(Dense(4, activation='softmax'))
    return tf.keras.Sequential(layer_stack)

In [None]:
def create_dataset(dataframe: pd.DataFrame, is_train: bool, batch_size=32, shuffle=True):
    """Turn a labelled feature frame into a batched ``tf.data.Dataset``.

    The ``accuracy_group`` column is one-hot encoded over 4 classes and used
    as the target; every other column is passed as a named feature.

    Args:
        dataframe: Feature frame containing an ``accuracy_group`` column.
            It is left unmodified.
        is_train: Unused; kept for call compatibility.
        batch_size: Number of rows per batch.
        shuffle: When true, shuffle with a buffer covering the whole frame.

    Returns:
        A dataset yielding ``(features_dict, one_hot_labels)`` batches.
    """
    # Read the label column instead of pop()-ing it, so the caller's frame
    # keeps its target column and can be reused.
    labels = tf.keras.utils.to_categorical(dataframe['accuracy_group'], num_classes=4)
    features = dataframe.drop(columns=['accuracy_group'])
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))

    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(features))

    return dataset.batch(batch_size)

In [None]:
def create_test_dataset(dataframe: pd.DataFrame, is_train: bool, batch_size=32, shuffle=True):
    """Turn an unlabelled feature frame into a batched ``tf.data.Dataset``.

    Every column of ``dataframe`` is passed as a named feature; rows keep
    their order. ``is_train`` and ``shuffle`` are accepted but not used.
    """
    feature_tensors = dict(dataframe)
    return tf.data.Dataset.from_tensor_slices(feature_tensors).batch(batch_size)

In [None]:
def quadratic_kappa(actuals, preds, N=4):
    """Quadratic weighted kappa between actual and predicted integer ratings.

    Args:
        actuals: Iterable of true ratings, each an integer in ``[0, N)``.
        preds: Iterable of predicted ratings, same length and range.
        N: Number of rating classes.

    Returns:
        ``1 - weighted_observed / weighted_expected`` as a float: 1.0 for
        perfect agreement, about 0 for chance-level agreement. NaN when the
        score is undefined (no samples, or every actual and predicted rating
        is the same single class).
    """
    actuals = np.asarray(actuals, dtype=int)
    preds = np.asarray(preds, dtype=int)

    # Quadratic disagreement weights, 0 on the diagonal and 1 in the corners.
    ratings = np.arange(N)
    w = (ratings[:, None] - ratings[None, :]) ** 2 / (N - 1) ** 2

    # Always build a full N x N table. A confusion matrix derived from the
    # labels that happen to be present shrinks when a class is missing from
    # both inputs and then no longer lines up with `w`.
    O = np.zeros((N, N))
    np.add.at(O, (actuals, preds), 1)

    total = O.sum()
    if total == 0:
        return float('nan')

    # Expected table under independence of the two marginal histograms.
    act_hist = O.sum(axis=1)
    pred_hist = O.sum(axis=0)
    E = np.outer(act_hist, pred_hist)

    num = (w * O).sum() / total
    den = (w * E).sum() / E.sum()
    if den == 0:
        return float('nan')
    return float(1 - (num / den))

In [None]:
class CohenKappa(Metric):
    """
    Streaming Cohen's kappa: accumulates a confusion matrix across batches and
    derives the (optionally linearly or quadratically weighted) kappa from it.

    This metric is copied from TensorFlow Addons
    """
    def __init__(self,
                 num_classes,
                 name='cohen_kappa',
                 weightage=None,
                 dtype=tf.float32):
        super(CohenKappa, self).__init__(name=name, dtype=dtype)

        # Reject unsupported weighting schemes up front.
        if weightage not in (None, 'linear', 'quadratic'):
            raise ValueError("Unknown kappa weighting type.")

        self.weightage = weightage
        self.num_classes = num_classes
        # Running confusion matrix: rows are true classes, columns predictions.
        self.conf_mtx = self.add_weight(
            'conf_mtx',
            shape=(self.num_classes, self.num_classes),
            initializer=tf.keras.initializers.zeros,
            dtype=tf.int32)

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Add one batch to the running confusion matrix.

        Rank-2 inputs (one-hot labels / class scores) are reduced to class
        indices with argmax over axis 1; other ranks are used as given.
        """
        true_ids = tf.argmax(y_true, axis=1) if len(y_true.shape) == 2 else y_true
        pred_ids = tf.argmax(y_pred, axis=1) if len(y_pred.shape) == 2 else y_pred

        true_ids = tf.cast(true_ids, dtype=tf.int32)
        pred_ids = tf.cast(pred_ids, dtype=tf.int32)

        if true_ids.shape.as_list() != pred_ids.shape.as_list():
            raise ValueError(
                "Number of samples in y_true and y_pred are different")

        batch_conf_mtx = tf.math.confusion_matrix(
            labels=true_ids,
            predictions=pred_ids,
            num_classes=self.num_classes,
            weights=sample_weight)

        return self.conf_mtx.assign_add(batch_conf_mtx)

    def result(self):
        """Compute kappa from everything accumulated so far."""
        n_classes = tf.shape(self.conf_mtx)[0]
        ones = tf.ones([n_classes, n_classes], dtype=tf.int32)

        # Disagreement weights: 0 on the diagonal, positive elsewhere.
        if self.weightage is None:
            zero_diagonal = tf.zeros([n_classes], dtype=tf.int32)
            weights = tf.linalg.set_diag(ones, diagonal=zero_diagonal)
            weights = tf.cast(weights, dtype=tf.float32)
        else:
            # Each row becomes [1, 2, ..., n]; subtracting the transpose gives
            # the signed class distance (column index minus row index).
            grid = ones + tf.range(n_classes, dtype=tf.int32)
            grid = tf.cast(grid, dtype=tf.float32)
            distance = grid - tf.transpose(grid)

            if self.weightage == 'linear':
                weights = tf.abs(distance)
            else:
                weights = tf.pow(distance, 2)
            weights = tf.cast(weights, dtype=tf.float32)

        # Marginal counts of true (rows) and predicted (columns) classes.
        actual_hist = tf.reduce_sum(self.conf_mtx, axis=1)
        pred_hist = tf.reduce_sum(self.conf_mtx, axis=0)

        # Outer product of the marginals. It is indexed [pred, actual], i.e.
        # transposed relative to conf_mtx; the weights are symmetric, so the
        # weighted sum below is unaffected.
        expected = pred_hist[..., None] * actual_hist[None, ...]

        # Normalise both tables so each sums to 1.
        observed = self.conf_mtx / tf.reduce_sum(self.conf_mtx)
        expected = expected / tf.reduce_sum(expected)

        observed = tf.cast(observed, dtype=tf.float32)
        expected = tf.cast(expected, dtype=tf.float32)

        weighted_observed = tf.reduce_sum(observed * weights)
        weighted_expected = tf.reduce_sum(expected * weights)
        return 1 - (weighted_observed / weighted_expected)

    def get_config(self):
        """Returns the serializable config of the metric."""
        base_config = super(CohenKappa, self).get_config()
        return {
            **base_config,
            "num_classes": self.num_classes,
            "weightage": self.weightage,
        }

    def reset_states(self):
        """Resets all of the metric state variables."""
        zeros = np.zeros((self.num_classes, self.num_classes), np.int32)
        for variable in self.variables:
            K.set_value(variable, zeros)

In [None]:
# Directory (relative to the notebook's working directory, with trailing slash)
# that the competition CSV files are read from; file names are appended to it.
INPUT_PATH = '../../kaggle/input/data-science-bowl-2019/'

In [None]:
# Raw training events and the per-session labels.
train = pd.read_csv(f'{INPUT_PATH}train.csv')
labels = pd.read_csv(f'{INPUT_PATH}train_labels.csv')

In [None]:
# Replace the raw event log with the per-session training table.
train, install_ids = prepare_train_data(train, labels)
# Event-code count columns have integer labels; make every label a string.
train.columns = train.columns.astype(str)
# Feature definitions are derived from everything except the target column.
feature_columns = prepare_features(train.drop(columns='accuracy_group'))
gc.collect()

In [None]:
# Input layer built from the feature-column definitions; the same instance is
# passed to every fold's model in the cross-validation loop.
feature_layer = DenseFeatures(feature_columns)

In [None]:
# 5-fold splitter that takes group labels at split time.
gkf = GroupKFold(n_splits=5)

In [None]:
# Filled by the cross-validation loop: one trained model and one validation
# kappa score per fold.
models = []
kappas = []

In [None]:
# Display the prepared training table.
train

In [None]:
# Grouped cross-validation: all sessions of one installation stay in the same
# fold. Each fold trains a fresh model and records its validation kappa.
for train_idx, val_idx in gkf.split(train, groups=install_ids.installation_id_x):
    # create_dataset is handed copies of the fold rows.
    train_dataset = create_dataset(
        train.iloc[train_idx].copy(), is_train=True, batch_size=1024, shuffle=True)
    val_dataset = create_dataset(
        train.iloc[val_idx].copy(), is_train=True, batch_size=1024, shuffle=False)

    model = create_sequential_model(feature_layer)
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=[CohenKappa(num_classes=4, weightage='quadratic')],
    )
    model.fit(train_dataset, validation_data=val_dataset, epochs=150, verbose=2)
    models.append(model)

    # val_dataset is not shuffled, so predictions line up with the rows of
    # install_ids.iloc[val_idx].
    y_pred = model.predict(val_dataset)
    # Score a single session per installation.
    # NOTE(review): .last() follows row order, which presumably is game_session
    # order rather than chronological order - confirm this is intended.
    y_valid = (
        install_ids.iloc[val_idx]
        .assign(preds=y_pred.argmax(axis=1))
        .groupby('installation_id_x')
        .last()
    )
    kappa = quadratic_kappa(y_valid.accuracy_group, y_valid.preds)
    kappas.append(kappa)
        

In [None]:
# Debug dump: training rows of the final fold (train_idx is whatever the
# cross-validation loop left behind on its last iteration).
train.iloc[train_idx].to_csv('group_by_inst_id.csv')

In [None]:
# Debug dump: validation rows of the final fold (val_idx is left over from the
# last iteration of the cross-validation loop).
train.iloc[val_idx].to_csv('val_group_by_inst_id.csv')

In [None]:
# Validation kappa of every fold, in fold order.
print(kappas)

In [None]:
# Kappa of the final fold only (y_valid is left over from the last loop iteration).
quadratic_kappa(y_valid.accuracy_group, y_valid.preds)

In [None]:
# Share of each predicted class among the final fold's validation predictions.
print(f'predicted accuracy_group distribution:\n\n{pd.Series(y_valid.preds).value_counts(normalize=True)} \n\n')

In [None]:
# Load the test events and keep only those whose type is 'Assessment'.
test = pd.read_csv(f'{INPUT_PATH}test.csv')
is_assessment = test['type'] == 'Assessment'
test = test[is_assessment]

In [None]:
# Collapse the test events to one row per game_session, then make every column
# label a string.
test = prepare_test_data(test)
test.columns = test.columns.astype(str)

In [None]:
# Sanity check: distinct values remaining in the 'type' column.
test.type.unique()

In [None]:
# Turn the index (presumably game_session, which the next cell drops) into a
# regular column and renumber the rows from 0.
test = test.reset_index()

In [None]:
# The session id is an identifier, not a model input.
test = test.drop(columns=['game_session'])

In [None]:
# The training table calls this column 'title_x'; use the same name for test.
test = test.rename(columns={'title': 'title_x'})
test_dataset = create_test_dataset(test, is_train=True, batch_size=1024, shuffle=False)

In [None]:
# Sum the class probabilities predicted by every fold model; preds stays None
# when there are no models.
preds = None
for fold_model in models:
    fold_preds = fold_model.predict(test_dataset)
    preds = fold_preds if preds is None else preds + fold_preds

In [None]:
# Share of each predicted class over all test rows (one row per session).
print(f'predicted accuracy_group distribution:\n\n{pd.Series(preds.argmax(axis=1)).value_counts(normalize=True)} \n\n')

In [None]:
# `preds` has one row per test session, in the row order of `test`. Keep the
# installation id next to each prediction so submission rows are filled by key
# instead of by row position.
results = test[['installation_id']].copy()
results['target'] = preds.argmax(axis=1)

# One prediction per installation.
# NOTE(review): .last() follows the row order of `test`, which presumably is
# game_session order rather than chronological order - confirm that this picks
# the assessment that should be scored.
per_installation = results.groupby('installation_id')['target'].last()

# Assumes sample_submission.csv has an installation_id column - TODO confirm.
submission = pd.read_csv(INPUT_PATH+'sample_submission.csv')
submission['accuracy_group'] = submission['installation_id'].map(per_installation)

missing = submission['accuracy_group'].isna()
assert not missing.any(), f'{missing.sum()} installation_id(s) have no prediction'

submission['accuracy_group'] = submission['accuracy_group'].astype(int)
submission.to_csv('submission.csv', index=False)

In [None]:
# Share of each class in the values written to the submission file.
print(f'predicted accuracy_group distribution:\n\n{pd.Series(submission.accuracy_group).value_counts(normalize=True)} \n\n')

In [None]:
# Peek at the predicted class of the first ten test rows.
preds.argmax(axis=1)[:10]