# In [None]:
import string
import re
import glob
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import *
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, TensorBoard, EarlyStopping, ReduceLROnPlateau
from keras.layers.advanced_activations import PReLU
from keras.metrics import top_k_categorical_accuracy
from keras.models import Sequential, Model, load_model
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import hamming_loss, f1_score
from keras.optimizers import SGD,RMSprop,Adam
import tensorflow as tf
import sys
from sklearn import preprocessing
from sklearn.metrics import label_ranking_average_precision_score, coverage_error, label_ranking_loss
import pickle
import json
import csv
from tqdm import tqdm
from keras.losses import binary_crossentropy,mean_squared_error
from keras import backend as K


# Seed the NumPy and TensorFlow random number generators with fixed values.
np.random.seed(31)
tf.set_random_seed(17)

# Path prefixes prepended to the data / embedding file names below.
CLEAN_DF_BASE = ''
EMB_BASE = ''

# Train and test tables; both are later read through their 'texts' column.
df = pd.read_csv( CLEAN_DF_BASE + 'cleaned_train_punc_brute.csv')
df_test = pd.read_csv( CLEAN_DF_BASE + 'cleaned_test_punc_brute.csv')

MAX_NB_WORDS = 100000  # number of rows of the Embedding layer built in get_model
MAX_SEQUENCE_LENGTH = 400  # every encoded text is padded/truncated to this length
VALIDATION_SAMPLES = 5000  # passed as test_size to train_test_split
EMBEDDING_DIM = 300  # embedding width used when no pretrained matrix is given
BATCH_SIZE = 512  # batch size of the training generator
MAX_TAGS = 512  # number of most frequent tags kept as labels

# JSON mapping that the encoding code queries with lower-cased words
# (word_index.get(word.lower())) to obtain the integer fed to the model.
with open(EMB_BASE + 'word_index_punc_brute','r') as f:
    word_index = json.load(f)
print(word_index.get("python"))

def read_label(raw_data):
    """Return, for every row, the names of the tag columns whose value is 1.

    A tag column is any column whose name contains the substring ``'Tag_'``.

    Args:
        raw_data: DataFrame holding one indicator column per tag.

    Returns:
        A list with one entry per row of ``raw_data`` (in row order); each
        entry is the list of tag column names, in column order, whose cell
        equals 1 in that row.
    """
    tag_cols = [col for col in raw_data.columns if 'Tag_' in col]
    # One vectorized comparison instead of a Python-level .loc lookup per
    # cell; this also works when the index contains duplicate labels.
    hits = (raw_data[tag_cols] == 1).values
    return [[col for col, hit in zip(tag_cols, row) if hit] for row in hits]

# For every training row, the names of the tag columns that are set.
label_list = read_label(df)
df['tagslist'] = label_list

# Number of rows carrying each tag.
label_counts = {}
for tags in df['tagslist']:
    for tag in tags:
        label_counts[tag] = label_counts.get(tag, 0) + 1

# Tags from most to least frequent; only the MAX_TAGS most frequent are kept.
all_sorted = sorted(label_counts.items(), key=lambda x: x[1], reverse=True)
topk = dict(all_sorted[:MAX_TAGS])

# Share of all tag occurrences that the kept tags account for
# (0.0 rather than a ZeroDivisionError when there are no tags at all).
total_all = sum(label_counts.values())
total_k = sum(topk.values())
topk_coverage = total_k / total_all if total_all else 0.0

def topkTags(row):
    """Return the tags of ``row`` that belong to the module-level ``topk``.

    Args:
        row: object with a ``tagslist`` attribute holding an iterable of tags
            (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``).

    Returns:
        A list of the distinct tags found in ``topk``, in order of first
        appearance. Order is kept (instead of going through ``set``) so the
        result does not depend on string hash randomization between runs.
    """
    kept = []
    seen = set()
    for tag in row.tagslist:
        if tag in topk and tag not in seen:
            seen.add(tag)
            kept.append(tag)
    return kept
    
# Restrict every row's tags to the ones kept in `topk`.
df['new_tagslist'] = df.apply(topkTags, axis=1)

# Give each kept tag a column index, numbered in order of first appearance.
label_index = {}
for tags in df['new_tagslist'].values:
    for tag in tags:
        if tag not in label_index:
            label_index[tag] = len(label_index)

# Persist the tag -> column mapping next to the script.
with open('label_index1','w') as f:
    json.dump(label_index,f)

def _encode_text(text, vocab, sequence_length):
    """Map whitespace tokens to vocabulary ids, truncated/zero-padded to length."""
    ids = []
    for word in text.split():
        idx = vocab.get(word.lower())
        # Unknown words (and id 0, which is the padding value) are skipped.
        if idx:
            ids.append(idx)
    ids = ids[:sequence_length]
    ids.extend([0] * (sequence_length - len(ids)))
    return ids


def _encode_tags(tags, labels, n_labels):
    """Build a multi-hot vector with a 1 at the column of every known tag."""
    vec = np.zeros(n_labels)
    for tag in tags:
        idx = labels.get(tag)
        if idx is None:
            # Fallback kept for mappings keyed by lower-cased tag names.
            idx = labels.get(tag.lower())
        # Tags without a column are skipped. Indexing with None would act as
        # np.newaxis and silently modify every label at once.
        if idx is not None:
            vec[idx] = 1
    return vec


def datagen(df, batch_size=128, sequence_length=100, vocab=None, labels=None):
    """Yield ``(x, y)`` training batches forever, cycling over ``df``.

    Rows are consumed in order; a partially filled batch is carried over
    into the next pass, so every yielded batch has exactly ``batch_size``
    rows and no row is dropped.

    Args:
        df: DataFrame with a ``'texts'`` column (strings) and a
            ``'new_tagslist'`` column (iterables of tag names).
        batch_size: number of rows per yielded batch; must be >= 1.
        sequence_length: length every encoded text is truncated/padded to.
        vocab: word -> id mapping; defaults to the module-level ``word_index``.
        labels: tag -> column mapping; defaults to the module-level
            ``label_index``.

    Yields:
        Tuple ``(x, y)``: ``x`` is an array of shape
        ``(batch_size, sequence_length)`` with word ids (0 = padding) and
        ``y`` an array of shape ``(batch_size, len(labels))`` with 0/1 entries.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size has to be at least 1.')
    vocab = word_index if vocab is None else vocab
    labels = label_index if labels is None else labels

    # Positional access: independent of the DataFrame's index labels.
    texts = df['texts'].tolist()
    tag_lists = df['new_tagslist'].tolist()
    if not texts:
        # Nothing to cycle over; end the generator instead of spinning forever.
        return

    n_labels = len(labels)
    x, y = [], []
    while True:
        for text, tags in zip(texts, tag_lists):
            x.append(_encode_text(text, vocab, sequence_length))
            y.append(_encode_tags(tags, labels, n_labels))
            if len(x) == batch_size:
                yield np.array(x), np.array(y)
                x, y = [], []

train_df, val_df = train_test_split(df, test_size=VALIDATION_SAMPLES, random_state=31)
train_gen = datagen(train_df.reset_index(drop=True), batch_size=BATCH_SIZE, sequence_length=MAX_SEQUENCE_LENGTH)

# A single batch as large as the validation split materializes it in one draw.
valid_gen = datagen(val_df.reset_index(drop=True), batch_size=len(val_df), sequence_length=MAX_SEQUENCE_LENGTH)
valid_x, valid_y = next(valid_gen)
valid_x = np.array(valid_x)
valid_y = np.array(valid_y)

# Encode the test texts the same way as the training texts: known words are
# mapped to their ids, then the sequence is truncated / zero-padded.
test_x = []
for text in tqdm(df_test['texts']):
    seq = []
    for word in text.split():
        idx = word_index.get(word.lower())
        if idx:
            seq.append(idx)
    seq = seq[:MAX_SEQUENCE_LENGTH]
    seq.extend([0] * (MAX_SEQUENCE_LENGTH - len(seq)))
    test_x.append(seq)
test_x = np.array(test_x)

from keras.engine.topology import Layer
#https://www.kaggle.com/qqgeogor/keras-lstm-attention-glove840b-lb-0-043
class Attention(Layer):
    """Attention pooling over the step axis of a rank-3 input.

    For an input of shape ``(batch, step_dim, features)`` the layer scores
    every step with a learned vector ``W`` (plus an optional per-step bias),
    squashes the scores with ``tanh``, normalizes their exponentials over the
    step axis and returns the weighted sum of the steps, i.e. a tensor of
    shape ``(batch, features)``.

    Args:
        step_dim: number of steps of the input; must equal ``input_shape[1]``
            because the scores are reshaped to ``(-1, step_dim)``.
        W_regularizer, b_regularizer: regularizers for the weight / bias.
        W_constraint, b_constraint: constraints for the weight / bias.
        bias: whether to add a learned bias of shape ``(input_shape[1],)``.
        **kwargs: forwarded to ``Layer.__init__``.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Set to the real feature size in build().
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the scoring vector ``W`` and, if enabled, the bias ``b``."""
        if len(input_shape) != 3:
            raise ValueError('Attention expects a rank-3 input, got shape '
                             '{}.'.format(input_shape))

        # Keyword arguments instead of the legacy positional-shape call.
        self.W = self.add_weight(name='{}_W'.format(self.name),
                                 shape=(input_shape[-1],),
                                 initializer=self.init,
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(name='{}_b'.format(self.name),
                                     shape=(input_shape[1],),
                                     initializer='zero',
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return ``None``: no mask is propagated past this layer."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over the step axis."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Score every step: (batch * steps, features) . (features, 1),
        # reshaped back to (batch, steps).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # Zero the weights of masked steps before normalizing.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # Normalize over the step axis; epsilon guards an all-zero row.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Return ``(batch, features)``."""
        return input_shape[0],  self.features_dim

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

#https://github.com/arthurdouillard/keras-snapshot_ensembles/blob/master/snapshot.py
import math
import os
import glob

from keras import backend as K
from keras.layers import *
from keras.models import Model
from keras.callbacks import Callback

class Snapshot(Callback):
    """Snapshot-ensemble callback: cyclic cosine learning rate + checkpoints.

    Training is split into ``nb_cycles`` cycles of
    ``nb_epochs // nb_cycles`` epochs. Within a cycle the learning rate
    follows half a cosine starting at the optimizer's initial rate; when a
    cycle ends the weights are written to
    ``<folder_path>/weights_cycle_<n>.h5`` and the rate is reset.

    Args:
        folder_path: directory the weight files are written to (created on
            first save if missing).
        nb_epochs: total number of epochs the model will be trained for.
        nb_cycles: number of cycles / snapshots.
        verbose: values > 0 print a message on every save and rate change.

    Raises:
        ValueError: if ``nb_cycles`` is smaller than 1 or larger than
            ``nb_epochs``.
    """

    def __init__(self, folder_path, nb_epochs, nb_cycles=5, verbose=0):
        if nb_cycles < 1:
            raise ValueError('nb_cycles has to be at least 1.')
        if nb_cycles > nb_epochs:
            raise ValueError('nb_cycles has to be lower than or equal to nb_epochs.')

        super(Snapshot, self).__init__()
        self.verbose = verbose
        self.folder_path = folder_path
        self.nb_epochs = nb_epochs
        self.nb_cycles = nb_cycles
        # Number of epochs per cycle.
        self.period = self.nb_epochs // self.nb_cycles
        # Width the cycle number is zero-padded to in the file name.
        self.nb_digits = len(str(self.nb_cycles))
        self.path_format = os.path.join(self.folder_path, 'weights_cycle_{}.h5')


    def on_epoch_end(self, epoch, logs=None):
        """Save the weights and reset the learning rate when a cycle ends."""
        # Only save at the end of a cycle, and not at the beginning.
        if epoch == 0 or (epoch + 1) % self.period != 0: return

        if not os.path.exists(self.folder_path):
            os.makedirs(self.folder_path)

        # Zero-based number of the cycle that just finished.
        cycle = epoch // self.period
        cycle_str = str(cycle).rjust(self.nb_digits, '0')
        self.model.save_weights(self.path_format.format(cycle_str), overwrite=True)

        # Resetting the learning rate
        K.set_value(self.model.optimizer.lr, self.base_lr)

        if self.verbose > 0:
            print('\nEpoch %05d: Reached %d-th cycle, saving model.' % (epoch, cycle))


    def on_epoch_begin(self, epoch, logs=None):
        """Set the learning rate for ``epoch`` from the cosine schedule."""
        if epoch <= 0: return

        lr = self.schedule(epoch)
        K.set_value(self.model.optimizer.lr, lr)

        if self.verbose > 0:
            print('\nEpoch %05d: Snapchot modifying learning '
                  'rate to %s.' % (epoch + 1, lr))


    def set_model(self, model):
        """Attach the model and remember its optimizer's initial rate."""
        self.model = model
        if not hasattr(self.model.optimizer, 'lr'):
            raise ValueError('Optimizer must have a "lr" attribute.')

        # Get initial learning rate
        self.base_lr = float(K.get_value(self.model.optimizer.lr))


    def schedule(self, epoch):
        """Return the learning rate for ``epoch``.

        The rate equals ``base_lr`` at the first epoch of a cycle and decays
        along ``base_lr / 2 * (cos(pi * t) + 1)`` with
        ``t = (epoch % period) / period``.
        """
        phase = math.pi * (epoch % self.period) / self.period
        return self.base_lr / 2 * (math.cos(phase) + 1)

def get_model(emb_w = None,input_layer = None):
    """Build the (uncompiled) two-branch recurrent multi-label classifier.

    Args:
        emb_w: optional 2-D embedding matrix used as the initial weights of
            the Embedding layer; its second dimension sets the embedding
            width. When None, a fresh MAX_NB_WORDS x EMBEDDING_DIM embedding
            is created.
        input_layer: optional existing Input tensor to build on (used by
            load_ensemble to share one input between models). When None, a
            new Input of shape (MAX_SEQUENCE_LENGTH,) is created.

    Returns:
        A Keras Model mapping the input to len(label_index) sigmoid outputs.
    """
    # NOTE(review): statement order is left untouched on purpose — it
    # presumably determines the layer/weight order that load_weights relies
    # on for the saved snapshots; confirm before reordering.
    inp = input_layer if input_layer is not None else Input(shape=(MAX_SEQUENCE_LENGTH,)) 
    if emb_w is None:
        emb = Embedding(MAX_NB_WORDS,EMBEDDING_DIM)(inp)
    else:
        # The vocabulary size stays MAX_NB_WORDS; only the width comes from emb_w.
        emb = Embedding(MAX_NB_WORDS,emb_w.shape[1],weights = [emb_w])(inp)
    emb = SpatialDropout1D(0.2)(emb)
    # Two parallel recurrent branches over the same embeddings.
    x = CuDNNGRU(len(label_index)//2, return_sequences=True)(emb)
    # Despite the "_avg" suffix these are max-pooled summaries.
    x_avg = GlobalMaxPooling1D()(x)
    y = CuDNNLSTM(len(label_index)//2, return_sequences=True)(emb)
    y_avg = GlobalMaxPooling1D()(y)
    p = Concatenate(axis=1)([x_avg,y_avg]) 
    # Attention-pooled summaries of the same two sequences.
    x = Attention(MAX_SEQUENCE_LENGTH)(x)
    y = Attention(MAX_SEQUENCE_LENGTH)(y)
    q = Concatenate(axis=1)([x,y])
    # Max-pooled and attention-pooled features side by side.
    z = Concatenate(axis=-1)([p,q])
    z = Dropout(0.15)(z)
    z = BatchNormalization()(z)
    z = Dense(len(label_index), activation="relu")(z)
    z = Dropout(0.15)(z)
    z = BatchNormalization()(z)
    # One independent sigmoid per label.
    z = Dense(len(label_index), activation="sigmoid")(z)
    model = Model(inputs=inp, outputs=z)
    return model

#https://github.com/arthurdouillard/keras-snapshot_ensembles/blob/master/example.py
def load_ensemble(folder, keep_last=None):
    """Build a model that averages the predictions of saved snapshots.

    Every ``weights_cycle_*.h5`` file in ``folder`` is loaded into its own
    ``get_model`` instance; all instances share one input and their outputs
    are averaged.

    Args:
        folder: directory containing the snapshot weight files.
        keep_last: if given, only the last ``keep_last`` files (in sorted
            file-name order) are used.

    Returns:
        A Keras Model whose output is the mean of the snapshot outputs.

    Raises:
        ValueError: if no snapshot file is found in ``folder``.
    """
    paths = glob.glob(os.path.join(folder, 'weights_cycle_*.h5'))
    print('Found:', ', '.join(paths))
    if keep_last is not None:
        paths = sorted(paths)[-keep_last:]
    print('Loading:', ', '.join(paths))
    if not paths:
        # Fail with a clear message instead of an IndexError further down.
        raise ValueError('No weights_cycle_*.h5 files found in %r.' % (folder,))

    x_in = Input(shape=(MAX_SEQUENCE_LENGTH,))
    outputs = []

    for path in paths:
        m = get_model(input_layer=x_in)
        m.load_weights(path)
        outputs.append(m.output)

    if len(outputs) == 1:
        # The mean of a single snapshot is that snapshot; skip the stacking
        # layer, which is written for a list of several tensors.
        return Model(inputs=x_in, outputs=outputs[0])

    shape = outputs[0].get_shape().as_list()
    x = Lambda(lambda x: K.mean(K.stack(x, axis=0), axis=0),
               output_shape=lambda _: shape)(outputs)
    model = Model(inputs=x_in, outputs=x)
    return model

# Pickled embedding matrix, later passed to get_model as the initial
# Embedding weights.
# NOTE(review): pickle.load executes arbitrary code from the file — only
# load files produced by a trusted preprocessing step.
EMBEDDING_FILE = EMB_BASE + 'matrix_glove_punc_brute'
with open(EMBEDDING_FILE, 'rb') as f:
    embedding_matrix = pickle.load(f)

def f1(y_true, y_pred):
    """Keras metric: F1 score over all entries of the batch.

    Predictions are rounded to 0/1, then true positives, false positives and
    false negatives are summed over every element of the batch (all samples
    and labels together) before precision and recall are computed.

    Args:
        y_true: tensor of 0/1 targets.
        y_pred: tensor of predicted probabilities, same shape as ``y_true``.

    Returns:
        Scalar tensor with the F1 score; NaN results are replaced by 0.
    """
    y_pred = K.round(y_pred)
    tp = K.sum(K.cast(y_true*y_pred, 'float'))
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'))
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'))

    # Epsilon keeps the ratios finite when a denominator would be zero.
    precision = tp / (tp + fp + K.epsilon())
    recall = tp / (tp + fn + K.epsilon())

    score = 2*precision*recall / (precision+recall+K.epsilon())
    score = tf.where(tf.is_nan(score), tf.zeros_like(score), score)
    return K.mean(score)
def dice_coef(y_true, y_pred, smooth=0):
    """Return the Dice coefficient of ``y_true`` and ``y_pred``.

    Computed over all elements at once as
    ``(2 * sum(y_true * y_pred) + smooth) / (sum(y_true) + sum(y_pred) + smooth)``.
    ``smooth`` is added to both numerator and denominator.
    """
    overlap = K.sum(y_true * y_pred)
    total = K.sum(y_true) + K.sum(y_pred)
    numerator = 2. * overlap + smooth
    denominator = total + smooth
    return K.mean(numerator / denominator)
def dice_p_bce(in_gt, in_pred):
    """Loss: binary cross-entropy minus 0.1 times the smoothed Dice coefficient.

    ``in_gt`` holds the targets and ``in_pred`` the predictions; the Dice
    term is computed with ``smooth=1``.
    """
    bce_term = binary_crossentropy(in_gt, in_pred)
    dice_term = dice_coef(in_gt, in_pred, smooth=1)
    return bce_term - 0.1*dice_term

# Snapshot settings shared by the callback, the second training phase and
# the ensemble loader, so they cannot drift apart.
SNAPSHOT_DIR = 'snapshots1'
SNAPSHOT_EPOCHS = 12
SNAPSHOT_CYCLES = 3

model = get_model(emb_w=embedding_matrix)
model.summary()

model.compile(loss=dice_p_bce, optimizer=RMSprop(lr=1e-3), metrics=[f1])
snapshot = Snapshot(SNAPSHOT_DIR, nb_epochs=SNAPSHOT_EPOCHS, verbose=1, nb_cycles=SNAPSHOT_CYCLES)

# Phase 1: plain training at the optimizer's constant learning rate.
model.fit_generator(
    train_gen,
    steps_per_epoch=len(train_df)//BATCH_SIZE,
    epochs=4,
    verbose=1,
    validation_data=(valid_x,valid_y),)

# Phase 2: cyclic learning rate, saving one snapshot per cycle.
model.fit_generator(
    train_gen,
    steps_per_epoch=len(train_df)//BATCH_SIZE,
    epochs=SNAPSHOT_EPOCHS,
    verbose=1,
    callbacks=[snapshot],
    validation_data=(valid_x,valid_y),)

# From here on `model` is the average of the saved snapshots.
model = load_ensemble(SNAPSHOT_DIR)
valid_preds = model.predict(valid_x,verbose=1)

# Pick the decision threshold with the best micro-averaged F1 on validation.
thresholds = np.linspace(0,1,200)
max_f1 = -1
best_thres = -1
for thres in tqdm(thresholds):
    # Named `score` so the Keras metric function `f1` is not overwritten.
    score = f1_score(valid_y,valid_preds>thres,average='micro')
    if score > max_f1:
        max_f1 = score
        best_thres = thres
print(max_f1,best_thres)
pred_test_y = model.predict(np.asarray(test_x), batch_size=256, verbose=1)

# Binarize the test predictions with the selected threshold.
pred_test_y = (pred_test_y>best_thres).astype(int)
label_index_inverse = {val:key for key,val in label_index.items()}
def vector_to_tags(vec):
    """Return the tags whose entry in ``vec`` equals 1, joined by ``'|'``.

    Positions are translated to tag names through the module-level
    ``label_index_inverse`` mapping; an all-zero vector gives ``''``.
    """
    return '|'.join(
        label_index_inverse[position]
        for position, flag in enumerate(vec)
        if flag == 1
    )
# pred_test_y is already binarized to 0/1, so it is decoded directly rather
# than being compared against the threshold a second time.
df_test['tags'] = [vector_to_tags(pred) for pred in tqdm(pred_test_y)]
df_test.drop(columns=['texts'],inplace=True)
df_test.to_csv('result.csv',index=False)