In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import re
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, precision_recall_curve


import tensorflow as tf
from tensorflow import keras 
from tensorflow.keras.preprocessing import sequence
from keras.layers import Input, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.models import Model

import transformers

In [2]:
# Load the comments dataset from the attached input directory.
df = pd.read_csv(filepath_or_buffer="../input/final-data-combined/coments.csv")

In [3]:
# Raw strings are used for the patterns so that regex escapes such as `\[`
# are not parsed as (invalid) Python string escapes.

# Separator characters; `clean` replaces each of them with a space.
replace_by_space = re.compile(r'[\n/(){}\[\]\|@,;]')
# English stop words (NLTK corpus), kept as a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))
# Everything except digits, lowercase a-z, space and '#'; `clean` deletes matches.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #]')

def clean(s):
    """Normalise one raw comment for tokenization.

    Steps: lowercase, turn separator characters into spaces, delete every
    remaining character that is not a digit, a-z, space or '#', then drop
    English stop words.

    Parameters
    ----------
    s : str or None or float
        Raw text. ``None`` and NaN (what pandas yields for an empty CSV
        cell) are treated as empty text; any other non-string value is
        converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text, words separated by single spaces.
    """
    # Missing values would otherwise crash on `.lower()`.
    if s is None or (isinstance(s, float) and s != s):
        return ''
    s = str(s).lower()
    s = replace_by_space.sub(' ', s)
    s = BAD_SYMBOLS_RE.sub('', s)
    # Delete stop words from the text.
    s = ' '.join(word for word in s.split() if word not in STOPWORDS)
    return s


# Overwrite the "Text" column with its cleaned version (element-wise).
df["Text"] = df["Text"].map(clean)

In [5]:
## Tokenize the text and map words to their ids.
## Checkpoint candidates noted by the author: "distilbert-base-uncased", "bert-base-uncased".

tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

# Maximum sequence length in tokens, special tokens included.
MAX_LEN = 40

# Truncate inside the tokenizer rather than afterwards: cutting an already
# encoded sequence with pad_sequences(truncating="post") would slice off the
# trailing [SEP] token of every text longer than MAX_LEN.
tokenized = df["Text"].apply(
    lambda s: tokenizer.encode(
        s,
        add_special_tokens=True,
        max_length=MAX_LEN,
        truncation=True,
    )
)

# Pad our input tokens with value 0 (on the right) up to MAX_LEN.
# `pad_sequences` is the one imported at the top of the notebook.
input_ids = pad_sequences(
    tokenized,
    maxlen=MAX_LEN,
    dtype="long",
    value=0,
    truncating="post",
    padding="post",
)


# Names of the three binary target columns, all together and one list per label.
label_cols = ["Complaint", "Suggestion", "Compliment"]
label_cols_complaint = label_cols[0:1]
label_cols_sug = label_cols[1:2]
label_cols_compliment = label_cols[2:3]

# Target arrays. Selecting with a list keeps every array 2-D, so the
# single-label arrays have one column each.
labels = df[label_cols].values
labels_complaint = df[label_cols_complaint].values
labels_sug = df[label_cols_sug].values
labels_compliment = df[label_cols_compliment].values


# `train_test_split` is imported at the top of the notebook.
# All splits share the same hold-out fraction and seed; keeping them in one
# place prevents the four calls from drifting apart.
split_kwargs = dict(test_size=0.2, random_state=62)

# One stratified split per label. Each call stratifies on its own label, so
# the three train/test partitions are not guaranteed to contain the same rows.
x_train_complaint, x_test_complaint, y_train_complaint, y_test_complaint = train_test_split(
    input_ids, labels_complaint, stratify=labels_complaint, **split_kwargs
)
x_train_sug, x_test_sug, y_train_sug, y_test_sug = train_test_split(
    input_ids, labels_sug, stratify=labels_sug, **split_kwargs
)
x_train_compliment, x_test_compliment, y_train_compliment, y_test_compliment = train_test_split(
    input_ids, labels_compliment, stratify=labels_compliment, **split_kwargs
)

# Split carrying all three labels at once.
# NOTE(review): stratified on the complaint label only - confirm this is intended.
x_train, x_test, y_train, y_test = train_test_split(
    input_ids, labels, stratify=labels_complaint, **split_kwargs
)


# Attention masks are not split here; the disabled call is kept for reference:
# train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels,test_size=0.2, random_state = 62, stratify=labels)


# Number of rows in the complaint train / test partitions.
train_size, test_size = map(len, (x_train_complaint, x_test_complaint))


In [6]:
def build_models(transformer, max_len=MAX_LEN):
    """Build one compiled binary classifier per label on top of ``transformer``.

    Every classifier reads token ids of length ``max_len``, takes the
    transformer's first output at sequence position 0 and feeds it to its
    own sigmoid ``Dense(1)`` head.

    Parameters
    ----------
    transformer : callable
        Encoder applied to the token-id input; element ``[0]`` of its result
        is indexed as ``[:, 0, :]`` (presumably the last hidden state, with
        the [CLS] vector first - confirm for non-BERT encoders).
    max_len : int
        Length of the token-id input.

    Returns
    -------
    dict
        ``{"complaint": Model, "suggestion": Model, "compliment": Model}``.

    Notes
    -----
    * Each model has its own classification head. Previously all three
      models wrapped the same output tensor and were therefore a single
      classifier under three names.
    * The encoder is still shared between the three models, so fine-tuning
      one of them also changes the features seen by the others.
      NOTE(review): pass separately loaded encoders (one call per task) if
      fully independent models are required.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]

    models = {}
    for task in ("complaint", "suggestion", "compliment"):
        out = Dense(1, activation='sigmoid', name=f"{task}_head")(cls_token)
        model = Model(inputs=input_word_ids, outputs=out)
        model.compile(
            # `learning_rate` replaces the deprecated `lr` keyword.
            optimizer=Adam(learning_rate=2e-5),
            loss='binary_crossentropy',
            # Explicit name: a string metric compiled several times gets
            # auto-suffixed ("auc_1", "auc_2"), which would break callbacks
            # that monitor 'val_auc'.
            metrics=[tf.keras.metrics.AUC(name='auc')],
        )
        models[task] = model

    return models

In [8]:
# Load the pretrained BERT encoder and build the per-label classifiers on it.
models = build_models(
    transformer=transformers.TFBertModel.from_pretrained('bert-base-uncased'),
    max_len=MAX_LEN,
)

In [10]:
# Both callbacks watch the validation AUC and treat higher values as better.

# Stop after 3 epochs without improvement and roll back to the best weights.
early_stopping = EarlyStopping(
    monitor='val_auc',
    mode='max',
    patience=3,
    restore_best_weights=True,
    verbose=2,
)

# Write the weights of the best epoch so far to "checkpoint.h5".
model_checkpoint = ModelCheckpoint(
    "checkpoint.h5",
    monitor='val_auc',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
    save_freq='epoch',
    verbose=2,
)
            

In [11]:
# Define weights for unbalanced categories.
def get_weights(y_train):
    """Compute balanced class weights for a binary label array.

    Each class receives ``total / (2 * class_count)``, so the rarer class
    gets the larger weight and a perfectly balanced set yields ``(1.0, 1.0)``.

    Parameters
    ----------
    y_train : array-like
        Labels containing only 0 and 1, in any shape (flattened internally).

    Returns
    -------
    tuple of float
        ``(weight_for_0, weight_for_1)``.

    Raises
    ------
    ValueError
        If the labels are not binary, or if either class is absent
        (its weight would be a division by zero).
    """
    labels = np.asarray(y_train).astype(int).ravel()
    # minlength=2 guarantees two counts even when only class 0 is present.
    counts = np.bincount(labels, minlength=2)
    if counts.size > 2:
        raise ValueError(
            "get_weights expects binary labels (0/1); found label value %d"
            % (counts.size - 1)
        )

    neg, pos = int(counts[0]), int(counts[1])
    if neg == 0 or pos == 0:
        raise ValueError(
            "get_weights needs at least one sample of each class "
            "(got %d negatives and %d positives)" % (neg, pos)
        )
    total = neg + pos

    weight_for_0 = (1 / neg) * total / 2.0
    weight_for_1 = (1 / pos) * total / 2.0

    return weight_for_0, weight_for_1

# Class weights for each of the three labels, computed on the training targets.
(
    (weight_for_0_complaint, weight_for_1_complaint),
    (weight_for_0_sug, weight_for_1_sug),
    (weight_for_0_compliment, weight_for_1_compliment),
) = (
    get_weights(y_train_complaint),
    get_weights(y_train_sug),
    get_weights(y_train_compliment),
)

In [16]:
# Fine-tune the complaint classifier with class weighting.
# The held-out complaint split is passed as validation data, so the callbacks
# pick the best epoch on that same split.
history_complaint = models["complaint"].fit(
    x_train_complaint,
    y_train_complaint,
    batch_size=32,
    epochs=8,
    validation_data=(x_test_complaint, y_test_complaint),
    class_weight={0: weight_for_0_complaint, 1: weight_for_1_complaint},
    callbacks=[early_stopping, model_checkpoint],
)


In [17]:
def make_preds(model, x_test, weights_path="checkpoint.h5"):
    """Load saved weights into ``model`` and return its predictions.

    Parameters
    ----------
    model
        Object exposing ``load_weights(path)`` and ``predict(x)``.
    x_test
        Inputs forwarded unchanged to ``model.predict``.
    weights_path : str or None
        Weights file to load before predicting. Defaults to
        ``"checkpoint.h5"``; pass ``None`` to predict with the weights the
        model currently holds.

    Returns
    -------
    Whatever ``model.predict(x_test)`` returns.
    """
    if weights_path is not None:
        model.load_weights(weights_path)
    return model.predict(x_test)


# Predicted probabilities -> hard 0/1 labels (rounded), then the per-class report.
complaint_probs = make_preds(models["complaint"], x_test_complaint)
complaint_pred_labels = np.round(complaint_probs).astype(int)
print("Complaint", "\n", classification_report(y_test_complaint, complaint_pred_labels))