In [1]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
import inflection

from keras.models import Model,Sequential
from keras.layers import *
from keras.preprocessing.text import Tokenizer
from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras import backend as K
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from keras.optimizers import RMSprop, Adam
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [3]:
#!pip install keras-bert

In [None]:
#Test and Train Data
train = pd.read_csv("/Users/s0c02nj/Desktop/Innoplexus-AV/train_F3WbcTw.csv",encoding="utf-8")
test = pd.read_csv("/Users/s0c02nj/Desktop/Innoplexus-AV/test_tOlRoBf.csv",encoding="utf-8")

In [None]:
# Preview the first five rows of the training frame.
train.head(5)

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Drug names that occur in the test set but never in the training set.
set_diff = set(test['drug']).difference(set(train['drug']))

In [None]:
# Number of training rows per sentiment label.
train['sentiment'].value_counts()

In [None]:
# Shape of the array of distinct drug names in the training set
# (its single entry is the number of distinct drugs).
train['drug'].unique().shape

In [None]:
# Number of test rows whose drug column equals this exact name.
# NOTE(review): 'afainib' may be a misspelling of the intended drug name —
# confirm against the values actually present in test['drug'].
test[test['drug'] == 'afainib'].shape[0]

In [None]:
# For every drug seen only in the test set, count how many test rows mention it.
dict_test = {
    drug_name: test[test['drug'] == drug_name].shape[0]
    for drug_name in list(set_diff)
}

In [None]:
# Display the per-drug test-row counts held in dict_test.
dict_test

Categorical Variable ---> Drugs

In [None]:
# 1. Label-encode the categorical variable `drug`.
# The encoder is fitted on train and test together so that a drug appearing in
# only one of the two sets still receives an integer code.
train_copy = train.drop(['sentiment'], axis=1)
y_lab = train['sentiment']
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_join = pd.concat([train_copy, test], sort=False)

# Label encoding on the combined frame
le = LabelEncoder()
df_join['drug'] = le.fit_transform(df_join['drug'])

# Split back into train and test by position. The boundary is derived from
# the training frame rather than a hard-coded row count, so the split stays
# correct if the input files change size.
n_train = len(train_copy)
df_train = df_join.iloc[:n_train]
df_test = df_join.iloc[n_train:]

# Encoded drug column for each split
drug_train = df_train['drug']
drug_test = df_test['drug']

# Number of distinct drug categories (used as the drug embedding's input size)
count_drug = len(le.classes_)

In [None]:
# Convert the sentiment labels with Keras' to_categorical (one-hot matrix),
# the target format used with the categorical cross-entropy loss.
y_cat = to_categorical(y_lab)

In [None]:
#df_join.head()

In [None]:
# English stop words, loaded lazily on the first call and then reused.
_STOP_WORDS_CACHE = None


def text_preprocessing(text):
    """Normalise one raw text string before tokenisation.

    Steps, in order:
      1. lower-case the text;
      2. replace every character outside ``a-z`` with a space;
      3. drop tokens of two characters or fewer;
      4. drop English stop words (NLTK list);
      5. singularise each remaining token with ``inflection.singularize``;
      6. collapse any remaining runs of whitespace.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        # Load the stop-word list once and keep it as a set: the list was
        # previously reloaded and scanned linearly for every single token.
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))

    text = re.sub(r'[^a-z]', ' ', text.lower())
    tokens = [
        tok for tok in text.split()
        if len(tok) > 2 and tok not in _STOP_WORDS_CACHE
    ]
    tokens = [inflection.singularize(tok) for tok in tokens]
    # Final split/join guards against whitespace introduced by singularize.
    return ' '.join(' '.join(tokens).split())

In [None]:
# Clean the raw text of both splits. The second statement was missing its
# closing parenthesis, which made the cell a SyntaxError.
train['text_clean'] = train['text'].apply(text_preprocessing)
test['text_clean'] = test['text'].apply(text_preprocessing)

In [None]:
# Fit a single tokenizer on the cleaned text of both splits so that train and
# test share one vocabulary; num_words is set from max_features.
max_features = 80000
full_text = train['text_clean'].values.tolist() + test['text_clean'].values.tolist()
tokenizer = Tokenizer(num_words=max_features, filters='', lower=True)
tokenizer.fit_on_texts(full_text)

In [None]:
# Convert each cleaned document into a sequence of integer word indices using
# the fitted tokenizer. Missing values are first replaced by the literal
# string 'missing'.
train_tokenized = tokenizer.texts_to_sequences(train['text_clean'].fillna('missing'))
test_tokenized =  tokenizer.texts_to_sequences(test['text_clean'].fillna('missing'))

In [None]:
# Bring every index sequence to length max_len so the model gets a fixed-size
# input; padding='pre' places the padding at the start of short sequences.
max_len = 200
X_train = pad_sequences(train_tokenized, maxlen = max_len,padding='pre')
X_test =  pad_sequences(test_tokenized, maxlen = max_len,padding='pre')

In [None]:
# word_index is the tokenizer's dictionary mapping each word to its integer
# index; its length is the number of distinct tokens seen during fitting.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
# Load the pre-trained GloVe vectors into a word -> vector dictionary.
folder_path= '/Users/s0c02nj/Downloads/glove.6B/glove.6B.50d.txt'

# word2vec maps each word to its float32 vector; key keeps the words in the
# order they appear in the file.
word2vec = {}
key = []

# The file is streamed line by line inside a context manager: the handle is
# always closed, and the whole file is never held in memory as a list of
# lines. The encoding is stated explicitly because the platform default is
# not guaranteed to be UTF-8.
with open(folder_path, encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ..." separated by single spaces.
        parts = line.rstrip('\n').split(' ')
        word = parts[0]
        key.append(word)
        word2vec[word] = np.array(parts[1:], dtype='float32')

In [None]:
# Build the embedding matrix: row i holds the pre-trained vector of the word
# whose tokenizer index is i. Rows of words without a pre-trained vector
# (and row 0, which no word maps to) stay all-zero.
EMBEDDING_DIM = 50  # must match the width of the vectors stored in word2vec
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))

for word, i in word_index.items():
    if i >= nb_words:
        continue
    # Look the vector up afresh for every word. Previously the variable kept
    # its value from the last word that was found, so an out-of-vocabulary
    # word silently received another word's vector (or raised NameError when
    # the very first word was missing).
    embedding_vector = word2vec.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

#### BiLSTM with Attention

In [None]:
def model_attn_bilstm(lstm_units=64, drug_embedding_dim=70, n_classes=3):
    """Build the two-input sentiment model (drug id + text) with attention.

    The drug id goes through its own trainable embedding. The text goes
    through a frozen embedding initialised from ``embedding_matrix``, a
    bidirectional LSTM, and a softmax attention over time steps. Both
    representations are concatenated and classified with a softmax layer.

    Reads the notebook-level names ``count_drug``, ``max_len`` and
    ``embedding_matrix``.

    Parameters
    ----------
    lstm_units : int, default 64
        Units per LSTM direction; the BiLSTM output width is twice this.
    drug_embedding_dim : int, default 70
        Width of the drug embedding.
    n_classes : int, default 3
        Number of output classes.

    Returns
    -------
    keras.models.Model
        Uncompiled model taking ``[drug_ids, text_sequences]`` as inputs.
    """
    # Input 1: the label-encoded drug id
    inputs1 = Input(shape=(1,))
    layer_drug = Embedding(count_drug, drug_embedding_dim, input_length=1)(inputs1)
    layer_drug = Flatten()(layer_drug)

    # Input 2: the padded text sequence.
    # Vocabulary size and vector width are taken from the embedding matrix
    # itself; hard-coded values break as soon as the vocabulary or the
    # pre-trained vectors change.
    vocab_size, embedding_dim = embedding_matrix.shape
    inputs2 = Input(shape=(max_len,))
    layer = Embedding(vocab_size, embedding_dim, input_length=max_len,
                      trainable=False, weights=[embedding_matrix])(inputs2)
    layer = Bidirectional(LSTM(lstm_units, return_sequences=True))(layer)

    # Attention: one score per time step, normalised with softmax, then
    # broadcast across the BiLSTM feature axis (2 * lstm_units wide because
    # the forward and backward outputs are concatenated).
    activations_weights = Dense(1, activation='tanh')(layer)
    activations_weights = Flatten()(activations_weights)
    activations_weights = Activation('softmax')(activations_weights)
    activations_weights = RepeatVector(2 * lstm_units)(activations_weights)
    activations_weights = Permute([2, 1])(activations_weights)
    activations_weighted = multiply([layer, activations_weights])
    # Weighted sum over the time axis gives one vector per document.
    sent_representation = Lambda(lambda x: K.sum(x, axis=-2))(activations_weighted)

    # Combine the text representation with the drug embedding
    layer_sentiment = concatenate([sent_representation, layer_drug], axis=1)

    # Dense layer
    layer_sentiment = Dense(30, activation='tanh')(layer_sentiment)

    # Output layer
    probabilities = Dense(n_classes, activation='softmax')(layer_sentiment)

    model = Model(inputs=[inputs1, inputs2], outputs=probabilities)
    return model

In [None]:
# Instantiate the attention BiLSTM and print its layer-by-layer summary.
model_lstm = model_attn_bilstm()
model_lstm.summary()

In [None]:
def f1(y_true, y_pred):
    """Batch-wise F1 score built from Keras backend ops.

    Values are clipped to [0, 1], rounded, and summed over every element of
    the batch, so precision and recall are pooled over all classes rather
    than computed per class. K.epsilon() is added to each denominator to
    avoid division by zero.
    """
    def _rounded_total(tensor):
        # Clip to [0, 1], round to 0/1 and add everything up.
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _rounded_total(y_true * y_pred)

    # Precision: fraction of predicted positives that are correct.
    precision_score = hits / (_rounded_total(y_pred) + K.epsilon())
    # Recall: fraction of actual positives that were predicted.
    recall_score = hits / (_rounded_total(y_true) + K.epsilon())

    harmonic = (precision_score * recall_score) / (
        precision_score + recall_score + K.epsilon()
    )
    return 2 * harmonic


In [None]:
# Compile with categorical cross-entropy, the Adam optimizer (0.009 is passed
# as its first positional argument) and the custom batch-wise f1 metric.
model_lstm.compile(loss = "categorical_crossentropy", optimizer = Adam(0.009), metrics = [f1])

In [None]:
#Class weights
class_weight = {0: 8.,
                1: 8.,
                2: 1.}

In [None]:
# Train the model on the two inputs (encoded drug ids, padded text sequences)
# against the one-hot labels. validation_split=0.1 holds out a tenth of the
# supplied samples for validation, and class_weight re-weights the loss per
# class. The returned History object is kept in `history`.
history = model_lstm.fit([drug_train,
                          X_train] ,
                          y_cat, 
                          batch_size = 512, 
                          epochs = 6, 
                          validation_split=0.1,
                          class_weight=class_weight,
                          verbose = 1)

In [None]:
# Predict class probabilities for the test set, passing the inputs in the
# same order used for training (drug ids first, then text sequences).
pred = model_lstm.predict([drug_test,
                          X_test], 
                          verbose = 1)

In [None]:
# Pick the index of the highest-probability class for every test row.
pred_class = pred.argmax(axis=-1)

In [None]:
#len(pred_class[pred_class == 1])

In [None]:
# Load the sample submission file, used as the template for the output.
sub = pd.read_csv('/Users/s0c02nj/Desktop/Innoplexus-AV/sample_submission_i5xnIZD.csv')

In [None]:
# Fill the sentiment column with the predicted class indices.
# assumes the rows of `sub` are in the same order as the test set — TODO confirm
sub['sentiment'] = pred_class

In [None]:
# Write the submission to CSV without the index column.
sub.to_csv('/Users/s0c02nj/Desktop/Innoplexus-AV/BiLSTM_vader.csv', index= False)