In [1]:
## load modules
import sys
import nltk
import spacy
nlp = spacy.load('en_core_web_lg')
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

import pickle
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')


In [2]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [3]:
import os; os.getcwd()

'/home/DHE/ss1043/projects/HDS-MyChart/emergency'

In [4]:
from pandas import ExcelWriter
from pandas import ExcelFile


In [5]:
#df = pd.read_excel('master_label_data_classifiers_25feb_20 - J_Wosik.xlsx', sheet_name=0)

In [7]:
df.columns

Index(['Unnamed: 0_x', 'DEPARTMENT_NAME', 'ENCOUNTER_DATE', 'ENCOUNTER_ID',
       'ENCOUNTER_TYPE', 'END_DATE', 'FINANCIAL_CLASS', 'INDICATOR',
       'MESSAGE_DATE', 'MESSAGE_ID', 'MESSAGE_TEXT', 'MESSAGE_TO_FROM_PATIENT',
       'MESSAGE_TO_FROM_PROVIDER', 'MESSAGE_TYPE', 'MRN', 'PAT_ID', 'PAT_NAME',
       'PROVIDER_TYPE', 'START_DATE', 'Clean_Message', 'cluster_label',
       'binary_cluster', 'doc_binary', 'doc_topic_perct', 'dominant_topic_ids',
       'cluster_topic_interp', 'cluster', 'Urgency', '# of questions',
       'statin related', 'Bleeding', 'Thanks/best wishes', 'ECG question',
       'Stopping Meds Prior to Surgery', 'Requests for referrals',
       'Simple Refill', 'Includes vitals', 'INR message'],
      dtype='object')

In [8]:
df.Urgency.value_counts()

 0.0    955
-1.0    631
 1.0    170
Name: Urgency, dtype: int64

In [9]:
df[df.Urgency.isin({0.0, 1.0,-1.0})].Urgency.value_counts()

-1.0    1106
 0.0     206
 1.0     114
Name: Urgency, dtype: int64

In [9]:
labeled_data = df[df.Urgency.isin({0.0, 1.0,-1.0})]

In [10]:
labeled_data.shape

(1756, 38)

In [5]:
import json

In [6]:
## load data from json
with open('original_labelled_data.json', 'r') as jf:
    data=json.load(jf)

In [7]:
data.keys()

dict_keys(['sentences', 'labels'])

In [8]:
sentences,labels=data['sentences'], data['labels']

## text classification

In [9]:
import sklearn
from sklearn.model_selection import train_test_split
from collections import Counter

In [11]:
[it for it,lab in zip(sentences,labels) if lab==2]

In [11]:
## split data into train and test
X_train, X_test, y_train, y_test = train_test_split(
    sentences, labels, test_size=0.2, random_state=2020, stratify=labels)

In [12]:
Counter(y_train).most_common()

[(1, 764), (0, 504), (2, 136)]

In [13]:
Counter(labels).most_common()

[(1, 955), (0, 631), (2, 170)]

### traditional machine methods
#### text preprocessing, feature engineering, classifier

Here’s the complete script which performs the aforementioned data pre-processing steps, you can always add or remove steps which best suits the data set you are dealing with:

    Remove Blank rows in Data, if any
    Change all the text to lower case
    Word Tokenization
    Remove Stop words
    Remove Non-alpha text
    Word Lemmatization

In [17]:
full_data = pd.DataFrame.from_dict({'text':sentences, 'label':labels})

### preprocessing full dataset

In [19]:
# Step - a : Remove blank rows if any.
full_data['text'].dropna(inplace=True)
# Step - b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
full_data['text'] = [entry.lower() for entry in full_data['text']]
# Step - c : Tokenization : In this each entry in the full_data will be broken into set of words
full_data['text']= [word_tokenize(entry) for entry in full_data['text']]
# Step - d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(full_data['text']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    full_data.loc[index,'text_final'] = str(Final_words)

### split train/test data

In [21]:
## train test datasets split, use stratify for imbalanced data
Train_X, Test_X, Train_Y, Test_Y = train_test_split(full_data['text_final'],
                        full_data['label'],test_size=0.2,random_state=2020, stratify=full_data['label'])

In [25]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [22]:
Counter(Train_Y).most_common()

[(1, 764), (0, 504), (2, 136)]

### feature engineering

In [23]:
Tfidf_vect = TfidfVectorizer(max_features=1000)
Tfidf_vect.fit(full_data['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [25]:
## vocabulary
print(Tfidf_vect.vocabulary_)
## features
print(Train_X_Tfidf)

  (0, 873)	1.0
  (1, 873)	0.06572678903746632
  (1, 861)	0.1624260885519938
  (1, 835)	0.18206980132421033
  (1, 801)	0.2872862952725449
  (1, 754)	0.11124812693897355
  (1, 725)	0.11602391120370734
  (1, 716)	0.1624260885519938
  (1, 699)	0.18645716173186583
  (1, 673)	0.4117504337853171
  (1, 609)	0.13511053007674134
  (1, 581)	0.22978395987928016
  (1, 556)	0.11952149442384391
  (1, 542)	0.17632776397801436
  (1, 528)	0.15050837089038255
  (1, 467)	0.11331366088211951
  (1, 459)	0.22978395987928016
  (1, 389)	0.12966295679654294
  (1, 370)	0.11971499880895857
  (1, 334)	0.23809594790712943
  (1, 271)	0.20041430926101428
  (1, 204)	0.23809594790712943
  (1, 193)	0.1286629098780136
  (1, 147)	0.2229925711594858
  (1, 107)	0.12170742613351951
  :	:
  (1399, 184)	0.3019634706485925
  (1399, 176)	0.2130205124755125
  (1399, 148)	0.23644238296295578
  (1399, 129)	0.3543534206332035
  (1399, 128)	0.3543534206332035
  (1399, 127)	0.3224656033657172
  (1399, 58)	0.25160947379204723
  (1400, 

### classifiers

In [12]:
from sklearn.metrics import classification_report

In [27]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB(alpha=5.,fit_prior=False)
Naive.fit(Train_X_Tfidf,Train_Y)# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

Naive Bayes Accuracy Score ->  85.79545454545455


In [167]:
print(classification_report(Test_Y, predictions_NB, digits=3))

              precision    recall  f1-score   support

           0      0.976     0.969     0.972       127
           1      0.841     0.916     0.877       191
           2      0.222     0.118     0.154        34

    accuracy                          0.858       352
   macro avg      0.680     0.667     0.668       352
weighted avg      0.830     0.858     0.842       352



In [82]:
# Classifier - Algorithm - SVM -- linear kernel
# fit the training dataset on the classifier
SVM = svm.SVC(C=1., kernel='linear', degree=2, gamma='auto', random_state=82)#, class_weight='balanced')
SVM.fit(Train_X_Tfidf,Train_Y)# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  89.48863636363636


In [168]:
print(classification_report(Test_Y, predictions_SVM, digits=3))

              precision    recall  f1-score   support

           0      0.984     0.976     0.980       127
           1      0.867     0.953     0.908       191
           2      0.500     0.235     0.320        34

    accuracy                          0.892       352
   macro avg      0.784     0.722     0.736       352
weighted avg      0.874     0.892     0.877       352



In [62]:
# Classifier - Algorithm - SVM -- RBF kernel
# fit the training dataset on the classifier
SVM = svm.SVC(C=10., kernel='rbf', gamma=1.)#, class_weight='balanced')
SVM.fit(Train_X_Tfidf,Train_Y)# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)# Use accuracy_score function to get the accuracy
#print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

In [63]:
print(classification_report(Test_Y, predictions_SVM, target_names=['Medium','Non-urgent',
                                                                   'Urgent'], digits=3))

              precision    recall  f1-score   support

      Medium      1.000     0.961     0.980       127
  Non-urgent      0.862     0.979     0.917       191
      Urgent      0.615     0.235     0.340        34

    accuracy                          0.901       352
   macro avg      0.826     0.725     0.746       352
weighted avg      0.888     0.901     0.884       352



### mini-batch SVM

In [85]:
from sklearn import linear_model

In [148]:
clf = linear_model.SGDClassifier(loss='hinge',max_iter=500, tol=1e-2, alpha=6.e-4,random_state=72)
clf.fit(Train_X_Tfidf,Train_Y)
predictions_SVM = clf.predict(Test_X_Tfidf)

In [161]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score

In [13]:
from collections import defaultdict

In [181]:
res = defaultdict(list)

In [184]:
tmp['macro avg']

{'f1-score': 0.7477261102069286,
 'precision': 0.7947840486540176,
 'recall': 0.7313211387535598,
 'support': 352}

In [185]:
for seed in [42,52,62,72,82]:
    clf0 = SGDClassifier(loss='hinge',max_iter=500, tol=1e-2, alpha=1.e-3,
                         random_state=seed)
    calibrated_clf = CalibratedClassifierCV(clf0, cv=5, method='sigmoid')
    calibrated_clf.fit(Train_X_Tfidf,Train_Y)
    predictions_SVM = calibrated_clf.predict(Test_X_Tfidf) 
    ## temporary result
    tmp = classification_report(Test_Y, predictions_SVM, digits=3,
                            output_dict=True)
    ## append to list
    res['accuracy'].append(tmp['accuracy'])
    res['f1-score'].append(tmp['macro avg']['f1-score'])
    res['precision'].append(tmp['macro avg']['precision'])
    res['recall'].append(tmp['macro avg']['recall'])
    

In [189]:
np.mean(res['accuracy']),np.std(res['accuracy'])

(0.8948863636363636, 0.0017967486705501962)

In [190]:
np.mean(res['f1-score']),np.std(res['f1-score'])

(0.7476424039899487, 0.007291395439872533)

In [191]:
np.mean(res['precision']),np.std(res['precision'])

(0.7945688418908233, 0.0067396360821895315)

In [192]:
np.mean(res['recall']),np.std(res['recall'])

(0.7313211387535598, 0.0062005444317027175)

### BioBERT  result

In [193]:
## precision, recall, f1, accuracy
res=np.array([[0.780,0.772,0.776,0.892],
              [0.791,0.765,0.775,0.895],
              [0.778,0.756,0.764,.892],
              [0.774,0.747,0.757,.892],
              [0.747,0.751,0.749,.872]
             ])

In [196]:
res.mean(axis=0)

array([0.774 , 0.7582, 0.7642, 0.8886])

In [197]:
res.std(axis=0)

array([0.01462874, 0.00915205, 0.01038075, 0.00838093])

In [200]:
## precision, recall, f1, accuracy -- label embedding BERT
res=np.array([[0.790,0.788,0.789,0.892],
              [0.779,0.801,0.788,0.878],
              [0.766,0.776,0.770,.875],
              [0.793,0.817,0.803,.889],
              [0.790,0.804,0.796,.892]
             ])

In [201]:
res.mean(axis=0)

array([0.7836, 0.7972, 0.7892, 0.8852])

In [202]:
res.std(axis=0)

array([0.01001199, 0.01404849, 0.01101635, 0.00724983])

In [159]:
prob_svm = calibrated_clf.predict_proba(Test_X_Tfidf)

In [170]:
print(classification_report(Test_Y, predictions_SVM, digits=3))

              precision    recall  f1-score   support

           0      0.984     0.976     0.980       127
           1      0.867     0.953     0.908       191
           2      0.500     0.235     0.320        34

    accuracy                          0.892       352
   macro avg      0.784     0.722     0.736       352
weighted avg      0.874     0.892     0.877       352



In [177]:
tmp=classification_report(Test_Y, predictions_SVM, digits=3,
                            output_dict=True)


In [179]:
tmp['accuracy']

0.8948863636363636

In [178]:
tmp

{'0': {'f1-score': 0.9802371541501976,
  'precision': 0.9841269841269841,
  'recall': 0.9763779527559056,
  'support': 127},
 '1': {'f1-score': 0.91,
  'precision': 0.8708133971291866,
  'recall': 0.9528795811518325,
  'support': 191},
 '2': {'f1-score': 0.35294117647058826,
  'precision': 0.5294117647058824,
  'recall': 0.2647058823529412,
  'support': 34},
 'accuracy': 0.8948863636363636,
 'macro avg': {'f1-score': 0.7477261102069286,
  'precision': 0.7947840486540176,
  'recall': 0.7313211387535598,
  'support': 352},
 'weighted avg': {'f1-score': 0.8815344277757814,
  'precision': 0.8787201302153456,
  'recall': 0.8948863636363636,
  'support': 352}}

In [174]:
help(classification_report)

Help on function classification_report in module sklearn.metrics.classification:

classification_report(y_true, y_pred, labels=None, target_names=None, sample_weight=None, digits=2, output_dict=False)
    Build a text report showing the main classification metrics
    
    Read more in the :ref:`User Guide <classification_report>`.
    
    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.
    
    y_pred : 1d array-like, or label indicator array / sparse matrix
        Estimated targets as returned by a classifier.
    
    labels : array, shape = [n_labels]
        Optional list of label indices to include in the report.
    
    target_names : list of strings
        Optional display names matching the labels (same order).
    
    sample_weight : array-like of shape = [n_samples], optional
        Sample weights.
    
    digits : int
        Number of digits for formatting output floating po

## Deep learning method

### fasttext

In [18]:
import keras

Using TensorFlow backend.


In [19]:
preprocess=keras.preprocessing.text.Tokenizer(num_words=5000, 
                                              lower=True, split=' ', char_level=False, 
                                               oov_token=True, document_count=0)


In [27]:
preprocess.fit_on_texts(sentences)

In [28]:
txt_ids=preprocess.texts_to_sequences(sentences)

In [22]:
labels[0]

0

In [30]:
## train test datasets split, use stratify for imbalanced data
train_x, test_x, train_y, test_y = train_test_split(txt_ids,
                        labels,test_size=0.2,random_state=2020, stratify=labels)

In [31]:


from __future__ import print_function
import numpy as np

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from keras.datasets import imdb


def create_ngram_set(input_list, ngram_value=2):
    """
    Extract a set of n-grams from a list of integers.

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
    {(4, 9), (4, 1), (1, 4), (9, 4)}

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
    [(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)]
    """
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))


def add_ngram(sequences, token_indice, ngram_range=2):
    """
    Augment the input list of list (sequences) by appending n-grams values.

    Example: adding bi-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
    >>> add_ngram(sequences, token_indice, ngram_range=2)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]

    Example: adding tri-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
    >>> add_ngram(sequences, token_indice, ngram_range=3)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]]
    """
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        for ngram_value in range(2, ngram_range + 1):
            for i in range(len(new_list) - ngram_value + 1):
                ngram = tuple(new_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)

    return new_sequences




In [33]:
from keras.utils import to_categorical

In [34]:
x_train,x_test,y_train,y_test=train_x,test_x,\
to_categorical(train_y),to_categorical(test_y)

In [35]:
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print('Average train sequence length: {}'.format(
    np.mean(list(map(len, x_train)), dtype=int)))
print('Average test sequence length: {}'.format(
    np.mean(list(map(len, x_test)), dtype=int)))


1404 train sequences
352 test sequences
Average train sequence length: 61
Average test sequence length: 62


In [36]:
# Set parameters:
# ngram_range = 2 will add bi-grams features
ngram_range = 2
max_features = 2000
maxlen = 100
batch_size = 32
embedding_dims = 50
epochs = 5


In [37]:
if ngram_range > 1:
    print('Adding {}-gram features'.format(ngram_range))
    # Create set of unique n-gram from the training set.
    ngram_set = set()
    for input_list in x_train:
        for i in range(2, ngram_range + 1):
            set_of_ngram = create_ngram_set(input_list, ngram_value=i)
            ngram_set.update(set_of_ngram)

    # Dictionary mapping n-gram token to a unique integer.
    # Integer values are greater than max_features in order
    # to avoid collision with existing features.
    start_index = max_features + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    indice_token = {token_indice[k]: k for k in token_indice}

    # max_features is the highest integer that could be found in the dataset.
    max_features = np.max(list(indice_token.keys())) + 1

    # Augmenting x_train and x_test with n-grams features
    x_train = add_ngram(x_train, token_indice, ngram_range)
    x_test = add_ngram(x_test, token_indice, ngram_range)
    print('Average train sequence length: {}'.format(
        np.mean(list(map(len, x_train)), dtype=int)))
    print('Average test sequence length: {}'.format(
        np.mean(list(map(len, x_test)), dtype=int)))

Adding 2-gram features
Average train sequence length: 121
Average test sequence length: 100


In [38]:
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')


Pad sequences (samples x time)
x_train shape: (1404, 100)
x_test shape: (352, 100)
Build model...


In [41]:
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))

# we add a GlobalAveragePooling1D, which will average the embeddings
# of all words in the document
model.add(GlobalAveragePooling1D())

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(3, activation='sigmoid'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [42]:
model.fit(x_train, y_train,
          batch_size=32,
          epochs=10,
          validation_data=(x_test, y_test))


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1404 samples, validate on 352 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7f1b086cee10>

In [49]:
from sklearn.metrics import classification_report

In [51]:
y_test

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]], dtype=float32)

In [52]:
y_predict0=model.predict(x_test)
y_predict=np.argmax(y_predict0,axis=1)
print(classification_report(np.argmax(y_test,axis=1), y_predict, target_names=['Non-urgent',
                                                                   'Medium','Urgent']))

              precision    recall  f1-score   support

  Non-urgent       0.80      0.99      0.89       127
      Medium       0.83      0.84      0.83       191
      Urgent       0.00      0.00      0.00        34

    accuracy                           0.82       352
   macro avg       0.54      0.61      0.57       352
weighted avg       0.74      0.82      0.77       352



  'precision', 'predicted', average, warn_for)


## CNN classifier

In [14]:
from keras.layers import Dropout, Dense,Input,Embedding,Flatten, AveragePooling2D, Conv2D,Reshape
from keras.models import Sequential,Model
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn import metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.datasets import fetch_20newsgroups
from keras.layers.merge import Concatenate


Using TensorFlow backend.


In [15]:
## train test datasets split, use stratify for imbalanced data
train_x, test_x, train_y, test_y = train_test_split(sentences,
                        labels,test_size=0.2,random_state=2020, stratify=labels)
# Encoder = LabelEncoder()
# train_y = Encoder.fit_transform(train_y)
# test_y = Encoder.fit_transform(test_y)

In [19]:
w2v_file='../glove.6B/glove.6B.100d.txt'
f = open(w2v_file, encoding="utf8")

In [20]:
w2v_file='../glove.6B/glove.6B.100d.txt'
def loadData_Tokenizer(X_train, X_test, MAX_NB_WORDS=75000,
                       MAX_SEQUENCE_LENGTH=500, w2v_file=w2v_file):
    """
    use glove embedding
    """
    np.random.seed(7)
    text = np.concatenate((X_train, X_test), axis=0)
    text = np.array(text)
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    word_index = tokenizer.word_index
    text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Found %s unique tokens.' % len(word_index))
    indices = np.arange(text.shape[0])
    # np.random.shuffle(indices)
    text = text[indices]
    print(text.shape)
    X_train = text[0:len(X_train), ]
    X_test = text[len(X_train):, ]
    embeddings_index = {}
    f = open(w2v_file, encoding="utf8") ## GloVe file which could be download https://nlp.stanford.edu/projects/glove/
    for line in f:
        values = line.split()
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except:
            pass
        embeddings_index[word] = coefs
    f.close()
    print('Total %s word vectors.' % len(embeddings_index))
    return (X_train, X_test, word_index,embeddings_index)


In [21]:
def Build_Model_CNN_Text(word_index, embeddings_index, nclasses, MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=100, 
                         dropout=0.5):

    """
        def buildModel_CNN(word_index, embeddings_index, nclasses, MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=50, dropout=0.5):
        word_index in word index ,
        embeddings_index is embeddings index, look at data_helper.py
        nClasses is number of classes,
        MAX_SEQUENCE_LENGTH is maximum lenght of text sequences,
        EMBEDDING_DIM is an int value for dimention of word embedding look at data_helper.py
    """

    model = Sequential()
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            if len(embedding_matrix[i]) !=len(embedding_vector):
                print("could not broadcast input array from shape",str(len(embedding_matrix[i])),
                                 "into shape",str(len(embedding_vector))," Please make sure your"
                                 " EMBEDDING_DIM is equal to embedding_vector file ,GloVe,")
                exit(1)

            embedding_matrix[i] = embedding_vector

    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)

    # applying a more complex convolutional approach
    convs = []
    filter_sizes = []
    layer = 5
    print("Filter  ",layer)
    for fl in range(0,layer):
        filter_sizes.append((fl+2,fl+2))

    node = 128
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    emb = Reshape((500,10, 10), input_shape=(500,100))(embedded_sequences)

    for fsz in filter_sizes:
        l_conv = Conv2D(node, padding="same", kernel_size=fsz, activation='relu')(emb)
        l_pool = AveragePooling2D(pool_size=(5,1), padding="same")(l_conv)
        #l_pool = Dropout(0.25)(l_pool)
        convs.append(l_pool)

    l_merge = Concatenate(axis=1)(convs)
    l_cov1 = Conv2D(node, (5,5), padding="same", activation='relu')(l_merge)
    l_cov1 = AveragePooling2D(pool_size=(5,2), padding="same")(l_cov1)
    l_cov2 = Conv2D(node, (5,5), padding="same", activation='relu')(l_cov1)
    l_pool2 = AveragePooling2D(pool_size=(5,2), padding="same")(l_cov2)
    l_cov2 = Dropout(dropout)(l_pool2)
    l_flat = Flatten()(l_cov2)
    l_dense = Dense(128, activation='relu')(l_flat)
    l_dense = Dropout(dropout)(l_dense)

    preds = Dense(nclasses, activation='softmax')(l_dense)
    model = Model(sequence_input, preds)

    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

In [28]:
model = Sequential()
help(model.compile)

Help on method compile in module keras.engine.training:

compile(optimizer, loss=None, metrics=None, loss_weights=None, sample_weight_mode=None, weighted_metrics=None, target_tensors=None, **kwargs) method of keras.engine.sequential.Sequential instance
    Configures the model for training.
    
    # Arguments
        optimizer: String (name of optimizer) or optimizer instance.
            See [optimizers](/optimizers).
        loss: String (name of objective function) or objective function or
            `Loss` instance. See [losses](/losses).
            If the model has multiple outputs, you can use a different loss
            on each output by passing a dictionary or a list of losses.
            The loss value that will be minimized by the model
            will then be the sum of all individual losses.
        metrics: List of metrics to be evaluated by the model
            during training and testing. Typically you will use
            `metrics=['accuracy']`. To specify diffe

In [29]:
X_train_Glove

array([[   0,    0,    0, ...,    0,   18,    7],
       [   0,    0,    0, ..., 1227, 1228, 2872],
       [   0,    0,    0, ...,    7, 2873, 3946],
       ...,
       [   0,    0,    0, ...,  201,   38,    7],
       [   0,    0,    0, ...,    0,   18,    7],
       [   0,    0,    0, ...,   20,  290, 1893]], dtype=int32)

In [22]:
X_train_Glove,X_test_Glove, word_index,embeddings_index = loadData_Tokenizer(train_x,test_x)


model_CNN = Build_Model_CNN_Text(word_index,embeddings_index, 20)


model_CNN.summary()


Found 7623 unique tokens.
(1756, 500)
Total 400000 word vectors.
Filter   5
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 500)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 500, 100)     762400      input_1[0][0]                    
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 500, 10, 10)  0           embedding_1[0][0]                
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 500, 10, 128) 5248        reshape_1[0][0]                  
________________

In [32]:
model_CNN.fit(X_train_Glove, train_y,
                              validation_data=(X_test_Glove, test_y),
                              epochs=4,
                              batch_size=16,
                              verbose=2)



Train on 1404 samples, validate on 352 samples
Epoch 1/4
 - 283s - loss: 0.0022 - accuracy: 0.9993 - val_loss: 2.3047 - val_accuracy: 0.8097
Epoch 2/4
 - 275s - loss: 0.0818 - accuracy: 0.9815 - val_loss: 1.4996 - val_accuracy: 0.8494
Epoch 3/4
 - 278s - loss: 0.0238 - accuracy: 0.9922 - val_loss: 2.8516 - val_accuracy: 0.8636
Epoch 4/4
 - 295s - loss: 0.0043 - accuracy: 0.9986 - val_loss: 2.4735 - val_accuracy: 0.8409


<keras.callbacks.callbacks.History at 0x7f54bc3d1ef0>

In [33]:
predicted = model_CNN.predict(X_test_Glove)

predicted = np.argmax(predicted, axis=1)


print(metrics.classification_report(test_y, predicted))


              precision    recall  f1-score   support

           0       0.93      0.98      0.95       127
           1       0.88      0.84      0.86       191
           2       0.33      0.35      0.34        34

    accuracy                           0.84       352
   macro avg       0.71      0.72      0.72       352
weighted avg       0.84      0.84      0.84       352



In [310]:
Counter(labels)

Counter({-1.0: 1106, 0.0: 206, 1.0: 114})