In [2]:
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
import re
#from bs4 import BeautifulSoup
import sys
import os
os.environ['KERAS_BACKEND']='theano'
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional
from keras.models import Model
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
plt.switch_backend('agg')
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers

Using Theano backend.


In [3]:
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

In [4]:
tweets = pd.read_excel("text_classification_dataset.xlsx")

In [5]:
macronum=sorted(set(tweets['type']))
macro_to_id = dict((note, number) for number, note in enumerate(macronum))

def fun(i):
    return macro_to_id[i]

tweets['type']=tweets['type'].apply(fun)

In [6]:
tweets['type'].unique()

array([3, 0, 1, 2])

In [7]:
labels = []
for idx in tweets['type']:
    labels.append(idx)

In [8]:
texts = []
for tweet in tweets['text']:
    tweet = tweet.lower() # convert text to lower-case
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet) # remove URLs
    tweet = re.sub('@[^\s]+', 'AT_USER', tweet) # remove usernames
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet) 
    print (tweet)
    texts.append(tweet)

AT_USER AT_USER never knew having 1 or 2 followers had anything to do with reality...malinga has never been s… URL
myca magical moments:

september, 2011: sham chotoo of the bowie boys and girls club joins maryland youth cricket a… URL
the current state of last year's AT_USER finalists - 
AT_USER - p10 - w9 l1
AT_USER - p10 - w1 l9 

😲🙃😵😵😵 cricket bbl09
AT_USER why did you bring a cricket...
babar azam only pakistani included in the icc odi team of the year. if we had more test like other nations, babar s… URL
rt AT_USER indvaus 

rishabh pant was ruled out of india’s second odi against australia as the young wicket-keeper batsman has no…
rt AT_USER AT_USER &amp; idaho company orchestra provisions team up on new fry combo with "cricket" seasonings. URL
rt AT_USER i'm surprised a lot of people haven't seen the greatest documentary ever made about cricket. please undo this and watch '…
rt AT_USER the icc odi cricketer of the year award goes to none other than the hitman for his stupendou

In [8]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index

print('Number of Unique Tokens',len(word_index))

('Number of Unique Tokens', 5817)


In [9]:
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of Data Tensor:', data.shape)
print('Shape of Label Tensor:', labels.shape)

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

('Shape of Data Tensor:', (1162, 1000))
('Shape of Label Tensor:', (1162, 4))


In [10]:
embeddings_index = {}
f = open('glove.6B.100d.txt',encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors in Glove 6B 100d.' % len(embeddings_index))

TypeError: 'encoding' is an invalid keyword argument for this function

In [11]:
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [12]:
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

In [16]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
l_lstm = Bidirectional(LSTM(100))(embedded_sequences)
preds = Dense(len(macronum), activation='softmax')(l_lstm)
model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print("Bidirectional LSTM")
model.summary()

Bidirectional LSTM
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 100)         581800    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200)               160800    
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 804       
Total params: 743,404
Trainable params: 743,404
Non-trainable params: 0
_________________________________________________________________


In [17]:
cp=ModelCheckpoint('model_rnn.hdf5',monitor='val_acc',verbose=1,save_best_only=True)
history=model.fit(x_train, y_train, validation_data=(x_val, y_val),epochs=3, batch_size=2,callbacks=[cp])

Train on 930 samples, validate on 232 samples
Epoch 1/3

Epoch 00001: val_acc improved from -inf to 0.23276, saving model to model_rnn.hdf5
Epoch 2/3

Epoch 00002: val_acc improved from 0.23276 to 0.34914, saving model to model_rnn.hdf5
Epoch 3/3

Epoch 00003: val_acc improved from 0.34914 to 0.52586, saving model to model_rnn.hdf5
