In [46]:
import itertools
import os

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))
from bs4 import BeautifulSoup
from IPython.core.interactiveshell import InteractiveShell
import plotly.figure_factory as ff

In [47]:
file = "data/train_tweets.txt"
temp = []
with open(file, 'r') as data:
    for line in data:
        row = []
        line = line.replace('\t'," ")
        elem = line.strip().split(" ")
        row.append(elem[0])
        row.append(" ".join(elem[1:]))
        temp.append(row) 

In [48]:
from nltk.tokenize import TweetTokenizer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))
from nltk.stem import WordNetLemmatizer 
  
lemmatizer = WordNetLemmatizer()
def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma

def text_process(text): 
    tk = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case = False) 
    text = tk.tokenize(text)
    
    text = [word for word in text if word not in STOPWORDS]
    text = ' '.join(lemmatize(word) for word in text)
    
    return text

In [49]:
def preprocess(tw):
    tw['Tweet'].replace("(@[A-Za-z0-9]+)","",regex=True,inplace=True)
    tw['Tweet'].replace("(RT|rt|FAV|fav|VIA|via)","",regex=True,inplace=True)
    tw['Tweet'] = tw['Tweet'].str.lower()
    tw["Tweet"].replace("(\\r|)\\n$", '', regex=True,inplace=True)
    tw["Tweet"].replace(r'http.?://[^\s]+[\s]?','', regex=True,inplace=True)
    tw["Tweet"].replace(r"[^a-zA-Z#]", " ", regex=True,inplace=True)
    tw['Tweet'] = tw['Tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
    return tw

In [50]:
df = pd.DataFrame(temp,columns = ["User","Tweet"])
df = preprocess(df)
df['Tweet'] = df['Tweet'].apply(text_process)

In [51]:
MAX_NB_WORDS = 70000
MAX_SEQUENCE_LENGTH = 250
EMBEDDING_DIM = 300

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['Tweet'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 103423 unique tokens.


In [52]:
X = tokenizer.texts_to_sequences(df['Tweet'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (328932, 250)


In [53]:
Y = pd.get_dummies(df['User']).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (328932, 9297)


In [55]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.25, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(246699, 250) (246699, 9297)
(82233, 250) (82233, 9297)


In [66]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.1))
model.add(LSTM(100, dropout=0.5, recurrent_dropout=0.5))
model.add(Dense(9297, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_26"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_26 (Embedding)     (None, 250, 300)          21000000  
_________________________________________________________________
spatial_dropout1d_26 (Spatia (None, 250, 300)          0         
_________________________________________________________________
lstm_26 (LSTM)               (None, 100)               160400    
_________________________________________________________________
dense_26 (Dense)             (None, 9297)              938997    
Total params: 22,099,397
Trainable params: 22,099,397
Non-trainable params: 0
_________________________________________________________________
None


In [67]:
epochs = 30
batch_size = 128

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Train on 222029 samples, validate on 24670 samples
Epoch 1/30
  2432/222029 [..............................] - ETA: 33:53 - loss: 9.1365 - acc: 4.1118e-04

KeyboardInterrupt: 

In [None]:
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

In [None]:
plt.title('Loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show();

In [None]:
plt.title('Accuracy')
plt.plot(history.history['acc'], label='train')
plt.plot(history.history['val_acc'], label='test')
plt.legend()
plt.show();