Required libaries

In [46]:
#dataframe
import pandas as pd

# Matplot
import matplotlib.pyplot as plt
%matplotlib inline

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

#nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

# Utility
import re
import numpy as np
import os
from collections import Counter
import logging
import time
import pickle
import itertools

# Set log
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [47]:
nltk.download('stopwords') #There is a libary for stopwords in Indonesian

[nltk_data] Downloading package stopwords to /Users/shaan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Preprocessing strategy 

tokenizing process = strings > token > integers > vectors 
.padding is done so each sequence are same length



## Workflow of Model
module
load the data
preprocess data
build
evaluate the model
prediction with new dataset


## Shape of model 

Use Keras API

* i = Input(Shape=(T,)) # T = length of sequence
* x = Embedding(V, D)(i) # V = Vocab Size, D = Embedding dimensionality
* x = LTSM(M)(x) # M = Hidden vector dimensionality
* x = GlobalMaxPooling1D()(x)
* X = Dense(K, activation ='sigmoid') #K = no. of output classes. Sigmoid as its binary


In [48]:
dataset_columns = ["target", "ids", "date", "flag", "user", "text"]
dataset_encoding = "ISO-8859-1"

In [49]:
df = pd.read_csv('/Users/shaan/Desktop/datasets/training.1600000.processed.noemoticon.csv', encoding = dataset_encoding, names=dataset_columns)


In [50]:
print("Dataset size:", len(df))

Dataset size: 1600000


In [51]:
df.head(5)

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [52]:
df.tail(5)

Unnamed: 0,target,ids,date,flag,user,text
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
1599999,4,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...


In [53]:
df.drop(['ids','date','flag','user'],axis = 1,inplace=True)

In [54]:
df.head()

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [55]:
df['target'].value_counts()

4    800000
0    800000
Name: target, dtype: int64

No neutral tweets 

In [56]:
#preprocess the data wth regex

def text_cleaning(text):
  text = re.sub(r'@[A-Za-z0-9]+', '', text)     # removing @mentions
  text = re.sub(r'@[A-Za-zA-Z0-9]+', '', text)  # removing @mentions 
  text = re.sub(r'@[A-Za-z]+', '', text)        # removing @mentions
  text = re.sub(r'@[-)]+', '', text)            # removing @mentions
  text = re.sub(r'#', '', text )                # removing '#' sign
  text = re.sub(r'RT[\s]+', '', text)           # removing RT
  text = re.sub(r'https?\/\/\S+', '', text)     # removing the hyper link
  text = re.sub(r'&[a-z;]+', '', text)          # removing '&gt;'

  return text

In [57]:
df['text'] = df['text'].apply(text_cleaning)

In [58]:
df.head(10)

Unnamed: 0,target,text
0,0,"http://twitpic.com/2y1zl - Awww, that's a bum..."
1,0,is upset that he can't update his Facebook by ...
2,0,I dived many times for the ball. Managed to s...
3,0,my whole body feels itchy and like its on fire
4,0,"no, it's not behaving at all. i'm mad. why am..."
5,0,not the whole crew
6,0,Need a hug
7,0,"hey long time no see! Yes.. Rains a bit ,onl..."
8,0,_K nope they didn't have it
9,0,que me muera ?


certain tweets are still not cleaned yet so lets try to clean the again with a different func

In [72]:
TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")


def preprocess(text, stem=False):
    # Remove link,user and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)

In [73]:
df.text = df.text.apply(lambda x: preprocess(x))

In [74]:
df.head(10)

Unnamed: 0,target,text
0,0,awww bummer shoulda got david carr third day
1,0,upset update facebook texting might cry result...
2,0,dived many times ball managed save 50 rest go ...
3,0,whole body feels itchy like fire
4,0,behaving mad see
5,0,whole crew
6,0,need hug
7,0,hey long time see yes rains bit bit lol fine t...
8,0,k nope
9,0,que muera


In [75]:
df.tail(10)

Unnamed: 0,target,text
1599990,4,wooooo xbox back
1599991,4,mmmm sounds absolutely perfect schedule full t...
1599992,4,recovering long weekend
1599993,4,gritboys
1599994,4,forster yeah work better waiting end wonder ti...
1599995,4,woke school best feeling ever
1599996,4,thewdb com cool hear old walt interviews
1599997,4,ready mojo makeover ask details
1599998,4,happy 38th birthday boo alll time tupac amaru ...
1599999,4,happy charitytuesday


much better, tweets have been cleaned ! 

In [76]:
X_train, X_test, y_train, y_test = train_test_split(df['text'].values, df['target'].values, test_size=0.30)

In [85]:
print("TRAIN size:", len(X_train))
print("TEST size:", len(X_test))

TRAIN size: 1120000
TEST size: 480000


In [86]:
#insert WORD2Vec Indonesia to get root words, it will help group simillar words together

In [87]:
#initiate tokenizer 
max_vocab = 20000000 #just use some big number 
tokenizer = Tokenizer(num_words=max_vocab)
tokenizer.fit_on_texts(X_train)

In [88]:
#checking word index
wordidx = tokenizer.word_index
V = len(wordidx)
print('Vocab size = ', V)

Vocab size =  232164


In [89]:
# converting tran and test sentences into sequences
train_seq = tokenizer.texts_to_sequences(X_train)
test_seq = tokenizer.texts_to_sequences(X_test)
print('Training sequence: ', train_seq[0])
print('Testing sequence: ', test_seq[0])

Training sequence:  [162, 522, 18, 713, 439, 551, 28, 164, 12797, 335, 58, 67, 3915]
Testing sequence:  [516, 517, 159, 325, 68, 132, 83, 19, 1680]


In [90]:
# padding the sequences to get equal length sequence because its conventional to use same size sequences
# padding the traing sequence
pad_train = pad_sequences(train_seq)
T = pad_train.shape[1]
print('The length of training sequence is: ', T)


The length of training sequence is:  37


In [91]:
# padding the test sequence
pad_test = pad_sequences(test_seq, maxlen=T)
print('The length of testing sequence is: ', pad_test.shape[1])


The length of testing sequence is:  37


In [None]:
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, GlobalMaxPooling1D
from tensorflow.keras.models import Model

model = Sequential()
model.add(embedding_layer)
model.add(Dropout(0.5))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.summary()

In [102]:
# building the model

from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, GlobalMaxPooling1D
from tensorflow.keras.models import Model

D = 20 
M = 15

i = Input (shape=(T, ))   
x = Embedding(V+1, D)(i)    # V+1 because the indexing of the words in vocab (V) start from 1 not 0
x = LSTM(M, return_sequences=True)(x)
x = GlobalMaxPooling1D()(x)
x = Dense(32, activation='relu')(x)
x = Dense(1, activation='sigmoid')(x)

model = Model(i,x)
model.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 37)]              0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 37, 20)            4643300   
_________________________________________________________________
lstm_4 (LSTM)                (None, 37, 15)            2160      
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 15)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 32)                512       
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 33        
Total params: 4,646,005
Trainable params: 4,646,005
Non-trainable params: 0
_________________________________________________

In [93]:
# compiling the model
model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

In [96]:
callbacks = [ ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
              EarlyStopping(monitor='val_acc', min_delta=1e-4, patience=5)]

In [101]:
print(len(pad_train))
print(len(y_train))

1120000
1120000


In [98]:
# training the model
r = model.fit(pad_train, y_train, batch_size = 1024, epochs=2,validation_split=0.1,verbose=1,callbacks=callbacks)

Epoch 1/2

IndexError: list index out of range