In [None]:
import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import train_test_split
import scipy.sparse as sp

plt.style.use('fivethirtyeight')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'


def import_tweets(filename, header = None):
    #import data from csv file via pandas library
    tweet_dataset = pd.read_csv(filename,header = header, encoding='Latin-1', low_memory=False, index_col=False)
    #the column names are based on sentiment140 dataset provided on kaggle
    tweet_dataset.columns = ['target','id','date','flag','user','text']
    #delete 3 columns: flags,id,user, as they are not required for analysi
    for i in ['flag','id','user','date']: del tweet_dataset[i] # or tweet_dataset = tweet_dataset.drop(["id","user","date","user"], axis = 1)
    #in sentiment140 dataset, positive = 4, negative = 0; So we change positive to 1
    tweet_dataset.target = tweet_dataset.target.replace(4,1)
    return tweet_dataset

def preprocess_tweet(tweet):
	#Preprocess the text in a single tweet
	#arguments: tweet = a single tweet in form of string
	#convert the tweet to lower case

	str(tweet)
	tweet.lower()
	#convert all urls to sting "URL "
	tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
	#convert all @username to "AT_USER "
	tweet = re.sub('@[^\s]+','AT_USER', tweet)
	#correct all multiple white spaces to a single white space
	tweet = re.sub('[\s]+', ' ', tweet)
	#convert "#topic" to just "topic"
	tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
	return tweet




In [3]:
my_df = import_tweets("training_1600000_processed_noemoticon.csv")
print("File read")

my_df['text'] = my_df['text'].apply(preprocess_tweet)
print("Preprocessing done")

# csv = 'clean_tweet.csv'
# my_df = pd.read_csv(csv,index_col=0)
my_df.head()

File read


Preprocessing done


In [4]:
my_df.dropna(inplace=True)
my_df.reset_index(drop=True,inplace=True)
my_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 2 columns):
target    1600000 non-null int64
text      1600000 non-null object
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


In [5]:
x = my_df.text
y = my_df.target

In [14]:
SEED = 2000
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(x, y, test_size=.1, random_state=SEED)
x_validation, x_test, y_validation, y_test = train_test_split(x_validation_and_test, y_validation_and_test, test_size=.5, random_state=SEED)

In [15]:
print( "Train set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_train),
                                                                             (len(x_train[y_train == 0]) / (len(x_train)*1.))*100,
                                                                            (len(x_train[y_train == 1]) / (len(x_train)*1.))*100))
print ("Validation set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_validation),
                                                                             (len(x_validation[y_validation == 0]) / (len(x_validation)*1.))*100,
                                                                            (len(x_validation[y_validation == 1]) / (len(x_validation)*1.))*100))
print ("Test set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_test),
                                                                             (len(x_test[y_test == 0]) / (len(x_test)*1.))*100,
                                                                            (len(x_test[y_test == 1]) / (len(x_test)*1.))*100))

Train set has total 1440000 entries with 49.97% negative, 50.03% positive
Validation set has total 80000 entries with 50.27% negative, 49.73% positive
Test set has total 80000 entries with 50.18% negative, 49.82% positive


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
tvec1 = TfidfVectorizer(max_features=100000,ngram_range=(1, 3))
# tvec1 = TfidfVectorizer(sublinear_tf=True,stop_words="english")  # we need to give proper stopwords list for better performance

# tvec1.fit(x_train)
x_train_tfidf = tvec1.fit_transform(x_train)
print("Done")

Done


In [18]:
print("Done1")
x_validation_tfidf = tvec1.transform(x_validation).toarray()
print("Done2")

Done1


MemoryError: 

In [16]:
%%time
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(x_train_tfidf, y_train)

CPU times: user 37.6 s, sys: 1.34 s, total: 38.9 s
Wall time: 39.4 s


In [17]:
clf.score(x_validation_tfidf, y_validation)

0.82919799498746871

In [26]:
clf.score(x_train_tfidf, y_train)

0.84193317810009349

I will first start by loading required dependencies. In order to run Keras with TensorFlow backend, you need to install both TensorFlow and Keras.

In [33]:
seed = 7
np.random.seed(seed)
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

The structure of below NN model has 100,000 nodes in the input layer, then 64 nodes in a hidden layer with Relu activation function applied, then finally one output layer with sigmoid activation function applied. There are different types of optimizing techniques for neural networks, and different loss function you can define with the model. Below model uses ADAM optimizing, and binary cross entropy loss.

ADAM is an optimization algorithm for updating the parameters and minimizing the cost of the neural network, which is proved to be very effective. It combines two methods of optimization: RMSProp, Momentum. Again, I will focus on sharing the result I got from my implementation, but if you want to understand properly how ADAM works, I strongly recommend the "deeplearning.ai" course by Andrew Ng. He explains the complex concept of neural network in a very intuitive way.

Before I feed the data and train the model, I need to deal with one more thing. Keras NN model cannot handle sparse matrix directly. The data has to be dense array or matrix, but transforming the whole training data Tfidf vectors of 1.5 million to dense array won't fit into my RAM. So I had to define a function, which generates iterable generator object so that it can be fed to NN model. Note that the output should be a generator class object rather than arrays, this can be achieved by using "yield" instead of "return". 

In [18]:
def batch_generator(X_data, y_data, batch_size):
    samples_per_epoch = X_data.shape[0]
    number_of_batches = samples_per_epoch/batch_size
    counter=0
    index = np.arange(np.shape(y_data)[0])
    while 1:
        index_batch = index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X_data[index_batch,:].toarray()
        y_batch = y_data[y_data.index[index_batch]]
        counter += 1
        yield X_batch,y_batch
        if (counter > number_of_batches):
            counter=0

In [20]:
%%time
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=100000))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit_generator(generator=batch_generator(x_train_tfidf, y_train, 32),
                    epochs=5, validation_data=(x_validation_tfidf, y_validation),
                    steps_per_epoch=x_train_tfidf.shape[0]/32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 13h 52min 45s, sys: 3h 27min 53s, total: 17h 20min 38s
Wall time: 6h 59min 24s


It looks like the model had the best validation accuracy after 2 epochs, and after that, it fails to generalize so validation accuracy slowly decreases, while training accuracy increases. But if you remember the result I got from logistic regression (train accuracy: 84.19%, validation accuracy: 82.91%), you can see that the above neural network failed to outperform logistic regression in terms of validation.

Let's see if normalizing inputs have any effect on the performance.

In [42]:
from sklearn.preprocessing import Normalizer
norm = Normalizer().fit(x_train_tfidf)
x_train_tfidf_norm = norm.transform(x_train_tfidf)
x_validation_tfidf_norm = norm.transform(x_validation_tfidf)

In [46]:
%%time
model_n = Sequential()
model_n.add(Dense(64, activation='relu', input_dim=100000))
model_n.add(Dense(1, activation='sigmoid'))
model_n.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model_n.fit_generator(generator=batch_generator(x_train_tfidf_norm, y_train, 32),
                    epochs=5, validation_data=(x_validation_tfidf_norm, y_validation),
                    steps_per_epoch=x_train_tfidf_norm.shape[0]/32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 14h 4min 6s, sys: 3h 42min 59s, total: 17h 47min 6s
Wall time: 7h 17min 31s


By the look of the result, normalizing seems to have almost no effect on the performance. And it is at this point I realized that Tfidf is already normalized by the way it is calculated. TF (Term Frequency) in Tfidf is not absolute frequency but relative frequency, and by multiplying IDF (Inverse Document Frequency) to the relative term frequency value, it further normalizes the value in a cross-document manner.

If the problem of the model is a poor generalization, then there is another thing I can add to the model. Even though the neural network is a very powerful model, sometimes overfitting to the training data can be a problem. Dropout is a technique that addresses this problem. If you are familiar with the concept of ensemble model in machine learning, dropout can also be seen in the same vein. According to the research paper "Improving neural networks by preventing
co-adaptation of feature detectors" by Hinton et al. (2012), "A good way to reduce the error on the test set is to
average the predictions produced by a very large number of different networks. The standard way to do this is to train many separate networks and then to apply each of these networks to the test data, but this is computationally expensive during both training and testing. Random dropout makes it possible to train a huge number of different networks in a reasonable time." https://arxiv.org/pdf/1207.0580.pdf

Dropout is simulating as if we train many different networks and averaging them by randomly omitting hidden nodes with a certain probability throughout the training process. With Keras, this can be easily implemented just by adding one line to your model architecture. Let's see how the model performance changes with 20% dropout rate. (*I will gather all the results and present them with a table at the end.)

In [21]:
model1 = Sequential()
model1.add(Dense(64, activation='relu', input_dim=100000))
model1.add(Dropout(0.2))
model1.add(Dense(1, activation='sigmoid'))
model1.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model1.fit_generator(generator=batch_generator(x_train_tfidf, y_train, 32),
                    epochs=5, validation_data=(x_validation_tfidf, y_validation),
                    steps_per_epoch=x_train_tfidf.shape[0]/32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x17eb65290>

Through 5 epochs, the train set accuracy didn't get as high as the model without dropout, but validation accuracy didn't drop as low as the previous model. Even though the dropout added some generalization to the model, but the validation accuracy is still underperforming compared to logistic regression result.

There is another method I can try to prevent overfitting. By presenting the data in the same order for every epoch, there's a possibility that the model learns the parameters which also includes the noise of the training data, which eventually leads to overfitting. This can be improved by shuffling the order of the data we feed the model. Below I added shuffling to the batch generator function and tried with the same model structure and compared the result.

In [23]:
def batch_generator_shuffle(X_data, y_data, batch_size):
    samples_per_epoch = X_data.shape[0]
    number_of_batches = samples_per_epoch/batch_size
    counter=0
    index = np.arange(np.shape(y_data)[0])
    np.random.shuffle(index)
    while 1:
        index_batch = index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X_data[index_batch,:].toarray()
        y_batch = y_data[y_data.index[index_batch]]
        counter += 1
        yield X_batch,y_batch
        if (counter > number_of_batches):
            np.random.shuffle(index)
            counter=0

In [24]:
%%time
model_s = Sequential()
model_s.add(Dense(64, activation='relu', input_dim=100000))
model_s.add(Dense(1, activation='sigmoid'))
model_s.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model_s.fit_generator(generator=batch_generator_shuffle(x_train_tfidf, y_train, 32),
                    epochs=5, validation_data=(x_validation_tfidf, y_validation),
                    steps_per_epoch=x_train_tfidf.shape[0]/32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 14h 14min 27s, sys: 3h 54min 36s, total: 18h 9min 4s
Wall time: 7h 26min 58s


The same model with non-shuffled training data had training accuracy of 87.36%, and validation accuracy of 79.78%. With shuffling, training accuracy decreased to 84.80% but the validation accuracy after 5 epochs has increased to 82.61%. It seems like the shuffling did improve the model's performance on the validation set. And another thing I noticed is that with or without shuffling also for both with or without dropout, validation accuracy tends to peak after 2 epochs, and gradually decrease afterwards.

Below I tried the same model with 20% dropout with shuffled data, this time only 2 epochs.

In [25]:
%%time
model_s_1 = Sequential()
model_s_1.add(Dense(64, activation='relu', input_dim=100000))
model_s_1.add(Dropout(0.2))
model_s_1.add(Dense(1, activation='sigmoid'))
model_s_1.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model_s_1.fit_generator(generator=batch_generator_shuffle(x_train_tfidf, y_train, 32),
                    epochs=5, validation_data=(x_validation_tfidf, y_validation),
                    steps_per_epoch=x_train_tfidf.shape[0]/32)

Epoch 1/2
Epoch 2/2
CPU times: user 5h 36min 14s, sys: 1h 33min 7s, total: 7h 9min 22s
Wall time: 2h 54min 55s


As same as the non-shuffled data, both the training accuracy and validation accuracy slightly dropped.

As I was going through the "deeplearning.ai" course by Andrew Ng, he states that the first thing he would try to improve a neural network model is tweaking the learning rate. I decided to follow his advice and try different learning rates with the model. Please note that except for the learning rate, the parameter for 'beta_1', 'beta_2', and 'epsilon' are set to the default values presented by the original paper "ADAM: A Method for Stochastic Optimization" by Kingma and Ba (2015). https://arxiv.org/pdf/1412.6980.pdf

In [36]:
%%time
import keras
custom_adam = keras.optimizers.Adam(lr=0.005, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
model_testing_2 = Sequential()
model_testing_2.add(Dense(64, activation='relu', input_dim=100000))
model_testing_2.add(Dense(1, activation='sigmoid'))
model_testing_2.compile(optimizer=custom_adam,
              loss='binary_crossentropy',
              metrics=['accuracy'])

model_testing_2.fit_generator(generator=batch_generator_shuffle(x_train_tfidf, y_train, 32),
                    epochs=2, validation_data=(x_validation_tfidf, y_validation),
                    steps_per_epoch=x_train_tfidf.shape[0]/32)

Epoch 1/2
Epoch 2/2
CPU times: user 5h 39min 43s, sys: 1h 35min 45s, total: 7h 15min 29s
Wall time: 3h 52s


In [37]:
%%time
custom_adam = keras.optimizers.Adam(lr=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
model_testing_3 = Sequential()
model_testing_3.add(Dense(64, activation='relu', input_dim=100000))
model_testing_3.add(Dense(1, activation='sigmoid'))
model_testing_3.compile(optimizer=custom_adam,
              loss='binary_crossentropy',
              metrics=['accuracy'])

model_testing_3.fit_generator(generator=batch_generator_shuffle(x_train_tfidf, y_train, 32),
                    epochs=2, validation_data=(x_validation_tfidf, y_validation),
                    steps_per_epoch=x_train_tfidf.shape[0]/32)

Epoch 1/2
Epoch 2/2
CPU times: user 5h 37min 39s, sys: 1h 34min 58s, total: 7h 12min 37s
Wall time: 3h 4s


In [38]:
%%time
custom_adam = keras.optimizers.Adam(lr=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
model_testing_4 = Sequential()
model_testing_4.add(Dense(64, activation='relu', input_dim=100000))
model_testing_4.add(Dense(1, activation='sigmoid'))
model_testing_4.compile(optimizer=custom_adam,
              loss='binary_crossentropy',
              metrics=['accuracy'])

model_testing_4.fit_generator(generator=batch_generator_shuffle(x_train_tfidf, y_train, 32),
                    epochs=2, validation_data=(x_validation_tfidf, y_validation),
                    steps_per_epoch=x_train_tfidf.shape[0]/32)

Epoch 1/2
Epoch 2/2
CPU times: user 5h 33min 31s, sys: 1h 35min 4s, total: 7h 8min 35s
Wall time: 2h 59min 33s


In [40]:
%%time
custom_adam = keras.optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
model_testing_5 = Sequential()
model_testing_5.add(Dense(64, activation='relu', input_dim=100000))
model_testing_5.add(Dense(1, activation='sigmoid'))
model_testing_5.compile(optimizer=custom_adam,
              loss='binary_crossentropy',
              metrics=['accuracy'])

model_testing_5.fit_generator(generator=batch_generator_shuffle(x_train_tfidf, y_train, 32),
                    epochs=5, validation_data=(x_validation_tfidf, y_validation),
                    steps_per_epoch=x_train_tfidf.shape[0]/32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 13h 53min 56s, sys: 3h 49min 39s, total: 17h 43min 36s
Wall time: 7h 8min 10s


Having tried four different learning rates (0.0005, 0.005, 0.01, 0.1), none of them outperformed the default learning rate of 0.001.

Maybe I can try to increase the number of hidden nodes, and see how it affects the performance. Below model has 128 nodes in the hidden layer.

In [27]:
%%time
model_s_2 = Sequential()
model_s_2.add(Dense(128, activation='relu', input_dim=100000))
model_s_2.add(Dense(1, activation='sigmoid'))
model_s_2.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model_s_2.fit_generator(generator=batch_generator_shuffle(x_train_tfidf, y_train, 32),
                    epochs=2, validation_data=(x_validation_tfidf, y_validation),
                    steps_per_epoch=x_train_tfidf.shape[0]/32)

Epoch 1/2
Epoch 2/2
CPU times: user 12h 23min 45s, sys: 2h 56min 41s, total: 15h 20min 27s
Wall time: 5h 9min 30s


With 128 hidden nodes, validation accuracy got close to the performance of logistic regression. I could experiment further with increasing the number of hidden layers, but for the above 2 epochs to run, it took 5 hours. Considering that logistic regression took less than a minute to fit, even if the neural network can be improved further, this doesn't look like an efficient way.

Below is a table with all the results I got from trying different models above. Please note that I have compared performance at 2 epochs since some of the models only ran for 2 epochs.

| model | learning rate | input layer (nodes) | data shuffling | hidden layer (nodes) | dropout | output layer (nodes) | training accuracy | validation accuracy |
|-----|------|---|-----|----|----|----|----|
| ANN_1 | 0.001 | 1 (100,000)  | X | 1 (64) relu  |  X  | 1 (1) sigmoid   | 83.52% | 82.54% |
| ANN_2 | 0.001 | 1 (100,000)  | X | 1 (64) relu  |  0.2  | 1 (1) sigmoid   | 83.35% | 82.56% |
| ANN_3 | 0.001 | 1 (100,000)  | O | 1 (64) relu  |  X  | 1 (1) sigmoid   | 83.52% | 82.76% |
| ANN_4 | 0.001 | 1 (100,000)  | O | 1 (64) relu  |  0.2  | 1 (1) sigmoid   | 83.37% | 82.64%  |
| ANN_5 | 0.0005 | 1 (100,000)  | O | 1 (64) relu  |  X  | 1 (1) sigmoid   | 83.52% | 82.61%  |
| ANN_6 | 0.005 | 1 (100,000)  | O | 1 (64) relu  |  X  | 1 (1) sigmoid   | 83.52% | 82.59%  |
| ANN_7 | 0.01 | 1 (100,000)  | O | 1 (64) relu  |  X  | 1 (1) sigmoid   | 83.43% | 82.61%  |
| ANN_8 | 0.1 | 1 (100,000)  | O | 1 (64) relu  |  X  | 1 (1) sigmoid   | 77.48% | 72.94%  |
| ANN_9 | 0.001 | 1 (100,000)  | O | 1 (128) relu  |  X  | 1 (1) sigmoid   | 83.54% | 82.84%  |

Except for ANN_8 (with a learning rate of 0.1), the model performance only varies in the decimal place, and the best model is ANN_9 (with one hidden layer of 128 nodes) at 82.84% validation accuracy.

As a result, in this particular case, neural network models failed to outperform logistic regression. This might be due to the high dimensionality and sparse characteristics of the textual data. I have also found a research paper, which compared model performance with high dimension data. According to "An Empirical Evaluation of Supervised Learning in High Dimensions" by Caruana et al.(2008), logistic regression showed as good performance as neural networks, in some cases outperforms neural networks. http://icml2008.cs.helsinki.fi/papers/632.pdf

Through all the trials above I learned some valuable lessons. Implementing and tuning neural networks is a highly iterative process and includes many trials and errors. Even though the neural network is a more complex version of logistic regression, it doesn't always outperform logistic regression, and sometimes with high dimension sparse data, logistic regression can deliver good performance with much less computation time than neural network.

In the next post, I will implement a neural network with Doc2Vec vectors I got from the previous post. Hopefully, with dense vectors such as Doc2Vec, a neural network might show some boost. Fingers crossed.