# SSWE
Sentiment Specific Word Embedding for twitter sentiment classification

In [1]:
# Basic packages
import pandas as pd 
import numpy as np
import re
import collections
import matplotlib.pyplot as plt
from pathlib import Path

# Packages for data preparation
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Packages for modeling
from keras import models
from keras import layers
from keras import regularizers
from keras.layers import Dense

Using TensorFlow backend.


### Loading the data

 .csv file

In [2]:
tweets_dir = "twitter-airline-sentiment/"
df = pd.read_csv('Tweets.csv')

In [3]:
df.head(2)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)


###  Data processing
As per the architecture mentioned in the paper, Only two outputs have been shown. Therefore, my aim is to dropp the 'Neutral' sentiment from the data frame and only differentiated between positive and negative tweets. After that, I am filtering the tweets so only valid texts and words remain.

In [4]:
df = pd.read_csv('Tweets.csv', index_col=1)

In [5]:
data=df.drop('neutral')

In [6]:
data.head(2)

Unnamed: 0_level_0,tweet_id,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
airline_sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
positive,570301130888122368,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
negative,570301031407624196,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)


In [7]:
data.head(2)

Unnamed: 0_level_0,tweet_id,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
airline_sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
positive,570301130888122368,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
negative,570301031407624196,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)


In [8]:
df_ri = data.reset_index()

In [9]:
df_ri

Unnamed: 0,airline_sentiment,tweet_id,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,positive,570301130888122368,0.3486,,0.0000,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
1,negative,570301031407624196,1.0000,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
2,negative,570300817074462722,1.0000,Can't Tell,1.0000,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)
3,negative,570300767074181121,1.0000,Can't Tell,0.6842,Virgin America,,jnardino,,0,@VirginAmerica seriously would pay $30 a fligh...,,2015-02-24 11:14:33 -0800,,Pacific Time (US & Canada)
4,positive,570300616901320704,0.6745,,0.0000,Virgin America,,cjmcginnis,,0,"@VirginAmerica yes, nearly every time I fly VX...",,2015-02-24 11:13:57 -0800,San Francisco CA,Pacific Time (US & Canada)
5,positive,570299953286942721,0.6559,,,Virgin America,,dhepburn,,0,"@virginamerica Well, I didn't…but NOW I DO! :-D",,2015-02-24 11:11:19 -0800,San Diego,Pacific Time (US & Canada)
6,positive,570295459631263746,1.0000,,,Virgin America,,YupitsTate,,0,"@VirginAmerica it was amazing, and arrived an ...",,2015-02-24 10:53:27 -0800,Los Angeles,Eastern Time (US & Canada)
7,positive,570289724453216256,1.0000,,,Virgin America,,HyperCamiLax,,0,@VirginAmerica I &lt;3 pretty graphics. so muc...,,2015-02-24 10:30:40 -0800,NYC,America/New_York
8,positive,570289584061480960,1.0000,,,Virgin America,,HyperCamiLax,,0,@VirginAmerica This is such a great deal! Alre...,,2015-02-24 10:30:06 -0800,NYC,America/New_York
9,positive,570287408438120448,0.6451,,,Virgin America,,mollanderson,,0,@VirginAmerica @virginmedia I'm flying your #f...,,2015-02-24 10:21:28 -0800,,Eastern Time (US & Canada)


#### Keeping only the neccessary columns

In [10]:
df = df_ri[['text','airline_sentiment']]

#### Tokenization
Word Tokenization is the most commonly used tokenization algorothm. It splits a piece of text into individual words based on a certain delimiter. Depending upon delimiters, different word-level tokens are formed.

In [11]:
from keras.preprocessing.text import Tokenizer

In [12]:
X_train, X_test, y_train, y_test = train_test_split(df.text, df.airline_sentiment, test_size=0.2, random_state=42)
print('Train data number of samples:', X_train.shape[0])
print('Test data number of samples:', X_test.shape[0])
assert X_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == y_test.shape[0]

Train data number of samples: 9232
Test data number of samples: 2309


####  Numword: 
the maximum number of words to keep, based on the word frequency

In [13]:
nword = 10000 

In [14]:
tk = Tokenizer(num_words=nword,filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=" ")
tk.fit_on_texts(X_train)
X_train_seq = tk.texts_to_sequences(X_train)
X_test_seq = tk.texts_to_sequences(X_test)

##### Make all the documents in same length 
For that purpose, padding method have been used

In [15]:
maxlen = 30

In [16]:
X_train_seqt = pad_sequences(X_train_seq, maxlen=maxlen)
X_test_seqt = pad_sequences(X_test_seq, maxlen=maxlen)

In [17]:
X_test_seqt
X_test_seqt

array([[   0,    0,    0, ...,  988,    9,   17],
       [   0,    0,    0, ...,   51,   47,   49],
       [   0,    0,    0, ...,    1, 6449,  528],
       ...,
       [   0,    0,    0, ...,   98,    2,  313],
       [   0,    0,    0, ...,   90, 1541,   90],
       [   0,    0,    0, ...,  199,  105,   29]])

####  Label Encoding
Label encoding is simply converting the label to machine readable form, such as 'the' to 3

In [18]:
labelEn = LabelEncoder()
y_train_labelEn = labelEn.fit_transform(y_train)
y_test_labelEn = labelEn.transform(y_test)
y_train_Cat = to_categorical(y_train_labelEn)
y_test_Cat = to_categorical(y_test_labelEn)

In [19]:
print (y_test_Cat)

[[1. 0.]
 [0. 1.]
 [1. 0.]
 ...
 [1. 0.]
 [0. 1.]
 [1. 0.]]


####  Data Spitting
Split the dataset for training and testing

In [20]:
X_train_final, X_valid_final, y_train_final, y_valid_final = train_test_split(X_train_seqt, y_train_Cat, test_size=0.1, random_state=37)

assert X_valid_final.shape[0] == y_valid_final.shape[0]
assert X_train_final.shape[0] == y_train_final.shape[0]

print('Shape of validation set:',X_valid_final.shape)

Shape of validation set: (924, 30)


### Modelling

In [21]:
model = models.Sequential()
model.add(layers.Embedding(nword, 8, input_length=maxlen))
model.add(layers.Flatten())
#model.add(LSTM(units=50, name='sentiment_analysis_LSTM'))
model.add(layers.Dense(30, activation='linear'))
model.add(layers.Dense(20, activation='tanh'))
model.add(layers.Dense(10, activation='linear'))
model.add(layers.Dense(2, activation='softmax'))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 30, 8)             80000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 240)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 30)                7230      
_________________________________________________________________
dense_2 (Dense)              (None, 20)                620       
_________________________________________________________________
dense_3 (Dense)              (None, 10)                210       
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 22        
Total params: 88,082
Trainable params: 88,082
Non-trainable params: 0
__________________________________________________

###  Model Compilation

In [22]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy'
                  , metrics=['accuracy'])

### Taining and Validation

In [23]:
print('Training')
batch_size = 24
model.fit(X_train_final, y_train_final, epochs = 10, batch_size=batch_size, verbose = 2, validation_data=(X_valid_final,y_valid_final))

Training

Train on 8308 samples, validate on 924 samples
Epoch 1/10
 - 1s - loss: 0.3665 - accuracy: 0.8473 - val_loss: 0.2454 - val_accuracy: 0.9048
Epoch 2/10
 - 1s - loss: 0.1923 - accuracy: 0.9251 - val_loss: 0.2126 - val_accuracy: 0.9221
Epoch 3/10
 - 1s - loss: 0.1346 - accuracy: 0.9492 - val_loss: 0.2149 - val_accuracy: 0.9199
Epoch 4/10
 - 1s - loss: 0.0962 - accuracy: 0.9647 - val_loss: 0.2419 - val_accuracy: 0.9134
Epoch 5/10
 - 1s - loss: 0.0673 - accuracy: 0.9773 - val_loss: 0.2953 - val_accuracy: 0.8939
Epoch 6/10
 - 1s - loss: 0.0443 - accuracy: 0.9853 - val_loss: 0.3592 - val_accuracy: 0.8950
Epoch 7/10
 - 1s - loss: 0.0302 - accuracy: 0.9905 - val_loss: 0.4390 - val_accuracy: 0.8929
Epoch 8/10
 - 1s - loss: 0.0201 - accuracy: 0.9934 - val_loss: 0.5647 - val_accuracy: 0.8907
Epoch 9/10
 - 1s - loss: 0.0157 - accuracy: 0.9954 - val_loss: 0.5601 - val_accuracy: 0.8939
Epoch 10/10
 - 1s - loss: 0.0107 - accuracy: 0.9970 - val_loss: 0.7029 - val_accuracy: 0.8820


<keras.callbacks.callbacks.History at 0x1546f451a58>

###  Calculate Test score and accuracy

In [24]:
score, acc = model.evaluate(X_valid_final, y_valid_final, batch_size=batch_size)
print(f'Test Score={score}')
print(f'Test accuracy={acc}')

Test Score=0.7029267105745611
Test accuracy=0.8820346593856812


#### Conclusion 

Loss function reduces and accuracy increases in training datasets. To reduce the data validation loss of the model, hyperparamets need to improve. Decrement in Val_Loss  increses the test accuracy.