In [2]:
import tensorflow
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [3]:
df = pd.read_csv('data\\tweets_all.csv')

#Split into three df's based on topic
df_politics = df.loc[df['topic'].isin(["Sinn Fein", "Qanon", "Varadkar"])]
df_TV = df.loc[df['topic'].isin(["Eastenders", "Tommy Tiernan", "Eoghan McDermott"])]
df_others = df.loc[df['topic'].isin(["Pancakes", "Burren", "Daniel Kinahan", "Shamrock Rovers"])]

#Get dummy values for my sentiment scoring
y = pd.get_dummies(df['my_sentiment_for_vader']).values


In [4]:

#Taken from https://medium.datadriveninvestor.com/deep-learning-lstm-for-sentiment-analysis-in-tensorflow-with-keras-api-92e62cde7626
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tweet = df.clean_text.values
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(tweet)
vocab_size = len(tokenizer.word_index) + 1
encoded_docs = tokenizer.texts_to_sequences(tweet)
padded_sequence = pad_sequences(encoded_docs, maxlen=200)




#Again taken from medium website
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense, Dropout
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.layers import Embedding

embedding_vector_length = 32
model = Sequential() 
model.add(Embedding(vocab_size, embedding_vector_length, input_length=200) )
model.add(SpatialDropout1D(0.25))
model.add(LSTM(50, dropout=0.5, recurrent_dropout=0.5))
model.add(Dropout(0.2))
model.add(Dense(3, activation='softmax')) 
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])  
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 32)           92128     
                                                                 
 spatial_dropout1d (SpatialD  (None, 200, 32)          0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 50)                16600     
                                                                 
 dropout (Dropout)           (None, 50)                0         
                                                                 
 dense (Dense)               (None, 3)                 153       
                                                                 
Total params: 108,881
Trainable params: 108,881
Non-trainable params: 0
__________________________________________________

In [5]:
history = model.fit(padded_sequence, y, validation_split=0.2, epochs=10, verbose=1, batch_size=32)

hist = pd.DataFrame(history.history)

hist

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Unnamed: 0,loss,accuracy,val_loss,val_accuracy
0,0.68665,0.423077,0.674559,0.435294
1,0.651942,0.461538,0.63572,0.435294
2,0.620588,0.455621,0.614684,0.435294
3,0.61439,0.464497,0.606239,0.435294
4,0.603204,0.464497,0.601829,0.435294
5,0.590001,0.470414,0.607102,0.447059
6,0.564511,0.538462,0.624019,0.470588
7,0.531381,0.597633,0.641083,0.458824
8,0.488861,0.650888,0.643153,0.482353
9,0.426459,0.689349,0.669922,0.482353


In [12]:
#Repeat the process for each individual topic...
#For TV...

#Get dummy values for my sentiment scoring
y_TV = pd.get_dummies(df_TV['my_sentiment_for_vader']).values

In [13]:
tweet = df_TV.clean_text.values
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(tweet)
vocab_size = len(tokenizer.word_index) + 1
encoded_docs = tokenizer.texts_to_sequences(tweet)
padded_sequence = pad_sequences(encoded_docs, maxlen=200)


embedding_vector_length = 32
model = Sequential() 
model.add(Embedding(vocab_size, embedding_vector_length, input_length=200) )
model.add(SpatialDropout1D(0.25))
model.add(LSTM(50, dropout=0.5, recurrent_dropout=0.5))
model.add(Dropout(0.2))
model.add(Dense(3, activation='softmax')) 
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])  
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 200, 32)           27328     
                                                                 
 spatial_dropout1d_2 (Spatia  (None, 200, 32)          0         
 lDropout1D)                                                     
                                                                 
 lstm_2 (LSTM)               (None, 50)                16600     
                                                                 
 dropout_2 (Dropout)         (None, 50)                0         
                                                                 
 dense_2 (Dense)             (None, 3)                 153       
                                                                 
Total params: 44,081
Trainable params: 44,081
Non-trainable params: 0
__________________________________________________

In [14]:
history = model.fit(padded_sequence, y_TV, validation_split=0.2, epochs=10, verbose=1, batch_size=32)

hist = pd.DataFrame(history.history)

hist

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Unnamed: 0,loss,accuracy,val_loss,val_accuracy
0,0.691989,0.382022,0.692983,0.130435
1,0.685871,0.595506,0.694908,0.043478
2,0.679189,0.58427,0.697774,0.043478
3,0.667729,0.58427,0.703261,0.043478
4,0.651363,0.58427,0.717946,0.043478
5,0.621832,0.58427,0.799698,0.043478
6,0.567737,0.58427,1.076039,0.043478
7,0.568515,0.58427,1.081932,0.043478
8,0.586852,0.58427,0.998098,0.043478
9,0.553528,0.58427,0.939293,0.043478


In [15]:
#For politics...

#Get dummy values for my sentiment scoring
y_politics = pd.get_dummies(df_politics['my_sentiment_for_vader']).values

In [16]:
tweet = df_politics.clean_text.values
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(tweet)
vocab_size = len(tokenizer.word_index) + 1
encoded_docs = tokenizer.texts_to_sequences(tweet)
padded_sequence = pad_sequences(encoded_docs, maxlen=200)


embedding_vector_length = 32
model = Sequential() 
model.add(Embedding(vocab_size, embedding_vector_length, input_length=200) )
model.add(SpatialDropout1D(0.25))
model.add(LSTM(50, dropout=0.5, recurrent_dropout=0.5))
model.add(Dropout(0.2))
model.add(Dense(3, activation='softmax')) 
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])  
print(model.summary())

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 200, 32)           42880     
                                                                 
 spatial_dropout1d_3 (Spatia  (None, 200, 32)          0         
 lDropout1D)                                                     
                                                                 
 lstm_3 (LSTM)               (None, 50)                16600     
                                                                 
 dropout_3 (Dropout)         (None, 50)                0         
                                                                 
 dense_3 (Dense)             (None, 3)                 153       
                                                                 
Total params: 59,633
Trainable params: 59,633
Non-trainable params: 0
__________________________________________________

In [17]:
history = model.fit(padded_sequence, y_politics, validation_split=0.2, epochs=10, verbose=1, batch_size=32)

hist = pd.DataFrame(history.history)

hist

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Unnamed: 0,loss,accuracy,val_loss,val_accuracy
0,0.691181,0.427184,0.686387,0.692308
1,0.683416,0.669903,0.677789,0.730769
2,0.674489,0.640777,0.665253,0.730769
3,0.660503,0.640777,0.642736,0.730769
4,0.628989,0.640777,0.58474,0.730769
5,0.565139,0.650485,0.510037,0.730769
6,0.510698,0.640777,0.492458,0.730769
7,0.510351,0.640777,0.474105,0.730769
8,0.521056,0.640777,0.471523,0.730769
9,0.488331,0.640777,0.476831,0.730769


In [18]:
#For others...

#Get dummy values for my sentiment scoring
y_others = pd.get_dummies(df_others['my_sentiment_for_vader']).values

In [19]:
tweet = df_others.clean_text.values
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(tweet)
vocab_size = len(tokenizer.word_index) + 1
encoded_docs = tokenizer.texts_to_sequences(tweet)
padded_sequence = pad_sequences(encoded_docs, maxlen=200)


embedding_vector_length = 32
model = Sequential() 
model.add(Embedding(vocab_size, embedding_vector_length, input_length=200) )
model.add(SpatialDropout1D(0.25))
model.add(LSTM(50, dropout=0.5, recurrent_dropout=0.5))
model.add(Dropout(0.2))
model.add(Dense(3, activation='softmax')) 
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])  
print(model.summary())

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 200, 32)           45088     
                                                                 
 spatial_dropout1d_4 (Spatia  (None, 200, 32)          0         
 lDropout1D)                                                     
                                                                 
 lstm_4 (LSTM)               (None, 50)                16600     
                                                                 
 dropout_4 (Dropout)         (None, 50)                0         
                                                                 
 dense_4 (Dense)             (None, 3)                 153       
                                                                 
Total params: 61,841
Trainable params: 61,841
Non-trainable params: 0
__________________________________________________

In [20]:
history = model.fit(padded_sequence, y_others, validation_split=0.2, epochs=10, verbose=1, batch_size=32)

hist = pd.DataFrame(history.history)

hist

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Unnamed: 0,loss,accuracy,val_loss,val_accuracy
0,0.691969,0.337931,0.683885,0.540541
1,0.68458,0.372414,0.670164,0.648649
2,0.672585,0.427586,0.639668,0.702703
3,0.643423,0.406897,0.533631,0.702703
4,0.624833,0.372414,0.525643,0.702703
5,0.625288,0.413793,0.554357,0.297297
6,0.620676,0.372414,0.574987,0.297297
7,0.612644,0.462069,0.583025,0.297297
8,0.615378,0.462069,0.575662,0.297297
9,0.614933,0.37931,0.572942,0.297297
