In [246]:
#importing the packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import tensorflow as tf
import re
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [247]:
# Reading the data from the train.csv file

train_data = pd.read_csv("train.csv", index_col=None, header=0, engine='python' )

In [248]:
# displaying the data using head() function

train_data.head()

Unnamed: 0,Tweet,Target,Stance,Opinion Towards,Sentiment
0,"@tedcruz And, #HandOverTheServer she wiped cle...",Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
1,Hillary is our best choice if we truly want to...,Hillary Clinton,FAVOR,1. The tweet explicitly expresses opinion abo...,pos
2,@TheView I think our country is ready for a fe...,Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
3,I just gave an unhealthy amount of my hard-ear...,Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
4,@PortiaABoulger Thank you for adding me to you...,Hillary Clinton,NONE,3. The tweet is not explicitly expressing opi...,pos


In [249]:
# Checking the description , datatypes and null values of the dataset

train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2914 entries, 0 to 2913
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Tweet            2914 non-null   object
 1   Target           2914 non-null   object
 2   Stance           2914 non-null   object
 3   Opinion Towards  2914 non-null   object
 4   Sentiment        2914 non-null   object
dtypes: object(5)
memory usage: 114.0+ KB


In [250]:
# Checking all the nan values

train_data.isna().sum()

Tweet              0
Target             0
Stance             0
Opinion Towards    0
Sentiment          0
dtype: int64

In [251]:
## Combining the two columns of "Tweet" and "Target"

train_data["Tweet"] = train_data["Tweet"] + " " + train_data["Target"]

In [252]:
## displaying the 1st row after concatenation

train_data['Tweet'][0]

'@tedcruz And, #HandOverTheServer she wiped clean + 30k deleted emails, explains dereliction of duty/lies re #Benghazi,etc #tcot Hillary Clinton'

In [253]:
train_data['Stance'].value_counts()

AGAINST    1395
NONE        766
FAVOR       753
Name: Stance, dtype: int64

In [254]:
train_data["Stance"] = train_data["Stance"].astype('category')

In [255]:
train_data["Stance"] = train_data["Stance"].cat.codes

In [256]:
train_data['Stance']

0       0
1       1
2       0
3       0
4       2
       ..
2909    0
2910    0
2911    0
2912    0
2913    0
Name: Stance, Length: 2914, dtype: int8

In [257]:
type(train_data['Stance'])

pandas.core.series.Series

In [258]:
train_data['Stance'].value_counts()

0    1395
2     766
1     753
Name: Stance, dtype: int64

In [259]:
# Dividing the dataset into train and test set

X = train_data.drop(['Stance','Opinion Towards', 'Sentiment'], axis = 1)

In [260]:
y = train_data['Stance']

In [261]:
X.shape

(2914, 2)

In [262]:
y.shape

(2914,)

In [263]:
### Defining the vocabulary size

voc_size = 10000  ## Also have to Check with size 5000 

In [264]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [265]:
## Preprocessing the Data

## Porter Stemmer is used for stemming (e.g if there are two words "final", "finally" then after stemming it becomes "fina")

## We also have to try by using "Lamentation"

from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
preprocessed_words = []

for i in range(0,len(X)):
    clean = re.sub('[^a-zA-Z]', ' ', X['Tweet'][i])
    clean = clean.lower()
    clean = clean.split()
    
    clean = [ps.stem(word) for word in clean if not word in stopwords.words('english')]  
    clean = ' '.join(clean)
    preprocessed_words.append(clean)

In [266]:
preprocessed_words[1]

'hillari best choic truli want continu progress nation ohio hillari clinton'

In [267]:
## One Hot representation 

onehot_repr = [one_hot(word,voc_size) for word in preprocessed_words]

## Displaying the one-hot representated values

# onehot_repr

In [268]:
## Embedding Representation

sentence_length = 30
embedded_doc = pad_sequences(onehot_repr, padding='pre', maxlen=sentence_length)

In [269]:
print(embedded_doc[1])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0 5814 1373 6890  685 2395  356 4827 1458 6267
 5814 8397]


In [270]:
len(embedded_doc)

2914

In [279]:
embedding_vector_features = 40
model = Sequential()
model.add(Embedding(voc_size, embedding_vector_features, input_length=sentence_length))
model.add(LSTM(100))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential_21"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_20 (Embedding)     (None, 30, 40)            400000    
_________________________________________________________________
lstm_20 (LSTM)               (None, 100)               56400     
_________________________________________________________________
dense_20 (Dense)             (None, 3)                 303       
Total params: 456,703
Trainable params: 456,703
Non-trainable params: 0
_________________________________________________________________
None


In [280]:
# from tensorflow.keras.layers import Dense, Embedding, GRU, LSTM, Bidirectional
# from tensorflow.keras.models import Sequential
# from keras.utils import to_categorical

# y_binary = to_categorical(y)

# embedding_dim = 100

# embedding_layer = Embedding(voc_size,
#                             embedding_dim,
#                             input_length=sentence_length,
#                             trainable=True)

# model_glove = Sequential()
# model_glove.add(embedding_layer)
# model_glove.add(LSTM(units=32,  dropout=0.2, recurrent_dropout=0.25))
# model_glove.add(Dense(7, activation='softmax'))

# model_glove.compile(loss='binary_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])

# print(model_glove.summary())

In [281]:

from keras.utils import to_categorical
y_binary = to_categorical(y)


import numpy as np
X_final = np.array(embedded_doc)
y_final = np.array(y_binary)




In [282]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

In [283]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)

Train on 1952 samples, validate on 962 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f64dfee5208>

In [239]:
from tensorflow.keras.layers import Dropout
## Creating model
embedding_vector_features=40
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sentence_length))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [240]:
y_pred=model.predict_classes(X_test)

In [241]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[349, 102,   0],
       [214,  54,   0],
       [196,  47,   0]])

In [183]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.34407484407484407

In [38]:
train_data.dtypes

Tweet              object
Target             object
Stance             object
Opinion Towards    object
Sentiment          object
dtype: object

In [40]:
train_data.head()

Unnamed: 0,Tweet,Target,Stance,Opinion Towards,Sentiment
0,tedcruz and handovertheserver she wiped clean ...,Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
1,hillary is our best choice if we truly want to...,Hillary Clinton,FAVOR,1. The tweet explicitly expresses opinion abo...,pos
2,theview i think our country is ready for a fem...,Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
3,i just gave an unhealthy amount of my hardearn...,Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
4,portiaaboulger thank you for adding me to your...,Hillary Clinton,NONE,3. The tweet is not explicitly expressing opi...,pos
