In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk import tokenize
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.utils import shuffle
from textblob import TextBlob
import subprocess

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
!apt install translate-shell

Reading package lists... Done
Building dependency tree       
Reading state information... Done
translate-shell is already the newest version (0.9.6.6-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.


In [3]:
# Project root directory: the datasets and the saved model are all addressed relative to it.
PATH="/content/drive/MyDrive/SEM2/IR/Sentiment_Analysis"

# Reading Dataset

In [4]:
# Load ISEAR without a header row, discard column 2 and index the rows by
# column 0 (the emotion label); column 1 holds the text.
isear = (
    pd.read_csv(f"{PATH}/ISEAR.csv", header=None)
    .drop(columns=[2])
    .set_index(0)
)
isear

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
joy,On days when I feel close to my partner and ot...
fear,Every time I imagine that someone I love or I ...
anger,When I had been obviously unjustly treated and...
sadness,When I think about the short time that we live...
disgust,At a gathering I found myself involuntarily si...
...,...
shame,Two years back someone invited me to be the tu...
shame,I had taken the responsibility to do something...
fear,I was at home and I heard a loud sound of spit...
guilt,I did not do the homework that the teacher had...


In [5]:
# Number of rows per emotion label. Note the stray label 'guit' (1 row) in the output,
# presumably a typo for 'guilt' in the CSV.
np.unique(isear.index,return_counts=True)

(array(['anger', 'disgust', 'fear', 'guilt', 'guit', 'joy', 'sadness',
        'shame'], dtype=object),
 array([1079, 1066, 1076, 1049,    1, 1092, 1082, 1071]))

In [6]:
# Keep only the five emotions used in the rest of the notebook; rows come out grouped
# in the order the labels are listed here.
df=isear.loc[['anger','disgust','fear','sadness','joy']]
df

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
anger,When I had been obviously unjustly treated and...
anger,When a car is overtaking another and I am forc...
anger,When one is unjustly accused of something one ...
anger,When my partner was attacked and lost three te...
anger,Unjust accusations directed at me and my way o...
...,...
joy,When I received a letter from the university t...
joy,"I had a picnic with old classmates, we chatted..."
joy,It was the first time that I gave a birthday p...
joy,"In August,1983, the long awaited ""big envelope..."


In [7]:
# NRC emotion lexicon: tab-separated, no header row; rows with missing fields are discarded.
lexi = pd.read_csv(
    f"{PATH}/NRC_emotion_lexicon_list.txt",
    sep="\t",
    header=None,
).dropna()

In [9]:
# Keep the lexicon rows whose flag (column 2) equals 1 and give them the same layout as `df`:
# index = column 1 of the lexicon (the emotion), single column labelled 1 = column 0 (the word).
lexiFilt = (
    lexi.loc[lexi[2] == 1, [0, 1]]
    .set_index(1)
    .rename(columns={0: 1})
)

In [10]:
# Augment the ISEAR sentences with the lexicon words of the same five emotions.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; a single pd.concat is the
# supported equivalent and avoids copying `df` once per emotion.
# NOTE: this cell is not idempotent - re-running it adds the lexicon rows to `df` again.
Emotions=['anger', 'disgust', 'fear', 'joy', 'sadness']
df=pd.concat([df,lexiFilt.loc[Emotions]])

In [11]:
# Sort by emotion label so that all rows of one emotion (sentences and lexicon words) are contiguous.
df=df.sort_index()
df

Unnamed: 0,1
anger,When I had been obviously unjustly treated and...
anger,dupe
anger,duplicity
anger,duress
anger,dying
...,...
sadness,"I couldn't sleep, and began to remind of my ch..."
sadness,I felt sad when I was despised by another person.
sadness,Some colleagues decided to go out on a Saturda...
sadness,During an informal talk with a friend when we ...


# Preprocessing

In [12]:
# English stop words removed by CorFilt. 'not' is not in this list, so negations survive
# the stop-word filter (CorFilt handles them separately).
stopwords=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
# Punctuation characters dropped by CorFilt. '!' and '?' are not included: CorFilt turns
# them into marker tokens instead. CorFilt tests tokens with `in` against this string,
# which is a substring test.
punkts='''"#$%&\'()*+,-./:;<=>@[\\]^_`{|}~'''

In [13]:
def CorFilt(i):
    """Normalise one text into a space-separated string of stemmed tokens.

    Processing steps:
      1. lower-case, remove newlines, expand "n't" to " not";
      2. drop every token tagged ``RB`` (adverb) except "not";
      3. glue each "not" onto the token that follows it ("not good" -> "NOTgood")
         so the negation stays attached to the word it modifies;
      4. re-tokenise, drop stop words and punctuation, replace "!" / "?" with the
         marker tokens XXEXCLMARK / XXQUESMARK, and Porter-stem everything else.

    Parameters
    ----------
    i : str
        Raw input text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ("" if nothing survives).
    """
    ps = PorterStemmer()

    text = i.lower().replace("\n","").replace("  "," ").replace("n't"," not")

    pieces = []
    for word, tag in pos_tag(word_tokenize(text)):
        if tag == 'RB' and word != "not":
            continue
        # Only the exact token "not" is fused with its successor. The previous
        # string-level replace("not ", "NOT") also rewrote the tail of any token
        # ending in "not" (e.g. "knot tied" became "kNOTtied").
        pieces.append("NOT" if word == "not" else word + " ")
    merged = "".join(pieces).strip()

    kept = []
    for token in word_tokenize(merged):
        # `in punkts` is a substring test against the punctuation string.
        if token in punkts or token in stopwords:
            continue
        if token == "!":
            kept.append("XXEXCLMARK")
        elif token == "?":
            kept.append("XXQUESMARK")
        elif token != "'s" and token != "``":
            kept.append(ps.stem(token))
    return " ".join(kept)

In [14]:
def TFIDF(df):
    """Fit a TF-IDF model on the texts in column 1 of `df`.

    Every text is first cleaned with CorFilt.

    Returns
    -------
    (X, vectorizer)
        X is the dense TF-IDF matrix with one row per row of `df`;
        vectorizer is the fitted TfidfVectorizer.
    """
    cleaned = [CorFilt(text) for text in df[1]]
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit(cleaned).transform(cleaned).toarray()
    return X, vectorizer

In [15]:
def EmoVekt(df):
    """Build one mean TF-IDF vector (centroid) per emotion label.

    Parameters
    ----------
    df : pandas.DataFrame
        Index holds the emotion label of each row, column 1 the text.

    Returns
    -------
    (EmoVec, vectorizer)
        EmoVec has one row per distinct label, in the sorted order returned by
        np.unique(df.index), and one column per TF-IDF feature.
        vectorizer is the fitted TfidfVectorizer from TFIDF.
    """
    X, vectorizer = TFIDF(df)
    labels = np.asarray(df.index)
    emotions = np.unique(labels)

    # Rows are selected with a boolean mask per label, so the result does not depend
    # on `df` being sorted, on the number of labels being exactly five, or on every
    # label having more than one row.
    EmoVec = np.zeros((len(emotions), X.shape[-1]))
    for row, emotion in enumerate(emotions):
        EmoVec[row] = X[labels == emotion].mean(axis=0)
    return EmoVec, vectorizer

In [16]:
# Fit the TF-IDF vocabulary on the combined ISEAR + lexicon frame and build the per-emotion mean vectors.
EmoVec,vectorizer=EmoVekt(df)

In [17]:
def EmowavE(sent,vectorizer=vectorizer,EmoVec=EmoVec,trans=True):
    """Classify `sent` by cosine similarity to the per-emotion TF-IDF vectors.

    Parameters
    ----------
    sent : str
        Input sentence.
    vectorizer : fitted TfidfVectorizer
        Defaults to the module-level `vectorizer` as it was when this cell ran.
    EmoVec : array, one row per emotion in the order anger, disgust, fear, joy, sadness
        Defaults to the module-level `EmoVec` as it was when this cell ran.
    trans : bool
        If True, detect the language and translate non-English input to English
        with the `trans` command-line tool before classifying.

    Returns
    -------
    str
        Name of the emotion whose vector is most similar to the sentence.
    """
    transDict={'gu':'Gujarati',
               'hi':'Hindi'}
    # Translate from any language to English
    if trans:
        # NOTE(review): TextBlob.detect_language relies on an online service and is
        # deprecated in newer TextBlob releases - verify it is available. It is called
        # once and the result reused (previously it was queried up to three times).
        lang = TextBlob(sent).detect_language()
        if lang!='en':
            if lang in transDict:
                print(f"\nInput text was in {transDict[lang]}")
            else:
                print("\nInput text was not in English")
            print("\nTranslating...")
            output=subprocess.check_output(['trans','-b',sent])
            sent=output.decode('utf-8').strip()
            print(f"\nTranslation in English: {sent}")

    EmoBuff=vectorizer.transform([CorFilt(sent)])
    EmoDict={0:'anger',
             1:'disgust',
             2:'fear',
             3:'joy',
             4:'sadness'}
    # One call compares the sentence with every emotion vector; row 0 holds the similarities.
    similarities=cosine_similarity(EmoBuff,EmoVec)[0]
    return EmoDict[int(np.argmax(similarities))]

# Reading Test Data

In [19]:
# Parse the SemEval trial headlines; iterating over `root` yields the headline
# elements (an `id` attribute plus the headline text).
import xml.etree.ElementTree as ET
tree=ET.parse(f"{PATH}/semeval/semeval.trial/affectivetext_trial.xml")
root = tree.getroot()

In [20]:
# Print every headline with its id. After the loop `child` still refers to the last element.
for child in root:
    print(child.attrib['id'],child.text)

1 Mortar assault leaves at least 18 dead
2 Goal delight for Sheva
3 Nigeria hostage feared dead is freed
4 Bombers kill shoppers
5 Vegetables, not fruit, slow brain decline
6 PM: Havana deal a good experiment
7 Kate is marrying Doherty
8 NASA revisiting life on Mars question
9 Happy birthday, iPod
10 Alonso would be happy to retire with three titles
11 Madonna's New Tot 'Happy at Home' in London
12 Nicole Kidman asks dad to help stop husband's drinking
13 United Finds Good Connection in Win
14 'Runway': Making Good Without Making Nice
15 We were 'arrogant and stupid' over Iraq, says US diplomat
16 Bad reasons to be good
17 Madonna's new baby's daddy didn't realize adoption was `for good'
18 Women in their 60s 'are perfectly good mothers'
19 We're a pretty kind 'bully'
20 Moderate drinking reduces men's heart attack risk
21 Tom Cruise and Katie Holmes set wedding date
22 Kidnapped AP photographer freed in Gaza
23 Bush Insists Troops Stay in Iraq, Predicts Midterm Victory
24 Hurricane Pa

In [21]:
# Id of the last headline in the file, read directly from the tree rather than from a leftover loop variable.
root[-1].attrib['id']

'253'

In [22]:
# Gold emotion scores: space-separated, no header row. Column 0 becomes the index
# and column 6 is dropped, leaving score columns 1-5.
semeval = (
    pd.read_csv(
        f"{PATH}/semeval/semeval.trial/affectivetext_trial.emotions.gold",
        sep=" ",
        header=None,
    )
    .set_index(0)
    .drop(columns=6)
)
semeval

Unnamed: 0_level_0,1,2,3,4,5
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,22,2,60,0,64
2,0,0,0,93,0
3,18,0,52,66,20
4,66,39,94,0,86
5,0,0,25,26,2
...,...,...,...,...,...
249,33,33,42,0,44
250,0,0,0,89,0
251,0,0,0,48,0
252,33,43,61,0,33


In [23]:
# Position of a score column -> emotion name (assumed to follow the column order of `semeval`).
EmoDict=dict(enumerate(['anger','disgust','fear','joy','sadness']))
# Gold label of each headline = the emotion with the highest score
# (np.argmax returns the first maximum on ties).
semList=[EmoDict[np.argmax(scores)] for scores in semeval.to_numpy()]

In [24]:
# Headline texts in document order.
semData=[node.text for node in root]

In [25]:
# Assemble the test frame: column 0 = gold emotion, column 1 = headline text.
# NOTE(review): labels (gold-file order) and headlines (XML order) are paired purely by
# position, not by headline id - confirm both files list the same headlines in the same order.
semdf=pd.DataFrame(semList)
semdf[1]=semData

# Add column 1 of the valence gold file as column 2 (matched on the default integer index).
semval=pd.read_csv(f"{PATH}/semeval/semeval.trial/affectivetext_trial.valence.gold",delimiter=" ",header=None)
semdf[2]=semval[1]
# Index by emotion label and group rows of the same emotion together.
semdf=semdf.set_index(0)
semdf=semdf.sort_index()

# Training LSTM

- Creating input matrix

In [40]:
# Vectorise every training text with the already-fitted TF-IDF model, then insert a
# length-1 middle axis so each document becomes a one-step sequence for the LSTM.
Corpus=[CorFilt(text) for text in df[1]]
tfidf=vectorizer.transform(Corpus)
X=tfidf.toarray().reshape(tfidf.shape[0],1,tfidf.shape[-1])

In [41]:
# (documents, 1, TF-IDF features)
X.shape

(11056, 1, 7309)

- Creating output matrix

In [42]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the emotion labels held in the index of `df` (one column per distinct label).
label_col=np.array(df.index).reshape(-1,1)
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`.
    enc=OneHotEncoder(sparse_output=False)
except TypeError:
    # Older releases only accept `sparse`.
    enc=OneHotEncoder(sparse=False)
enc.fit(label_col)
y=enc.transform(label_col)

In [43]:
# (documents, emotion classes)
y.shape

(11056, 5)

In [44]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization#, CuDNNLSTM

# Two stacked LSTM layers followed by a small dense head; each block is followed by
# 20% dropout. Every sample is a sequence of length 1 (see the reshape that builds X).
model = Sequential()

model.add(LSTM(128, input_shape=(X.shape[1:]),return_sequences=True))
model.add(Dropout(0.2))

# Only the first layer needs `input_shape`; later layers take their input shape
# from the layer before them.
model.add(LSTM(64))
model.add(Dropout(0.2))

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

# One softmax output per emotion class.
model.add(Dense(y.shape[1], activation='softmax'))

# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
opt = tf.keras.optimizers.Adam(learning_rate=0.001)

# Compile model
model.compile(
    loss='categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy']
)

# Train on the full training matrix (no validation split is held out).
model.fit(X,
          y,
          epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7fb7137fab50>

In [50]:
# Persist the trained model under PATH/models/.
model.save(PATH+"/models/")



INFO:tensorflow:Assets written to: /content/drive/MyDrive/SEM2/IR/Sentiment_Analysis/models/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/SEM2/IR/Sentiment_Analysis/models/assets


In [51]:
# Layer-by-layer overview (output shapes and parameter counts) of the network.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 1, 128)            3808256   
_________________________________________________________________
dropout (Dropout)            (None, 1, 128)            0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 5)                

# Testing

In [33]:
# Test set: index = gold emotion, column 1 = headline text, column 2 = valence score.
semdf

Unnamed: 0_level_0,1,2
0,Unnamed: 1_level_1,Unnamed: 2_level_1
anger,"Sony Hates Europeans, Will Prevent The Importi...",-19
anger,Budapest calm after night of violent protests,-12
anger,Genghis Khan Beer? Mongolia Grimaces,0
anger,Managua Journal: Hold the Mojito and Margarita...,0
anger,Lebo: A life lived on the edge,0
...,...,...
sadness,Marine killed in fighting west of Baghdad,-58
sadness,"New Indonesia Calamity, a Mud Bath, Is Man-Made",-56
sadness,Gunman 'fine' before shooting,-26
sadness,Deaths linked to flu vaccine,-52


In [45]:
def EmopreD(sent,model=model,vectorizer=vectorizer):
    """Predict the emotion of `sent` with the trained network.

    The sentence is cleaned with CorFilt, turned into a TF-IDF row by `vectorizer`
    and passed to `model` as a batch of one single-step sequence. The name of the
    class with the highest predicted score is returned.

    `model` and `vectorizer` default to the module-level objects as they were when
    this cell ran.
    """
    features = vectorizer.transform([CorFilt(sent)]).toarray()
    # Shape the row as (batch=1, steps=1, features).
    batch = features.reshape(1, 1, features.shape[1])
    scores = model.predict(batch)
    names = ('anger', 'disgust', 'fear', 'joy', 'sadness')
    return names[np.argmax(scores)]

In [46]:
# Sanity check on the first headline. `.iloc[0]` selects by position explicitly: the index
# of `semdf` holds emotion labels, so `semdf[1][0]` only worked through pandas' deprecated
# integer-position fallback.
buff=vectorizer.transform([CorFilt(semdf[1].iloc[0])]).toarray()
EmoDict[np.argmax(model.predict(buff.reshape(1,1,buff.shape[1])))]

'fear'

In [47]:
def Tester(df,model=model):
    """Print a classification report for `model` on a labelled frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Index holds the true emotion label of each row, column 1 the text.
    model : trained network passed on to EmopreD
        Defaults to the module-level `model` as it was when this cell ran.

    Returns
    -------
    None
        The report is printed.
    """
    # Iterate over the texts directly (no integer lookup on a label index) and pass
    # `model` through; previously the argument was accepted but never used.
    y_pred=[EmopreD(text,model=model) for text in df[1]]
    print(classification_report(df.index,y_pred))

In [48]:
# Evaluate the trained network on the SemEval trial headlines.
Tester(semdf)

              precision    recall  f1-score   support

       anger       0.24      0.30      0.27        37
     disgust       0.05      0.06      0.05        16
        fear       0.31      0.53      0.39        38
         joy       0.68      0.47      0.56        95
     sadness       0.40      0.33      0.36        64

    accuracy                           0.39       250
   macro avg       0.34      0.34      0.33       250
weighted avg       0.45      0.39      0.41       250



- The accuracy obtained here (0.39) is higher than what the literature reports for the VSM model on this task.

- Testing with a complex statement

In [55]:
# Mixed example: positive words ("good", "perfectly") next to "dead body".
EmopreD("a good perfectly parceled dead body")

'anger'