#### Importing the Libraries

In [None]:
from math import sqrt
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import plotly.express as px
import plotly.graph_objects as go
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
df = pd.read_csv('IS733_dataset_emo_label_en.csv')

### LDA Method:

We have deleted stopwords before, now we are going to do tokenizing for LDA on df:

In [None]:
import spacy
spacy.load("en_core_web_sm")
from spacy.lang.en import English
parser = English()
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

We use NLTK’s Wordnet to find the meanings of words, synonyms, antonyms, and more. In addition, we use WordNetLemmatizer to get the root word.

In [None]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Open up our data, read line by line, for each line, prepare text for LDA, then add to a list.

Now we can see how our text data are converted:

In [None]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [None]:
df_lda2 = df["Text"]

In [None]:
df_lda2.to_csv(r'IS733_dataset_LDA_input.csv')


In [None]:
import random
text_data = []
with open('IS733_dataset_LDA_input.csv') as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        text_data.append(tokens)
print(text_data)

[[], [], [], ['window'], ['fill', 'form'], ['last-', 'first', 'jones'], [], ['jones'], ['strange', 'figure', 'fill', 'wrong', 'section'], [], ["what's", 'apply'], ['apply', 'another'], ['another'], ['suppose', 'three', 'form', 'story', 'suppose', 'three', 'form', 'really', 'another'], [], ['right'], ['second'], ['really', 'simple', 'problem', 'maybe', 'should--'], [], ['waiting', 'another'], ['passport'], ['passport'], ['birth', 'certificate'], ['though', 'somewhere'], ['expect'], ['probably', 'another', 'birth', 'certificate'], ['probably'], ['helpful'], [], [], [], [], ['understand', 'driver', 'license', 'california'], ['breathing'], ['california'], [], ['form', 'already'], ['driver', 'license'], ['driver', 'license'], ['suppose', 'california'], ['fill', 'questions--'], ['raise', 'voice'], ['already', 'answer', 'question'], ['child'], [], ['working', 'live', 'california'], ['make', 'acting', 'child'], ['california', 'whole--'], ['maybe', 'never', 'd.m.v.', 'maybe', 'number', 'differe

In [None]:
text_data

[[],
 [],
 [],
 ['window'],
 ['fill', 'form'],
 ['last-', 'first', 'jones'],
 [],
 ['jones'],
 ['strange', 'figure', 'fill', 'wrong', 'section'],
 [],
 ["what's", 'apply'],
 ['apply', 'another'],
 ['another'],
 ['suppose',
  'three',
  'form',
  'story',
  'suppose',
  'three',
  'form',
  'really',
  'another'],
 [],
 ['right'],
 ['second'],
 ['really', 'simple', 'problem', 'maybe', 'should--'],
 [],
 ['waiting', 'another'],
 ['passport'],
 ['passport'],
 ['birth', 'certificate'],
 ['though', 'somewhere'],
 ['expect'],
 ['probably', 'another', 'birth', 'certificate'],
 ['probably'],
 ['helpful'],
 [],
 [],
 [],
 [],
 ['understand', 'driver', 'license', 'california'],
 ['breathing'],
 ['california'],
 [],
 ['form', 'already'],
 ['driver', 'license'],
 ['driver', 'license'],
 ['suppose', 'california'],
 ['fill', 'questions--'],
 ['raise', 'voice'],
 ['already', 'answer', 'question'],
 ['child'],
 [],
 ['working', 'live', 'california'],
 ['make', 'acting', 'child'],
 ['california', 'whol

In [None]:
df_lda2

0       Next.\n Yes, me. Okay, okay here we go.\n My ...
1       Hey, Joy come in here.\n What's happening?\n ...
2       The craziest thing just happened to me.\n Wha...
3       Check this out.  You know how I've told you I...
4       You seem kind of down.\n I'm just mad.\n How ...
                             ...                        
139     What is it?\n Sweetheart, I've got to tell yo...
140     Ryan, what's wrong?\n I just got some bad new...
141     Guess what?\n What?\n I got it.  I got accept...
142     Hi. Um-  My luggage didn't come out of the co...
143     Uh, God.  I don't know what to do anymore.  L...
Name: Text, Length: 144, dtype: object

First, we are creating a dictionary from the data, then convert to bag-of-words corpus and save the dictionary and corpus for future use.

In [None]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [None]:
import gensim
NUM_TOPICS = 10
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.404*"sorry" + 0.026*"start" + 0.019*"excite" + 0.016*"child" + 0.015*"talking"')
(1, '0.056*"understand" + 0.043*"point" + 0.032*"great" + 0.032*"dollar" + 0.027*"try"')
(2, '0.068*"thing" + 0.065*"something" + 0.055*"would" + 0.050*"little" + 0.050*"never"')
(3, '0.121*"really" + 0.043*"annie" + 0.027*"stuff" + 0.019*"exactly" + 0.019*"would"')
(4, '0.147*"think" + 0.055*"anything" + 0.038*"still" + 0.030*"course" + 0.026*"would"')
(5, '0.051*"people" + 0.046*"suppose" + 0.041*"always" + 0.035*"happen" + 0.027*"flashlight"')
(6, '0.039*"years" + 0.038*"business" + 0.033*"happy" + 0.032*"three" + 0.025*"want"')
(7, '0.063*"could" + 0.057*"laughter" + 0.035*"remember" + 0.034*"believe" + 0.029*"things"')
(8, '0.182*"right" + 0.053*"maybe" + 0.023*"guess" + 0.022*"getting" + 0.018*"somebody"')
(9, '0.215*"going" + 0.026*"nothing" + 0.025*"thank" + 0.023*"happen" + 0.021*"alright"')


In [None]:
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

### trying for doc-topic distribution using Gensim method

In [None]:
df_lda2

0       Next.\n Yes, me. Okay, okay here we go.\n My ...
1       Hey, Joy come in here.\n What's happening?\n ...
2       The craziest thing just happened to me.\n Wha...
3       Check this out.  You know how I've told you I...
4       You seem kind of down.\n I'm just mad.\n How ...
                             ...                        
139     What is it?\n Sweetheart, I've got to tell yo...
140     Ryan, what's wrong?\n I just got some bad new...
141     Guess what?\n What?\n I got it.  I got accept...
142     Hi. Um-  My luggage didn't come out of the co...
143     Uh, God.  I don't know what to do anymore.  L...
Name: Text, Length: 144, dtype: object

In [None]:
topic = []
for k in range(0,144):
    max_p = 0
    max_clsuter = 0
    list_temp = lda[corpus[k]]
    for i in range(0, len(lda[corpus[k]])):
                if list_temp[i][1] > max_p:
                    max_p = list_temp[i][1]
                    max_clsuter = list_temp[i][0]
    topic.append(max_clsuter)
print(len(topic))


144


In [None]:
df_lda2['topic'] = topic
df_lda2



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



0         Next.\n Yes, me. Okay, okay here we go.\n My ...
1         Hey, Joy come in here.\n What's happening?\n ...
2         The craziest thing just happened to me.\n Wha...
3         Check this out.  You know how I've told you I...
4         You seem kind of down.\n I'm just mad.\n How ...
                               ...                        
140       Ryan, what's wrong?\n I just got some bad new...
141       Guess what?\n What?\n I got it.  I got accept...
142       Hi. Um-  My luggage didn't come out of the co...
143       Uh, God.  I don't know what to do anymore.  L...
topic    [0, 0, 0, 3, 0, 9, 0, 5, 5, 0, 9, 9, 8, 0, 0, ...
Name: Text, Length: 145, dtype: object

In [None]:
len(lda[corpus[0]])

10

In [None]:
lda[corpus[2]]

[(0, 0.1),
 (1, 0.1),
 (2, 0.1),
 (3, 0.1),
 (4, 0.1),
 (5, 0.1),
 (6, 0.1),
 (7, 0.1),
 (8, 0.1),
 (9, 0.1)]

In [None]:
list1 = lda[corpus[0]]
type(lda[corpus[0]])

list

In [None]:
list1[1][1]

0.1

In [None]:
lda[corpus[5]]

[(0, 0.025000002),
 (1, 0.025000002),
 (2, 0.025000002),
 (3, 0.025000002),
 (4, 0.025000002),
 (5, 0.025009241),
 (6, 0.2750407),
 (7, 0.025005545),
 (8, 0.025000002),
 (9, 0.5249445)]

In [None]:
df_lda2.to_csv(r'IS733_dataset_LDA.csv')

In [None]:
df_lda2 = pd.read_csv("IS733_dataset_LDA.csv")

In [None]:
lda_feature=[]
for i in range(0,144):
  lda_feature.append(lda[corpus[i]])
lda_feature

[[(0, 0.1),
  (1, 0.1),
  (2, 0.1),
  (3, 0.1),
  (4, 0.1),
  (5, 0.1),
  (6, 0.1),
  (7, 0.1),
  (8, 0.1),
  (9, 0.1)],
 [(0, 0.1),
  (1, 0.1),
  (2, 0.1),
  (3, 0.1),
  (4, 0.1),
  (5, 0.1),
  (6, 0.1),
  (7, 0.1),
  (8, 0.1),
  (9, 0.1)],
 [(0, 0.1),
  (1, 0.1),
  (2, 0.1),
  (3, 0.1),
  (4, 0.1),
  (5, 0.1),
  (6, 0.1),
  (7, 0.1),
  (8, 0.1),
  (9, 0.1)],
 [(0, 0.05),
  (1, 0.05),
  (2, 0.05),
  (3, 0.55),
  (4, 0.05),
  (5, 0.05),
  (6, 0.05),
  (7, 0.05),
  (8, 0.05),
  (9, 0.05)],
 [(0, 0.36666664),
  (1, 0.03333333),
  (2, 0.03333333),
  (3, 0.03333333),
  (4, 0.03333333),
  (5, 0.36666667),
  (6, 0.03333333),
  (7, 0.03333333),
  (8, 0.03333333),
  (9, 0.03333333)],
 [(0, 0.025000002),
  (1, 0.025000002),
  (2, 0.025000002),
  (3, 0.025000002),
  (4, 0.025000002),
  (5, 0.025006324),
  (6, 0.27504057),
  (7, 0.025005544),
  (8, 0.025000002),
  (9, 0.5249475)],
 [(0, 0.1),
  (1, 0.1),
  (2, 0.1),
  (3, 0.1),
  (4, 0.1),
  (5, 0.1),
  (6, 0.1),
  (7, 0.1),
  (8, 0.1),
  (9, 0.1

In [None]:
lda_feature = pd.read_excel('LDAFeatures.xlsx', sheet_name='final dataset')

print(lda_feature)

     Unnamed: 0         0         1  ...         7         8         9
0             0  0.100000  0.100000  ...  0.100000  0.100000  0.100000
1             1  0.100000  0.100000  ...  0.100000  0.100000  0.100000
2             2  0.050001  0.050001  ...  0.050001  0.050001  0.050001
3             3  0.033334  0.699991  ...  0.033334  0.033334  0.033336
4             4  0.774739  0.025029  ...  0.025029  0.025029  0.025029
..          ...       ...       ...  ...       ...       ...       ...
138         138  0.025005  0.025001  ...  0.025001  0.025000  0.025004
139         139  0.033395  0.367398  ...  0.033395  0.033395  0.033395
140         140  0.280397  0.014292  ...  0.014292  0.172274  0.014292
141         141  0.100000  0.100000  ...  0.100000  0.100000  0.100000
142         142  0.033336  0.033336  ...  0.033336  0.033336  0.366688

[143 rows x 11 columns]


In [None]:
df = df.drop([0], axis=0)
df
df = df.drop([0], axis=1)
df

In [None]:
X = lda_feature
y = df["Label"]

In [None]:
X = X.iloc[:, 1:10]
X

Unnamed: 0,1,2,3,4,5,6,7,8,9
0,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000
1,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000
2,0.050001,0.050001,0.549991,0.050001,0.050001,0.050001,0.050001,0.050001,0.050001
3,0.699991,0.033334,0.033334,0.033334,0.033334,0.033334,0.033334,0.033334,0.033336
4,0.025029,0.025029,0.025029,0.025033,0.025029,0.025029,0.025029,0.025029,0.025029
...,...,...,...,...,...,...,...,...,...
138,0.025001,0.524990,0.025000,0.274998,0.025000,0.025000,0.025001,0.025000,0.025004
139,0.367398,0.033395,0.365444,0.033395,0.033395,0.033395,0.033395,0.033395,0.033395
140,0.014292,0.161455,0.300114,0.014293,0.014298,0.014292,0.014292,0.172274,0.014292
141,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000


In [None]:


import pandas as pd
import pandas_profiling
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve, roc_auc_score
import sklearn.metrics as metrics
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import RandomForestClassifier

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=1)

In [None]:
X_train

Unnamed: 0,1,2,3,4,5,6,7,8,9
28,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000
101,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000
140,0.014292,0.161455,0.300114,0.014293,0.014298,0.014292,0.014292,0.172274,0.014292
124,0.033334,0.033334,0.033334,0.699997,0.033334,0.033334,0.033334,0.033334,0.033334
68,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000
...,...,...,...,...,...,...,...,...,...
111,0.025002,0.025002,0.275018,0.025002,0.274991,0.025002,0.274969,0.025010,0.025002
110,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000
108,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000,0.549999
129,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000,0.549999


In [None]:
# function to print out classification model report
def classification_report(model_name, test, pred):
    from sklearn.metrics import precision_score, recall_score
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import f1_score
    
    print(model_name, ":\n")
    print("Accuracy Score: ", '{:,.3f}'.format(float(accuracy_score(test, pred)) * 100), "%")
    print("     Precision: ", '{:,.3f}'.format(float(precision_score(test, pred, average='macro')) * 100), "%")
    print("        Recall: ", '{:,.3f}'.format(float(recall_score(test, pred, average='macro')) * 100), "%")
    print("      F1 score: ", '{:,.3f}'.format(float(f1_score(test, pred, average='macro')) * 100), "%")

In [None]:
rf = RandomForestClassifier(max_depth=1, random_state=0)
rf = rf.fit(X_train, y_train)
y_pred_train = rf.predict(X_train)
y_pred_val = rf.predict(X_val)
classification_report("Random forest Report (Training Set)", y_train, y_pred_train)
classification_report("Random forest Report (Validation Set)", y_val, y_pred_val)

Random forest Report (Training Set) :

Accuracy Score:  73.529 %
     Precision:  36.765 %
        Recall:  50.000 %
      F1 score:  42.373 %
Random forest Report (Validation Set) :

Accuracy Score:  58.333 %
     Precision:  29.167 %
        Recall:  50.000 %
      F1 score:  36.842 %



Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [None]:
rf = RandomForestClassifier(max_depth=2, random_state=0)
rf = rf.fit(X_train, y_train)
y_pred_train = rf.predict(X_train)
y_pred_val = rf.predict(X_val)
classification_report("Random forest Report (Training Set)", y_train, y_pred_train)
classification_report("Random forest Report (Validation Set)", y_val, y_pred_val)

Random forest Report (Training Set) :

Accuracy Score:  78.431 %
     Precision:  88.660 %
        Recall:  59.259 %
      F1 score:  59.230 %
Random forest Report (Validation Set) :

Accuracy Score:  58.333 %
     Precision:  29.167 %
        Recall:  50.000 %
      F1 score:  36.842 %



Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



MAX_depth =1 is chosen

In [None]:
rf = RandomForestClassifier(max_depth=1, random_state=0)
rf = rf.fit(X_test, y_test)
y_pred_test = rf.predict(X_test)
classification_report("Random forest Report (test Set)", y_test, y_pred_test)


Random forest Report (test Set) :

Accuracy Score:  72.414 %
     Precision:  36.207 %
        Recall:  50.000 %
      F1 score:  42.000 %



Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



### Visualizations

In [None]:
pip install pyLDAvis

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models
lda_display = pyLDAvis.gensim_models.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

The code above from : 
https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21

The code above did not work! So I am trying another sample codes for LDA
from
https://www.kaggle.com/yashvi/topic-modelling-using-gensim-lda/notebook

### Evaluation for our LDA method:

Though arbitrary, after running LDA, the data is now 'labeled'. We use the topics as labels. This means that we now use supervised learning to see how well the our topic modeling generalizes. This is just one way to evaluate the topic modeling. If the model was able to find a meaningful split in the data, it should be possible to train a classifier to predict which topic a given instance should belong to. 

In [None]:
# function to print out classification model report
def classification_report(model_name, test, pred):
    from sklearn.metrics import precision_score, recall_score
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import f1_score
    
    print(model_name, ":\n")
    print("Accuracy Score: ", '{:,.3f}'.format(float(accuracy_score(test, pred)) * 100), "%")
    print("     Precision: ", '{:,.3f}'.format(float(precision_score(test, pred, average='macro')) * 100), "%")
    print("        Recall: ", '{:,.3f}'.format(float(recall_score(test, pred, average='macro')) * 100), "%")
    print("      F1 score: ", '{:,.3f}'.format(float(f1_score(test, pred, average='macro')) * 100), "%")

In [None]:
df_lda.iloc[:,15]

In [None]:
y_pred = df_lda.iloc[:,15]

In [None]:
from sklearn.model_selection import train_test_split

# test set size of 20% of the data and the random seed 42 <3
X_train, X_test, y_train, y_test = train_test_split(X_reduced ,y_pred, test_size=0.2, random_state=42)

print("X_train size:", len(X_train))
print("X_test size:", len(X_test), "\n")

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import SGDClassifier

# SGD instance
sgd_clf = SGDClassifier(max_iter=10000, tol=1e-3, random_state=42, n_jobs=4)
# train SGD
sgd_clf.fit(X_train, y_train)

# cross validation predictions
sgd_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=3, n_jobs=4)

# print out the classification report
classification_report("Stochastic Gradient Descent Report (Training Set)", y_train, sgd_pred)

**Emotions**