# Mood Words Dataset

In [19]:
#import libraries and modules
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import LabelEncoder
import nltk
import sys

In [20]:
#read the mood-words CSV, cast every cell to str, then normalise:
#column names are lowercased with spaces replaced by underscores, and every value is lowercased
#NOTE(review): depending on the pandas version, missing cells may become the text 'nan' after astype(str) - confirm the CSV has none
moodWords = (
    pd.read_csv('moodWords.csv')
    .astype(str)
    .rename(columns=lambda name: name.lower().replace(' ', '_'))
    .map(lambda value: value.lower())
)
#moodWords #prints dataset of mood words

In [21]:
X = moodWords['sentence'] #model input: the 'sentence' column
#X

In [22]:
#model target: the 'label' column, integer-encoded by a LabelEncoder
#the fitted encoder is kept so class ids can be mapped back to label names
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(moodWords['label'])

# Preprocess Text

In [23]:
#import libraries and modules
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

In [24]:
#English stop words from NLTK, held in a set for fast membership checks
stop_words = set(stopwords.words('english'))

#text clean-up function
def preprocess(text):
    """Lowercase the text, delete punctuation, tokenize, and drop stop words.

    Punctuation characters (string.punctuation) are removed before
    tokenizing, so a contraction such as "don't" becomes "dont".

    Returns the remaining tokens joined by single spaces.
    """
    stripped = text.lower().translate(str.maketrans('', '', string.punctuation))
    kept = [token for token in word_tokenize(stripped) if token not in stop_words]
    return ' '.join(kept)

In [25]:
#run the clean-up function over every sentence
X = X.apply(func=preprocess)
#X

# Word2Vec Model

In [26]:
from gensim.models import Word2Vec

In [27]:
#one whitespace-split token list per preprocessed sentence
sentences = list(map(str.split, X))

#train Word2Vec on the token lists (sg=1 selects skip-gram; words seen fewer than min_count times are dropped)
w2v_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=45,
    min_count=2,
    workers=5,
    sg=1,
)
#sentences

Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


# CNN Model

In [28]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPool1D, Flatten, Dense
from tensorflow.keras.models import Sequential
import matplotlib.pyplot as plt

In [29]:
#fit a Keras tokenizer on the cleaned sentences and encode each one as a list of word ids
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
word_index = tokenizer.word_index
X_sequence = tokenizer.texts_to_sequences(X)

#one extra slot on top of the word count (Keras Tokenizer word ids start at 1, leaving 0 free)
vocab_size = len(word_index) + 1

In [30]:
#post-pad every sequence up to the length of the longest one
max_length = max(map(len, X_sequence))
X_pad = pad_sequences(X_sequence, padding='post', maxlen=max_length)
#X_pad

In [31]:
#create a weight matrix for the embedding layer
#the width comes from the trained Word2Vec vectors rather than a second hard-coded 100,
#so changing vector_size in the Word2Vec cell cannot cause a shape mismatch in the row assignment below
embedding_dim = w2v_model.wv.vector_size
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    #words missing from the Word2Vec vocabulary keep their all-zero row
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

In [32]:
#hold out 30% of the rows for testing; stratify=y splits with the label distribution taken into account
X_train, X_test, y_train, y_test = train_test_split(
    X_pad,
    y,
    test_size=0.30,
    random_state=42,
    stratify=y,
)

In [33]:
#define CNN model
#size the softmax output from the classes the label encoder actually found,
#instead of a hard-coded 14 that silently goes stale if the label set changes
num_classes = len(label_encoder.classes_)

model = Sequential()
#embedding layer starts from the Word2Vec weight matrix and is fine-tuned during training (trainable=True)
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=True))
#1D convolution over windows of 5 consecutive tokens
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
#keep the strongest response of each filter across the sequence
model.add(GlobalMaxPool1D())
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
#one probability per class
model.add(Dense(num_classes, activation='softmax'))

In [None]:
#compile with Adam and the sparse (integer-label) cross-entropy loss
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#NOTE: the test split is also passed as validation data, so the val_* metrics are measured on X_test/y_test
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=35,
    epochs=15,
)
#training history (loss/accuracy values recorded by fit) as a DataFrame
metrics_df = pd.DataFrame(history.history)

Epoch 1/15


In [None]:
#plot training and validation loss against the epoch index
plt.figure(figsize=(10, 5))
for curve in ('loss', 'val_loss'):
    plt.plot(metrics_df.index, metrics_df[curve])
#legend entries follow the plotting order above
plt.legend(['Training Loss', 'Validation Loss'])
plt.title('CNN Mood Words')
plt.xlabel('Epochs')
plt.ylabel('Categorical Crossentropy Loss')
plt.show()