In [2]:
import json
import pandas as pd
import numpy as np

In [3]:
df = pd.read_json("nlp_train.json")

In [4]:
df = df.transpose()

In [5]:
import re
from nltk.corpus import stopwords
df = df.reset_index(drop=True)
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-zA-Z#+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub(' ', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text
df['body'] = df['body'].apply(clean_text)
df['body'] = df['body'].str.replace('\d+', '')

In [6]:
df.head()

Unnamed: 0,body,subreddit,created_utc,author,link_id,parent_id,emotion,complete
0,answering question criticism individual referr...,worldnews,1584474347,lostdoty13,t3_fkb1jo,t1_fkrog8a,"{'anger': True, 'anticipation': False, 'disgus...",True
1,going start today discussion thread personal o...,CoronavirusCA,1583695737,ErikCavey,t3_ffhe1g,t3_ffhe1g,"{'anger': True, 'anticipation': True, 'disgust...",False
2,announcing self quarantined paints picture ma...,Coronavirus,1582324871,jimkurth81,t3_f76ab0,t1_fiaxs2j,"{'anger': True, 'anticipation': True, 'disgust...",False
3,likewise sorry offended actually immunocomprom...,China_Flu,1583140191,DickGrimes79,t3_fc8ar8,t1_fj9aaqt,"{'anger': True, 'anticipation': False, 'disgus...",False
4,people infected experience high fever cough sh...,worldnews,1583741759,Fresherty,t3_ffebea,t1_fjzd331,"{'anger': False, 'anticipation': False, 'disgu...",False


In [7]:
from collections import defaultdict
y = defaultdict(list)
for i, post in enumerate(df['emotion']):
    for emotion in post:
        if df['emotion'][i][emotion]:
            y[emotion].append(1)
        else:
            y[emotion].append(0)

In [8]:
labels = pd.DataFrame(y)

In [99]:
labels

Unnamed: 0,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust,neutral
0,1,0,1,0,0,0,0,1,0,0,0,0
1,1,1,1,1,0,0,0,1,0,0,0,0
2,1,1,1,1,0,0,0,1,0,0,0,0
3,1,0,1,1,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1488,1,0,1,1,0,0,0,1,1,0,0,0
1489,1,0,1,1,0,0,0,0,1,0,0,0
1490,0,1,0,1,0,0,1,0,0,0,1,0
1491,0,1,0,1,0,0,0,0,1,0,0,0


In [100]:
from keras.preprocessing.text import Tokenizer

# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each post
MAX_SEQUENCE_LENGTH = 250

EMBEDDING_DIM = 100
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['body'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 28327 unique tokens.


In [101]:
from keras.preprocessing.sequence import pad_sequences
X = tokenizer.texts_to_sequences(df['body'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data:', X.shape)

Shape of data: (1493, 250)


In [102]:
y = labels.values
print('Shape of labels:', y.shape)

Shape of labels: (1493, 12)


In [103]:
#subreddits = pd.get_dummies(df["subreddit"]).values

In [104]:
X = np.concatenate((X, subreddits), axis=1)

In [106]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state = 42)

In [108]:
from keras import Sequential
from keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from keras.callbacks import EarlyStopping
from keras.losses import KLD

model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(12, activation='sigmoid'))
model.compile(loss="binary_crossentropy", optimizer='adam', metrics=['accuracy'])

epochs = 25
batch_size = 64

history = model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Train on 1208 samples, validate on 135 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [112]:
accr = model.evaluate(x_test,y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.550
  Accuracy: 0.747


In [114]:
check = tokenizer.texts_to_sequences(["I love"])
check = pad_sequences(check, maxlen=MAX_SEQUENCE_LENGTH)

In [115]:
check.shape

(1, 250)

In [116]:
model.predict(check)

array([[0.01627871, 0.13510546, 0.03018257, 0.0596163 , 0.06014183,
        0.0330528 , 0.16225225, 0.04297176, 0.03085311, 0.03399587,
        0.11988373, 0.59112555]], dtype=float32)

In [96]:
y_test[100]

array([0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0])

In [80]:
y_anger = labels['disgust'].values

In [81]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y_anger, test_size = 0.10, random_state = 42)

In [60]:
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression(C=0.1,random_state=0).fit(x_train, y_train)
lr_clf.score(x_test, y_test)

0.46

In [82]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
sc = MinMaxScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [57]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(max_depth = 30, n_estimators = 200)
rf_clf.fit(X_train, y_train)
rf_clf.score(X_test, y_test)

0.5466666666666666

In [83]:
x_train

array([[0.03687913, 0.02323363, 0.01926585, ..., 0.        , 0.        ,
        0.        ],
       [0.26039224, 0.07427896, 0.0873993 , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.02568749, 0.03734931, 0.96747226, ..., 0.        , 0.        ,
        0.        ]])