In [1]:
import numpy as np
import tensorflow as tf
import keras
import os
import glob
from glob import glob
import pathlib
import csv
import pandas as pd
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization, Embedding, Conv1D, MaxPooling1D, Dense, Input, Dropout, GlobalMaxPooling1D
from keras.models import Model
from keras import regularizers

In [2]:
# Directory that is searched for the dataset .csv files (Colab workspace).
path = "/content/" # Using colab

In [3]:
# Load the dataset: merge the fine-grained train/test/val .csv files (17 subreddits)
# found directly under `path` into a single frame with a fresh 0..n-1 index.
# glob() returns matches in arbitrary, filesystem-dependent order, so the paths are
# sorted to make the concatenated row order (and therefore the seeded shuffle/split
# performed later) reproducible across runs and machines.
files = sorted(glob(os.path.join(path, "*.csv")))
if not files:
    # Fail with a clear message instead of pandas' generic "No objects to concatenate".
    raise FileNotFoundError("No .csv files found in %r" % path)
dataframe = pd.concat((pd.read_csv(f) for f in files), ignore_index=True)

In [4]:
# Sanity check: (rows, columns) of the merged frame.
dataframe.shape

(25500, 2)

In [5]:
# Order the rows by subreddit name so that posts of the same class are contiguous.
dataframe = dataframe.sort_values("label")

In [6]:
# Preview the first five rows of the frame.
dataframe.head()

Unnamed: 0,label,text
15492,AskReddit,How much do you like or dislike girls who alwa...
12205,AskReddit,What's a 10/10 album from the last 15 years by...
20637,AskReddit,Whats the most racist joke you know?
1401,AskReddit,I find gauges to be unattractive and stupid. R...
12190,AskReddit,"Anyone who won a ""lifetime supply"" of somethin..."


In [7]:
# Encode every subreddit name as an integer class id and collect the post texts.
# pd.factorize numbers the labels in order of first appearance, so each row's id
# always matches its own label. The earlier running-counter loop was only correct
# when all rows of a subreddit were contiguous (i.e. the frame had been pre-sorted);
# on a sorted frame both approaches produce identical ids and names.
label_codes, unique_labels = pd.factorize(dataframe['label'])
label_names = unique_labels.tolist()   # label_names[class_id] -> subreddit name
labels = label_codes.tolist()          # one integer class id per row
features = dataframe['text'].tolist()  # post text, in the same row order as `labels`
print("There are %d samples" % (len(features)))

There are 25500 samples


In [8]:
# Report how many distinct classes were found.
num_subreddits = len(label_names)
print("And there are %d different subreddits" % num_subreddits)

And there are 17 different subreddits


In [9]:
seed = 1219
# Shuffle texts and labels in place, each with a freshly seeded generator:
# two equal-length lists receive the same permutation, so (text, label) pairs
# stay aligned.
for sequence in (features, labels):
  rng = np.random.RandomState(seed)
  rng.shuffle(sequence)

# Hold out the last 20% of the shuffled samples.
validation_split = 0.2
num_test_features = int(validation_split * len(features))
train_x, test_x = features[:-num_test_features], features[-num_test_features:]
train_y, test_y = labels[:-num_test_features], labels[-num_test_features:]

In [10]:
# The held-out texts and labels must have the same length.
print(len(test_x), len(test_y), sep="\n")

5100
5100


In [11]:
# Text -> integer token ids: vocabulary capped at 25000 tokens, every sample
# padded/truncated to 150 ids. The vocabulary is learned from the training texts only.
vectorizer = TextVectorization(
    max_tokens=25000,
    output_sequence_length=150,
)
text_ds = tf.data.Dataset.from_tensor_slices(train_x)
text_ds = text_ds.batch(128)
vectorizer.adapt(text_ds)

In [12]:
# Map each vocabulary token to its position in the vectorizer's vocabulary list.
vocabulary = vectorizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(vocabulary)}

In [13]:
# Fetch the pre-trained GloVe 6B word vectors (822 MB zip) and extract them here.
# -nc (no-clobber) skips the download when glove.6B.zip already exists, and
# `unzip -n` never overwrites files that were already extracted, so re-running this
# cell neither re-downloads the archive nor stops at unzip's interactive
# overwrite prompt.
!wget -nc http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q -n glove.6B.zip

--2023-03-21 03:01:34--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-03-21 03:01:35--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-03-21 03:01:35--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [14]:
# Code from Keras
# Parse the GloVe text file into a dict {word: float32 coefficient vector}.
# The earlier os.path.join(os.path.expanduser("~"), "/content/...") silently dropped
# the home directory because the second component is absolute; the path it actually
# produced is stated directly here.
path_to_glove_file = "/content/glove.6B.100d.txt"
embeddings_index = {}
# Decode explicitly as UTF-8 instead of depending on the platform's default encoding.
with open(path_to_glove_file, encoding="utf8") as f:
  for line_number, line in enumerate(f):
    if line_number < 10:
      # Preview the first ten raw lines to show the file format.
      print(line)
    # Each line is "<word> <coef_1> <coef_2> ...": split off the word, parse the rest.
    word, coefs = line.split(maxsplit=1)
    coefs = np.fromstring(coefs, "f", sep=" ")
    embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

the -0.038194 -0.24487 0.72812 -0.39961 0.083172 0.043953 -0.39141 0.3344 -0.57545 0.087459 0.28787 -0.06731 0.30906 -0.26384 -0.13231 -0.20757 0.33395 -0.33848 -0.31743 -0.48336 0.1464 -0.37304 0.34577 0.052041 0.44946 -0.46971 0.02628 -0.54155 -0.15518 -0.14107 -0.039722 0.28277 0.14393 0.23464 -0.31021 0.086173 0.20397 0.52624 0.17164 -0.082378 -0.71787 -0.41531 0.20335 -0.12763 0.41367 0.55187 0.57908 -0.33477 -0.36559 -0.54857 -0.062892 0.26584 0.30205 0.99775 -0.80481 -3.0243 0.01254 -0.36942 2.2167 0.72201 -0.24978 0.92136 0.034514 0.46745 1.1079 -0.19358 -0.074575 0.23353 -0.052062 -0.22044 0.057162 -0.15806 -0.30798 -0.41625 0.37972 0.15006 -0.53212 -0.2055 -1.2526 0.071624 0.70565 0.49744 -0.42063 0.26148 -1.538 -0.30223 -0.073438 -0.28312 0.37104 -0.25217 0.016215 -0.017099 -0.38984 0.87424 -0.72569 -0.51058 -0.52028 -0.1459 0.8278 0.27062

, -0.10767 0.11053 0.59812 -0.54361 0.67396 0.10663 0.038867 0.35481 0.06351 -0.094189 0.15786 -0.81665 0.14172 0.21939 0.58505 -0.52158

In [15]:
# Code from Keras
embedding_dim = 100
num_tokens = len(vocabulary) + 2
hits, misses = 0, 0

# Prepare embedding matrix: row i holds the GloVe vector of the vocabulary word
# with index i. Rows start as zeros and stay all-zero for words not found in the
# embedding index; this includes the representation for "padding" and "OOV".
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1
print("Converted %d words (%d misses)(meaning that the words were not in the embedding matrix" % (hits, misses))

Converted 20328 words (4672 misses)(meaning that the words were not in the embedding matrix


In [16]:
# Non-trainable embedding layer whose weights are initialised from the GloVe matrix.
pretrained_weights = keras.initializers.Constant(embedding_matrix)
embedding_layer = Embedding(
    input_dim=num_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=pretrained_weights,
    trainable=False,
)

In [27]:
# From Keras
# 1D-CNN text classifier: embedding -> conv/pool -> conv/global-pool -> dense -> softmax
# over the subreddit classes, with L2 weight penalties and dropout for regularisation.
# The input tensor is named `int_sequences_input` instead of `input` so this cell no
# longer shadows Python's built-in input() for the rest of the kernel session, and
# every layer is referenced consistently through the `layers` namespace.
int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)
X = layers.Conv1D(128, 5, activation="relu", kernel_regularizer=regularizers.l2(0.001))(embedded_sequences)
X = layers.MaxPooling1D(5)(X)
X = layers.Dropout(0.4)(X)
X = layers.Conv1D(128, 5, activation="relu", kernel_regularizer=regularizers.l2(0.001))(X)
X = layers.GlobalMaxPooling1D()(X)
X = layers.Dropout(0.4)(X)
X = layers.Dense(128, activation="relu", kernel_regularizer=regularizers.l2(0.001))(X)
X = layers.Dropout(0.4)(X)
# One output unit per subreddit; softmax yields class probabilities.
output = layers.Dense(len(label_names), activation="softmax")(X)
model = keras.Model(int_sequences_input, output)
model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 100)         2500200   
                                                                 
 conv1d_6 (Conv1D)           (None, None, 128)         64128     
                                                                 
 max_pooling1d_3 (MaxPooling  (None, None, 128)        0         
 1D)                                                             
                                                                 
 dropout_3 (Dropout)         (None, None, 128)         0         
                                                                 
 conv1d_7 (Conv1D)           (None, None, 128)         82048     
                                                           

In [22]:
# Convert the raw strings into integer token-id matrices. Each sample is wrapped in a
# one-element list so the vectorizer receives an (n_samples, 1) array of strings.
x_train = vectorizer(np.array([[s] for s in train_x])).numpy()
x_test = vectorizer(np.array([[s] for s in test_x])).numpy()

# Column vectors of integer class ids. reshape(-1, 1) infers the row count from the
# data instead of hard-coding 20400 / 5100, which raised a ValueError as soon as the
# dataset size or the split ratio changed.
y_train = np.array(train_y).reshape(-1, 1)
y_test = np.array(test_y).reshape(-1, 1)

In [23]:
# Shape of the vectorized held-out inputs.
print(x_test.shape)

(5100, 150)


In [28]:
# Sparse categorical cross-entropy is used because the targets are integer class ids.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["acc"],
)
# NOTE(review): validation_data is the (x_test, y_test) split, i.e. the validation
# metrics are computed on the same samples that serve as the test set.
model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    epochs=80,
    batch_size=128,
)

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


<keras.callbacks.History at 0x7fee9458dd00>

In [29]:
# Shapes of the held-out inputs and targets, one per line.
print(x_test.shape, y_test.shape, sep="\n")

(5100, 150)
(5100, 1)


In [30]:
# Evaluate on the held-out split; the displayed result is the value returned by evaluate().
model.evaluate(x=x_test, y=y_test)



[1.5964751243591309, 0.6592156887054443]

In [35]:
# Chain the vectorizer and the trained classifier into one model that accepts raw strings.
string_input = keras.Input(shape=(1,), dtype="string")
x = vectorizer(string_input)
preds = model(x)
fin_model = keras.Model(string_input, preds)

# Post taken directly from AskReddit
sample_post = "What famous person didn't deserve all the hate that they got?"
probabilities = fin_model.predict([[sample_post]])

# Map the highest-probability class id back to its subreddit name.
predicted_class = np.argmax(probabilities[0])
label_names[predicted_class]



'AskReddit'

In [32]:
# Subreddit names; a name's list position is used as the index when decoding predictions.
print(label_names)

['AskReddit', 'Futurology', 'Jokes', 'Showerthoughts', 'WritingPrompts', 'askscience', 'bestof', 'explainlikeimfive', 'history', 'nosleep', 'personalfinance', 'politics', 'science', 'television', 'todayilearned', 'videos', 'worldnews']


In [None]:
# Done!