In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from sklearn.metrics import roc_auc_score

In [3]:
MAX_SEQUENCE_LENGTH= 30
MAX_VOCAB_SIZE =20000
EMBEDDING_DIM=100
VALIDATION_SPLIT =0.2
BATCH_SIZE =128
EPOCHS =10

In [5]:
word2vec = {}
with open (os.path.join("glove.6B.100d.txt"), encoding='utf-8') as f:
    for line in f:
        values=line.split()
        word=values[0]
        vec = np.asarray(values[1:], dtype="float32")
        word2vec[word] =vec    

Set up the Data

In [6]:
df = pd.read_csv("data/popular_data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,askreddit,worldnews,funny,gaming,news,movies,tifu,mildlyinteresting,explainlikeimfive,...,todayilearned,jokes,aww,videos,lifeprotips,twoxchromosomes,oldschoolcool,art,dataisbeautiful,amitheasshole
0,0,Chris Rock is performing a standup gig tonight...,Zelenskyy says 'horrors' of Russian invasion c...,Will Smith arrives at the Oscars after party:,"A player in Elden Ring named ""Let me solo her""...",Jackson confirmed as first Black female high c...,"Hello, I’m Nicolas Cage and welcome to Ask Me ...",TIFU by thinking my son was having gay sex whe...,This pet shop don’t sell bunnies at Easter,ELI5: How do “hostile takeovers” work? Is ther...,...,TIL that after Lance Armstrong was stripped of...,We should have a TV show where illegal immigra...,When your cat trusts you so much that she brin...,Jim Carrey on Will Smith assaulting Chris Rock...,LPT If you're planning on visiting San Francis...,40% of teen pregnancies in 15 y/o girls involv...,Willem Dafoe 1981,"Cheers!, Me, Digital, 2022",[OC] I wondered why girls put their Snapchat i...,AITA for getting mad my artist hid their initi...
1,1,"What is your best insult, WITHOUT using curse ...",Russian warship Moskva has sunk - state media ...,"My wife, using her violin as weaponized sarcasm.",One of my favourite Easter Eggs in gaming,DC Police Find 5 Fetuses in Home of Anti-Abort...,"Gilbert Gottfried, Comedian and ‘Aladdin’ Star...",TIFU when my mother caught me eating my girlfr...,My dad wrote Isaac Asimov a question when he w...,ELI5: Why does the economy require to keep gro...,...,"TIL dogs ""play sneeze"" when they're feeling ex...",A lesbian named Linda went to the beach. She u...,Camping with Golden Retrievers,Johnny Depp’s reaction to the court finally ge...,LPT If you feel tired and want to sleep with k...,U.S. Congresswoman Marjorie Taylor Greene defi...,My grandma & grandpa in 1955,"Sis, get out the way I want to see the rainbow...",[OC] Which media organizations are trusted mor...,AITA for embarrassing my FIL after I repeatedl...
2,2,"People with ‘street smarts’, what is your most...",Canada to Ban Foreigners From Buying Homes as ...,I think a kid is stuck inside the body of this...,Time to grab the broom,Colorado now guarantees the right to abortion ...,WillSmith Banned from Attending Oscars Ceremon...,TIFU by showing my college friends a picture o...,The book I borrowed was last lended 41 years ago,ElI5 How can restaurants leave ketchup and mus...,...,TIL: Robin Williams autopsy revealed he had ab...,I am getting so sick of millennials and their ...,Leopard getting weighed,DJ Khaled’s disastrous Hot Ones episode. Quits...,LPT: wear earplugs to loud concert venues. Tin...,Spain makes it a crime for pro-lifers to harra...,Now I understand why teens look like 30 year o...,"me with pearl earrings, me, watercolour, 2020",Teachers Spend 3x More On Classroom Expenses T...,AITA for refusing to get on a flight?
3,3,"Old Redditors, what's a must-know story of Red...",Spain makes it a crime for pro-lifers to harra...,"Help, please!",DOOM on a pregnancy test,Alex Jones' Infowars files for bankruptcy in w...,we told nicolas cage that his AMA had more com...,TIFU by letting my gf suck my dick in front of...,My $1 inheritance check,ELI5: Why do wheelbarrows use only 1 wheel? Wo...,...,TIL the IRS still operates largely on mainfram...,"The IRS decides to audit Grandpa, and summons ...",Squirrel makes a home outside a window and the...,Johnny Depp Testifies On Why He's Suing Amber ...,LPT: If you’re stuck on a call you don’t want ...,If Jada Pinkett Smith thought that Chris Rock ...,Amy Lee of the band Evanescence going to schoo...,“Pearl with a ‘Girl with a pearl earring’ earr...,[OC] Comparing the GDP of US States with that ...,AITA for walking out of the babyshower my InLa...
4,4,Your toilet is now sentient. Would you prefer ...,NATO will deploy a permanent full-scale milita...,Canadians according to a Japanese textbook,pretty ez ngl...,Detroit illegally overtaxed homeowners $600M r...,Keanu Reeves Films Pulled from Chinese Streami...,TIFU by thinking my son was eating hummus when...,My thick glasses lenses look like ice cubes,ELI5: How can scavenger animals (that feast on...,...,TIL that in the years preceding the French Rev...,A man boarded an airplane and took his seat. A...,[OC] I think I have a new roommate,Roman Polanski getting a standing ovation at t...,LPT: If you spy an elderly person alone at McD...,"No, I’m not apologizing for reacting negativel...",Pierce Brosnan in 1985 was ridiculously handsome.,"""Whale of a Problem"" by Phillipines Greenpeace...",[OC] Oh Bananada,AITA for favoring my scapegoat brother over hi...


In [7]:
train = pd.DataFrame()
train['title'] = []
for i in df.columns[1:]:
    train[i] = []

for label in df.columns[1:]:
    for sentence in df[label]:
        train = train.append({'title': sentence, label:1},ignore_index=True)
        train = train.fillna(0)

train.head()


Unnamed: 0,title,askreddit,worldnews,funny,gaming,news,movies,tifu,mildlyinteresting,explainlikeimfive,...,todayilearned,jokes,aww,videos,lifeprotips,twoxchromosomes,oldschoolcool,art,dataisbeautiful,amitheasshole
0,Chris Rock is performing a standup gig tonight...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"What is your best insult, WITHOUT using curse ...",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"People with ‘street smarts’, what is your most...",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Old Redditors, what's a must-know story of Red...",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Your toilet is now sentient. Would you prefer ...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
possible_labels = ['askreddit',
'worldnews',
'funny',
'gaming',
'news',
'movies',
'tifu',
'mildlyinteresting',
'explainlikeimfive',
'pics',
'todayilearned',
'jokes',
'aww',
'videos',
'lifeprotips',
'twoxchromosomes',
'oldschoolcool',
'art',
'dataisbeautiful',
'amitheasshole']

Preprocessing and Tokenizing

In [10]:
from keras.preprocessing.text import Tokenizer

sentences = train["title"].fillna("NO_TITLE_EMPTY").values
targets = train[possible_labels].values

tokenizer= Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
#sequences
#tokenizer.word_index
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

Preprocessing the Data Text

In [11]:
print("max sequence length:", max(len(s) for s in sequences))
print("min sequence length:", min(len(s) for s in sequences))
s = sorted(len(s) for s in sequences)
print("median sequence length:", s[len(s) // 2])

print("max word index:", max(max(seq) for seq in sequences if len(seq) > 0))


# get word -> integer mapping
word2idx = tokenizer.word_index
print('Found %s unique tokens.' % len(word2idx))

max sequence length: 58
min sequence length: 1
median sequence length: 12
max word index: 7108
Found 7108 unique tokens.


In [12]:
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word2idx.items():
  if i < MAX_VOCAB_SIZE:
    embedding_vector = word2vec.get(word)
    if embedding_vector is not None:
      # words not found in embedding index will be all zeros.
      embedding_matrix[i] = embedding_vector

In [13]:
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False
)

In [20]:
input_ = Input(shape=(MAX_SEQUENCE_LENGTH,)) #Dimensions

x=embedding_layer(input_)

x=Conv1D(128,2,activation="relu")(x)
x=MaxPooling1D(2)(x)

x=Conv1D(128,2,activation="relu")(x)
x=MaxPooling1D(2)(x)

x=Conv1D(128,2,activation="relu")(x)
x=GlobalMaxPooling1D()(x)

x=Dense(128,activation="relu")(x)

output=Dense(len(possible_labels), activation="sigmoid")(x)

In [21]:
model = Model(input_, output)
model.compile(
  loss='binary_crossentropy',
  optimizer='rmsprop',
  metrics=['accuracy']
)

## Train the Model

In [22]:
r = model.fit(
  data,
  targets,
  batch_size=BATCH_SIZE,
  epochs=EPOCHS,
  validation_split=VALIDATION_SPLIT
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [29]:
test_text = "TIL that my neural network sucks."
test_seq = tokenizer.texts_to_sequences(test_text)
test_pad = pad_sequences(test_seq,maxlen=MAX_SEQUENCE_LENGTH)
subreddits=model.predict(test_pad)[0].round(1)

In [30]:
len(possible_labels)

20

In [31]:
for i in range(0,20):
    print(subreddits[i] ,possible_labels[i])

0.1 askreddit
0.1 worldnews
0.3 funny
0.2 gaming
0.1 news
0.2 movies
0.0 tifu
0.1 mildlyinteresting
0.0 explainlikeimfive
0.1 pics
0.1 todayilearned
0.1 jokes
0.3 aww
0.1 videos
0.1 lifeprotips
0.1 twoxchromosomes
0.0 oldschoolcool
0.0 art
0.0 dataisbeautiful
0.0 amitheasshole
