In [1]:
# import all necessary libraries

import pandas as pd
import re
import numpy as np
import nltk
import string
import html
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF
from nltk.corpus import wordnet
from typing import List

from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
import spacy
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
from random import randint
from numpy import array, argmax, asarray, zeros
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import SimpleRNN, LSTM
from keras.layers import Flatten, Masking
from keras.utils.vis_utils import plot_model

In [46]:
# read the data

df = pd.read_csv('df_lem.csv', lineterminator='\n', index_col = 0)
df.head(2)

Unnamed: 0,podcast_id,title,content,rating,author_id,created_at,itunes_id,slug,itunes_url,podcast_title,category,reviews,reviews_new
0,b313ef8ef0d5b64290d3036ff1bbf2d2,감성 라디오 음악도시,미국 서부에 있는 유학생이에요. 성시경씨 제대 후 라디오 복귀만 기다려오다가 6 월...,5,664CCA7142E9AE8,2011-09-14T13:25:46-07:00,442838670,fm-%EC%9D%8C%EC%95%85%EB%8F%84%EC%8B%9C-%EC%A2...,https://podcasts.apple.com/us/podcast/fm-%EC%9...,FM 음악도시(종영),music,감성 라디오 음악도시 미국 서부에 있는 유학생이에요. 성시경씨 제대 후 라디오 복귀...,감성 라디오 음악도시 미국 서부에 있는 유학생이에요 . 성시경씨 제대 후 라디오 복...
1,abfb842993be20d21bfae7103addc5e9,They’ve really cut back on the content this se...,Last season there was a new pod every 3-4 days...,1,AD790CE113DCBC1,2018-04-11T13:46:47-07:00,1015394113,the-good-phight-for-philadelphia-phillies-fans,https://podcasts.apple.com/us/podcast/the-good...,The Good Phight: for Philadelphia Phillies fans,sports,They’ve really cut back on the content this se...,they ’ ve really cut back on the content this ...


In [47]:
df["category"].value_counts(normalize=True)

comedy        0.16038
society       0.12890
news          0.10412
business      0.07566
sports        0.07178
arts          0.06362
education     0.05976
crime         0.05042
health        0.04706
tv            0.04354
religion      0.04186
leisure       0.03452
history       0.02834
kids          0.02448
music         0.01782
science       0.01640
fiction       0.01552
government    0.00826
technology    0.00756
Name: category, dtype: float64

In [54]:
# merge similar categories into a new or existing category, for minor categories we categorize them as 'others'

import re
def replace_cat(line):
    line = re.sub(r'\b(society|religion|government|history|education|kids)\b', 'society', line)
    line = re.sub(r'\b(tv|leisure|sports|music|fiction|arts)\b', 'entertainment', line)
    line = re.sub(r'\b(science|technology|health|crime)\b', 'others', line)
    return line

In [55]:
df['category'] = df['category'].apply(lambda x: replace_cat(x))

In [56]:
df["category"].value_counts(normalize=True)

society        0.29160
entertament    0.24680
comedy         0.16038
others         0.12144
news           0.10412
business       0.07566
Name: category, dtype: float64

In [58]:
# Concat review title and review content to get more information later

df['reviews_title'] = df['reviews_new'] + ' ' + df['podcast_title']
df['reviews_title'] = df['reviews_title'].astype(str)
df['reviews_title'] = df['reviews_title'].apply(lambda x: x.lower())

In [59]:
# Regex and text preprocessing

def word_replace(line):
    line = re.sub(r'\b(pod(s?|casts?)|listen|love|great|episodes?|just|good|make|time|really)\b', '', line)
    line = re.sub(r'\b(story|talk|people|host|guy|say|don|know|way|work|want|need|best|new|life)\b', '', line)
    line = re.sub(r'\b(guest|thing|think|feel|look|come|use|year|minutes?|lot|thank|favorite)\b', '', line)
    return line

In [60]:
df['reviews_title'] = df['reviews_title'].apply(lambda x: word_replace(x))

In [61]:
labels = df["category"]
docs = df["reviews_title"]

In [62]:
encoder = LabelEncoder()
labels = to_categorical(encoder.fit_transform(labels))

In [63]:
# We use spacy package to remove the stopwords

nlp = spacy.load('en_core_web_md')
stopwords_removed_docs = list(
    map(lambda doc: " ".join([token.text for token in nlp(doc) if not token.is_stop]), docs))

In [64]:
# Tokenize the text
tokenizer = Tokenizer(num_words=10000, oov_token="UNKNOWN_TOKEN")
tokenizer.fit_on_texts(stopwords_removed_docs)

In [65]:
def integer_encode_documents(docs, tokenizer):
    return tokenizer.texts_to_sequences(docs)

In [66]:
def get_max_token_length_per_doc(docs: List[List[str]])-> int:
    return max(list(map(lambda x: len(x.split()), docs)))

# get the max length in terms of token length
max_length = get_max_token_length_per_doc(docs)

In [67]:
MAX_SEQUENCE_LENGTH = 500
# integer encode the documents
encoded_docs = integer_encode_documents(stopwords_removed_docs, tokenizer)
# this is a list of lists, the numbers represent the index position of that word.
# for instance, 33 means the 33rd word in the vocabulary
padded_docs = pad_sequences(encoded_docs, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

In [68]:
padded_docs.shape

(50000, 500)

In [69]:
X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.2)

In [70]:
# Toolkit

VOCAB_SIZE = int(len(tokenizer.word_index) * 1.1)

In [71]:
# Here we use GloVe vectors

def load_glove_vectors():
    embeddings_index = {}
    with open('../datasets/glove.6B.100d.txt') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print('Loaded %s word vectors.' % len(embeddings_index))
    return embeddings_index


embeddings_index = load_glove_vectors()

Loaded 400000 word vectors.


In [72]:
 # create a weight matrix for words in training docs
    
embedding_matrix = zeros((VOCAB_SIZE, 100))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: # check that it is an actual word that we have embeddings for
        embedding_matrix[i] = embedding_vector

In [73]:
# define model

def make_classification_rnn_model(plot=False):
    model = Sequential() # keras model
    model.add(Embedding(VOCAB_SIZE, 100, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
    model.add(Masking(mask_value=0.0)) # masking layer, masks any words that don't have an embedding as 0s.
    model.add(SimpleRNN(units=64, input_shape=(1, MAX_SEQUENCE_LENGTH)))
    model.add(Dense(16))
    model.add(Dense(6, activation='softmax')) # we changed the number of categories from 19 to 6
    
    # Compile the model
    model.compile(
    optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    # summarize the model
    model.summary()
    
    if plot:
        plot_model(model, to_file='model.png', show_shapes=True)
    return model

def make_lstm_classification_model(plot=False):
    model = Sequential()
    model.add(Embedding(VOCAB_SIZE, 100, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
    model.add(Masking(mask_value=0.0)) # masking layer, masks any words that don't have an embedding as 0s.
    model.add(LSTM(units=32, input_shape=(1, MAX_SEQUENCE_LENGTH)))
    model.add(Dense(16))
    model.add(Dense(6, activation='softmax')) # we changed the number of categories from 19 to 6
    
    # Compile the model
    model.compile(
    optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    # summarize the model
    model.summary()
    
    if plot:
        plot_model(model, to_file='model.png', show_shapes=True)
    return model

In [74]:
lstm = make_lstm_classification_model()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 500, 100)          5695400   
                                                                 
 masking_4 (Masking)         (None, 500, 100)          0         
                                                                 
 lstm_1 (LSTM)               (None, 32)                17024     
                                                                 
 dense_8 (Dense)             (None, 16)                528       
                                                                 
 dense_9 (Dense)             (None, 6)                 102       
                                                                 
Total params: 5,713,054
Trainable params: 17,654
Non-trainable params: 5,695,400
_________________________________________________________________


In [75]:
# fit the model

lstm.fit(X_train, y_train, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fdea1e90710>

In [76]:
# evaluate the model

loss, accuracy = lstm.evaluate(X_test, y_test, verbose=1)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 74.820000


In [82]:
predictions_lstm = lstm.predict(X_test)



In [97]:
pred = encoder.inverse_transform(predictions_lstm.argmax(axis=1))
true = encoder.inverse_transform(y_test.argmax(axis=1))

In [99]:
# get the confusion matrix

from sklearn.metrics import confusion_matrix
labels = ['society', 'entertainment', 'comedy', 'news', 'business', 'others']
confusion_matrix = confusion_matrix(true, pred, labels=labels)
confusion_matrix

array([[2273,  293,  121,   65,   82,  120],
       [ 241, 1885,  146,   74,   53,   61],
       [ 110,  218, 1143,   30,   25,   25],
       [  77,   96,   30,  835,   20,   14],
       [  84,   98,   23,   18,  474,   33],
       [ 150,  103,   38,   25,   45,  872]])

In [100]:
cmtx = pd.DataFrame(
    confusion_matrix, 
    index=['society', 'entertainment', 'comedy', 'news', 'business', 'others'], 
    columns=['society', 'entertainment', 'comedy', 'news', 'business', 'others']
)
cmtx

Unnamed: 0,society,entertament,comedy,news,business,others
society,2273,293,121,65,82,120
entertament,241,1885,146,74,53,61
comedy,110,218,1143,30,25,25
news,77,96,30,835,20,14
business,84,98,23,18,474,33
others,150,103,38,25,45,872


In [106]:
# calculate roc-auc score

from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, predictions_lstm, multi_class='ovo')

0.9348939055838481

In [77]:
# Here we try RNN model

rnn = make_classification_rnn_model()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 500, 100)          5695400   
                                                                 
 masking_5 (Masking)         (None, 500, 100)          0         
                                                                 
 simple_rnn_3 (SimpleRNN)    (None, 64)                10560     
                                                                 
 dense_10 (Dense)            (None, 16)                1040      
                                                                 
 dense_11 (Dense)            (None, 6)                 102       
                                                                 
Total params: 5,707,102
Trainable params: 11,702
Non-trainable params: 5,695,400
_________________________________________________________________


In [78]:
# fit the model

rnn.fit(X_train, y_train, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fde8126dc90>

In [81]:
# evaluate the model
loss, accuracy = rnn.evaluate(X_test, y_test, verbose=1)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 69.430000


In [107]:
predictions_rnn = rnn.predict(X_test)



In [109]:
# calculate roc-auc score

from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, predictions_rnn, multi_class='ovo')

0.9087750481287475

In [110]:
pred_rnn = encoder.inverse_transform(predictions_lstm.argmax(axis=1))
true_rnn = encoder.inverse_transform(y_test.argmax(axis=1))

In [111]:
# get the confusion matrix

from sklearn.metrics import confusion_matrix
labels = ['society', 'entertainment', 'comedy', 'news', 'business', 'others']
confusion_matrix_rnn = confusion_matrix(true_rnn, pred_rnn, labels=labels)
confusion_matrix_rnn

array([[2273,  293,  121,   65,   82,  120],
       [ 241, 1885,  146,   74,   53,   61],
       [ 110,  218, 1143,   30,   25,   25],
       [  77,   96,   30,  835,   20,   14],
       [  84,   98,   23,   18,  474,   33],
       [ 150,  103,   38,   25,   45,  872]])

In [112]:
cmtx = pd.DataFrame(
    confusion_matrix_rnn, 
    index=['society', 'entertainment', 'comedy', 'news', 'business', 'others'], 
    columns=['society', 'entertainment', 'comedy', 'news', 'business', 'others']
)
cmtx

Unnamed: 0,society,entertament,comedy,news,business,others
society,2273,293,121,65,82,120
entertament,241,1885,146,74,53,61
comedy,110,218,1143,30,25,25
news,77,96,30,835,20,14
business,84,98,23,18,474,33
others,150,103,38,25,45,872
