## 1. Installing the packages

In [47]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
import re
from tensorflow.python.keras import backend
from tensorflow.python.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout

In [48]:
import pandas as pd
import numpy as np
import seaborn as sns

## 2. Cleaning the Data

In [49]:
from google.colab import drive #Dataset mounted from google drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [50]:
df = pd.read_csv('/content/drive/MyDrive/poddf.csv')

In [51]:
df

Unnamed: 0,index,Name,Rating_Volume,Rating,Genre,Description
0,0,Fresh Air,10188,4.46133,Arts,"Fresh Air from WHYY, the Peabody Award-winning..."
1,0,The Moth,10154,4.69982,Performing,"Since its launch in 1997, The Moth has present..."
2,0,99% Invisible,12303,4.8693,Design,"Design is everywhere in our lives, perhaps mos..."
3,0,iFanboy.com Comic Book Podcast,1335,4.79551,Visual,The iFanboy.com Comic Book Podcast is a weekly...
4,0,Myths and Legends,11128,4.88282,Literature,"Jason Weiser tells stories from myths, legends..."
...,...,...,...,...,...,...
13627,0,Puromac,166,4.87952,Tech News,Puromac es una conversación sobre todo el mund...
13628,0,AVexcel,51,4.84314,Tech News,AVexcel is your guide to the best in home thea...
13629,0,Take a Network Break And Get The Latest Networ...,64,4.95313,Tech News,"Stay current with IT news on vendor moves, new..."
13630,0,AVexcel,52,4.82692,Tech News,AVexcel is your guide to the best in home thea...


In [52]:
df.Genre.value_counts().head(5) #visualizing top 5 genres

Business News      249
Investing          245
Comedy             244
Tech News          243
Places & Travel    242
Name: Genre, dtype: int64

In [53]:
temp1=df[df["Genre"] == "Business News"]
temp2=df[df["Genre"] == "Investing"]
temp3=df[df["Genre"] == "Comedy"]
temp4=df[df["Genre"] == "Tech News"]
temp5=df[df["Genre"] == "Places & Travel"]

In [14]:
merged_df = pd.concat([temp1, temp2, temp3,temp4,temp5]) #merge only 5 genres for better accuracy

In [54]:
merged_df.Genre.value_counts()

Business News      249
Investing          245
Comedy             244
Tech News          243
Places & Travel    242
Name: Genre, dtype: int64

In [55]:
df=merged_df

In [56]:
df

Unnamed: 0,index,Name,Rating_Volume,Rating,Genre,Description
1478,0,Marketplace with Kai Ryssdal,2438,4.71575,Business News,Marketplace® is the leading business news prog...
1490,0,WSJ What's News,358,3.97207,Business News,Top stories. Timely insights. Mirrored after t...
1497,0,Bloomberg Surveillance,382,3.86911,Business News,"Tom Keene, Jon Ferro, and Pimm Fox have the ec..."
1498,0,A Conversation with Scott Galloway,9,4.88889,Business News,"Scott Galloway, professor of marketing at NYU ..."
1499,0,Economia,16,4.625,Business News,"Comentários de Bruno Blecher, Carlos Alberto S..."
...,...,...,...,...,...,...
11392,0,The Vegas Tourist Podcast,9,2.44444,Places & Travel,Mark Anthony - Your podcast guide to Las Vegas...
11393,0,Disney Hipster Podcast,243,4.61728,Places & Travel,A podcast critiquing the aesthetic choices of ...
11394,0,Drinky Fun Time,88,4.84091,Places & Travel,It’s about drinking. Featuring celebrity guest...
11395,0,Capture The Magic - Disney World Podcast | Dis...,91,4.81318,Places & Travel,The Capture The Magic Podcast is a fun and inf...


In [57]:
df=merged_df.reset_index(drop=True) #reset the starting index from 0

In [58]:
df

Unnamed: 0,index,Name,Rating_Volume,Rating,Genre,Description
0,0,Marketplace with Kai Ryssdal,2438,4.71575,Business News,Marketplace® is the leading business news prog...
1,0,WSJ What's News,358,3.97207,Business News,Top stories. Timely insights. Mirrored after t...
2,0,Bloomberg Surveillance,382,3.86911,Business News,"Tom Keene, Jon Ferro, and Pimm Fox have the ec..."
3,0,A Conversation with Scott Galloway,9,4.88889,Business News,"Scott Galloway, professor of marketing at NYU ..."
4,0,Economia,16,4.625,Business News,"Comentários de Bruno Blecher, Carlos Alberto S..."
...,...,...,...,...,...,...
1218,0,The Vegas Tourist Podcast,9,2.44444,Places & Travel,Mark Anthony - Your podcast guide to Las Vegas...
1219,0,Disney Hipster Podcast,243,4.61728,Places & Travel,A podcast critiquing the aesthetic choices of ...
1220,0,Drinky Fun Time,88,4.84091,Places & Travel,It’s about drinking. Featuring celebrity guest...
1221,0,Capture The Magic - Disney World Podcast | Dis...,91,4.81318,Places & Travel,The Capture The Magic Podcast is a fun and inf...


In [59]:
def print_plot(index):
    example = df[df.index == index][['Description', 'Genre']].values[0]
    if len(example) > 0:
        print(example[0])
        print('Genre:', example[1])
print_plot(60) #print description and genre at this index

Exclusive, insightful audio interviews by our staff with careers/security leading practitioners and thought-leaders
Genre: Business News


In [60]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [61]:
df = df.reset_index(drop=True)
replace_by_space = re.compile('[/(){}\[\]\|@,;]')
bad_symbols = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):

    text = text.lower() # lowercase text
    text = replace_by_space.sub(' ', text) # replace 'replace_by_space' symbols by space in text and substitute the matched string with space.
    text = bad_symbols.sub('', text) # remove symbols which are in 'bad_symbols' from text and substitute the matched string with nothing. 
    text = text.replace('x', '')
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwords from text
    return text
df['Description'] = df['Description'].apply(clean_text)
df['Description'] = df['Description'].str.replace('\d+', '')

In [62]:
df

Unnamed: 0,index,Name,Rating_Volume,Rating,Genre,Description
0,0,Marketplace with Kai Ryssdal,2438,4.71575,Business News,marketplace leading business news program nati...
1,0,WSJ What's News,358,3.97207,Business News,top stories timely insights mirrored popular w...
2,0,Bloomberg Surveillance,382,3.86911,Business News,tom keene jon ferro pimm fo economy markets su...
3,0,A Conversation with Scott Galloway,9,4.88889,Business News,scott galloway professor marketing nyu stern f...
4,0,Economia,16,4.625,Business News,comentrios de bruno blecher carlos alberto sar...
...,...,...,...,...,...,...
1218,0,The Vegas Tourist Podcast,9,2.44444,Places & Travel,mark anthony podcast guide las vegas beyond he...
1219,0,Disney Hipster Podcast,243,4.61728,Places & Travel,podcast critiquing aesthetic choices disney co...
1220,0,Drinky Fun Time,88,4.84091,Places & Travel,drinking featuring celebrity guests hosts dan ...
1221,0,Capture The Magic - Disney World Podcast | Dis...,91,4.81318,Places & Travel,capture magic podcast fun informative show dis...


## 3. LSTM Modelling

In [63]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 5000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['Description'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 9271 unique tokens.


In [64]:
X = tokenizer.texts_to_sequences(df['Description'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (1223, 250)


In [65]:
X

array([[   0,    0,    0, ...,  144,  300,  675],
       [   0,    0,    0, ...,  402,  104,  214],
       [   0,    0,    0, ...,   43,    3,  104],
       ...,
       [   0,    0,    0, ..., 3563, 2651, 1235],
       [   0,    0,    0, ...,   34,  262, 1081],
       [   0,    0,    0, ..., 4143, 2526,  363]], dtype=int32)

In [66]:
df

Unnamed: 0,index,Name,Rating_Volume,Rating,Genre,Description
0,0,Marketplace with Kai Ryssdal,2438,4.71575,Business News,marketplace leading business news program nati...
1,0,WSJ What's News,358,3.97207,Business News,top stories timely insights mirrored popular w...
2,0,Bloomberg Surveillance,382,3.86911,Business News,tom keene jon ferro pimm fo economy markets su...
3,0,A Conversation with Scott Galloway,9,4.88889,Business News,scott galloway professor marketing nyu stern f...
4,0,Economia,16,4.625,Business News,comentrios de bruno blecher carlos alberto sar...
...,...,...,...,...,...,...
1218,0,The Vegas Tourist Podcast,9,2.44444,Places & Travel,mark anthony podcast guide las vegas beyond he...
1219,0,Disney Hipster Podcast,243,4.61728,Places & Travel,podcast critiquing aesthetic choices disney co...
1220,0,Drinky Fun Time,88,4.84091,Places & Travel,drinking featuring celebrity guests hosts dan ...
1221,0,Capture The Magic - Disney World Podcast | Dis...,91,4.81318,Places & Travel,capture magic podcast fun informative show dis...


In [67]:
Y = pd.get_dummies(df['Genre']).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (1223, 5)


In [68]:
Y

array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0]], dtype=uint8)

In [69]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 7)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(1100, 250) (1100, 5)
(123, 250) (123, 5)


In [72]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(5, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 10
batch_size = 32

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10


In [73]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(5, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 10
batch_size = 64

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [79]:
new_summary  = ['I am so hungry']
seq = tokenizer.texts_to_sequences(new_summary )
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(padded)
labels = ['Business News','Investing','Comedy','Tech News','Places & Travel']
print(pred, labels[np.argmax(pred)])


[[0.23002453 0.26610824 0.09657267 0.13509807 0.2721965 ]] Places & Travel


In [75]:
new_summary  = ['I feel really unproductive today. Wish I had done more studies']
seq = tokenizer.texts_to_sequences(new_summary )
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(padded)
labels = ['Business News','Investing','Comedy','Tech News','Places & Travel']
print(pred, labels[np.argmax(pred)])

[[0.22680376 0.16453327 0.14937656 0.32697406 0.13231234]] Tech News
