In [18]:
import glob
from dalab import read_pickle
import pandas as pd
import numpy as np
from langdetect import detect
from collections import Counter
from nltk import word_tokenize

import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
import spacy
from time import time
import keras
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Dense, Conv1D, MaxPooling1D, Flatten, Dropout, SimpleRNN, GRU, LSTM
from keras.layers import Input, Dense
from keras.models import Model
from keras.models import Sequential
from matplotlib import pyplot as plt

%matplotlib inline

In [2]:
# Load the pickled stories dataframe, shuffle the rows and drop repeated story
# texts.
# The shuffle is seeded on purpose: drop_duplicates keeps the first occurrence
# in shuffled order, and the positional train/validation splits further down
# depend on row order, so an unseeded shuffle produced a different dataset on
# every run.
SHUFFLE_SEED = 42
df = pd.read_pickle('data/medium_stories/dataframes/en_lower_stories.pickle').reset_index(drop=True)
df = df.sample(frac=1, random_state=SHUFFLE_SEED)
df = df.drop_duplicates(subset='story')
df.head()

Unnamed: 0,class,story
2841,machine_learning,this article will portray how data related to...
10162,ai,"dear friends, i’m thrilled to announce i am j..."
3332,machine_learning,tl; dr: you can think of machine learning alg...
5641,deep_learning,the question that i get the most from new and...
7372,convolution_neural,"with the rapid advances in ai/ml, it is very ..."


In [3]:
# Print "<class> <number of stories>" for every class, in order of first
# appearance in the dataframe.
labels = df['class']
for topic in labels.unique():
    print(topic, int((labels == topic).sum()))

machine_learning 834
ai 824
deep_learning 770
convolution_neural 194
web_scrape 188
big_data 521
data_extraction 354
web_crawling 147
transfer_learning 498
_speech_recognition 492
reinforcement_learning 163
data_science 823
web_scraping 464
artificial_intelligence 665
machine_translation 271
computer_vision 792
web_crawler 227
genetic_algorithm 226
neural_network 210
natural_language_processing 578
intelligent_machine 220
nlp 657
time_series 769
data_mining 464
recurrent_neural 365
data_engineering 406
image_understanding 121
genetic_programming 63
pattern_recognition 72
evolutionary_computation 17
object_recognition 107
speech_processing 7


In [4]:
# Length (in tokens) every encoded story is padded/truncated to by
# pad_sequences; also the input length of the model.
MAXLEN = 1000
# Number of most frequent tokens kept when the vocabulary is built.
VOCAB_SIZE = 20000
# Number of leading rows of data/target that form the x_train/y_train slices;
# the remaining rows form x_test/y_test.
TRAIN_SIZE = 10000

In [5]:
# spaCy pipeline that is called on each vocabulary word to obtain its vector.
# NOTE(review): the `sm` models are documented as shipping without static word
# vectors, so `.vector` presumably falls back to context tensors here -- confirm,
# or switch to a `md`/`lg` model if real word embeddings are wanted.
nlp = spacy.load('en_core_web_sm')

In [6]:
# Build the vocabulary: tokenise the whole corpus (all stories joined by a
# single space) in one pass, then keep the VOCAB_SIZE most frequent tokens,
# most frequent first.
all_words = word_tokenize(' '.join(df['story'].tolist()))
word_counts = Counter(all_words).most_common(VOCAB_SIZE)
words = [token_text for token_text, _frequency in word_counts]

In [7]:
# Map every vocabulary word to the vector spaCy assigns to it.
# Calling nlp(...) on a string yields a Doc, so the .text / .vector read below
# are the Doc-level values for that single-word input.
embed_dic = {}
for position, vocab_word in enumerate(words):
    # Lightweight progress log: one line every 500 words.
    if position % 500 == 0:
        print(position)
    doc = nlp(vocab_word)
    embed_dic[doc.text] = doc.vector

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
19500


In [8]:
# Turn the word -> vector mapping into a table with one row per word and one
# column per vector component (the dict gives one column per word, hence the
# transpose).
embed_words = pd.DataFrame(embed_dic).transpose()
embed_words.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
!,0.67114,-0.475781,1.225882,-0.533356,1.413614,2.528172,-0.030113,0.486537,3.412096,1.299003,...,0.361494,0.078374,-0.094767,-0.08782,-0.176552,0.149058,0.22498,-0.329079,0.187947,-0.189483
#,1.499199,-0.151666,2.150062,1.835209,1.904099,2.142193,-1.108657,-1.281631,2.732129,2.948512,...,0.079698,-0.464941,1.290173,0.061074,-0.257399,-0.752442,0.01962,0.132082,-0.44015,-0.476223
$,0.630779,1.138584,2.530838,0.166183,3.076835,0.542186,-0.858887,0.884039,2.754835,-0.390936,...,0.297402,-0.254304,1.426802,-0.010306,-0.657113,-0.627469,0.097199,-0.183204,-0.21361,-0.170229
%,2.040906,0.173398,2.365521,-1.138491,0.034594,2.351219,-2.068765,-0.857941,0.967327,2.1263,...,-0.527834,-0.229152,-0.059828,0.299519,-0.925737,-0.175775,0.280792,0.260768,0.674299,0.6732
&,-0.362176,-1.536422,0.681592,-0.254282,-0.020795,2.54908,1.063519,1.30645,1.050382,2.485573,...,-0.43189,-0.154748,-0.647066,-0.048509,0.02391,-0.560396,0.427393,0.6424,0.882393,-0.388471
',-1.620776,2.052795,0.476201,-0.31558,0.532586,-0.45127,-1.238636,0.606207,-0.797014,0.126661,...,0.054849,-0.057004,0.090347,-0.235629,-0.785747,-0.306805,0.57614,0.273453,0.878228,0.013317
'',-1.88885,-0.329437,2.229829,-0.024572,0.612867,1.830826,-2.658098,1.066225,-0.894128,0.677597,...,-0.093398,-0.030841,-0.173364,-0.138874,-0.683034,-0.094501,0.465635,0.371119,0.759352,0.149494
'd,-2.117164,-1.338601,0.084229,0.462426,-1.943431,-0.868539,-1.988568,-1.283501,1.316628,1.031664,...,0.245045,0.218747,-0.037458,-0.123892,-0.492345,-0.260605,0.047124,-0.004179,0.792069,-0.005892
'll,-1.817729,-0.345378,0.854443,0.764095,-1.413824,-0.242263,-2.870903,1.914835,1.60111,1.654043,...,0.297578,0.11309,0.135073,-0.587226,-0.076004,0.153083,0.157968,0.446649,0.51073,0.062492
'm,0.698488,-0.773644,0.095428,1.95109,1.801322,-0.85654,-1.032917,-1.498475,0.703896,0.657627,...,0.308484,0.174957,0.140864,-0.164224,-0.266943,0.071756,0.262934,0.027962,0.889033,-0.010744


In [9]:
# Prepend an all-zero '<PAD>' row so that row 0 of the embedding matrix is the
# vector for the padding id 0 produced by pad_sequences.
padding = pd.DataFrame(
    np.zeros((1, embed_words.shape[1])),
    index=['<PAD>'],
    columns=embed_words.columns,  # same labels as embed_words so rows stack cleanly
)
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# stacks the rows the same way and works on every pandas version.
embed_matrix = pd.concat([padding, embed_words])
embed_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
<PAD>,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
!,0.67114,-0.475781,1.225882,-0.533356,1.413614,2.528172,-0.030113,0.486537,3.412096,1.299003,...,0.361494,0.078374,-0.094767,-0.08782,-0.176552,0.149058,0.22498,-0.329079,0.187947,-0.189483
#,1.499199,-0.151666,2.150062,1.835209,1.904099,2.142193,-1.108657,-1.281631,2.732129,2.948512,...,0.079698,-0.464941,1.290173,0.061074,-0.257399,-0.752442,0.01962,0.132082,-0.44015,-0.476223
$,0.630779,1.138584,2.530838,0.166183,3.076835,0.542186,-0.858887,0.884039,2.754835,-0.390936,...,0.297402,-0.254304,1.426802,-0.010306,-0.657113,-0.627469,0.097199,-0.183204,-0.21361,-0.170229
%,2.040906,0.173398,2.365521,-1.138491,0.034594,2.351219,-2.068765,-0.857941,0.967327,2.1263,...,-0.527834,-0.229152,-0.059828,0.299519,-0.925737,-0.175775,0.280792,0.260768,0.674299,0.6732


In [10]:
# Token -> integer id. Id 0 is reserved for '<PAD>' (row 0 of embed_matrix), so
# real tokens start at 1 and every id equals the token's row in embed_matrix.
word_index = {
    word: idx
    for idx, word in enumerate(embed_matrix.index.tolist()[1:], start=1)
}

# The Tokenizer object is kept (with the vocabulary attached) so the name stays
# available to any later cell that relies on it.
tokenizer = Tokenizer()
tokenizer.word_index = word_index

# Encode each story with the same NLTK tokeniser that produced the vocabulary.
# Tokenizer.texts_to_sequences strips punctuation and splits on whitespace, so
# its tokens (e.g. "don't") did not line up with the word_tokenize vocabulary
# ("do" + "n't", plus punctuation tokens) and were silently dropped.
# Tokens outside the vocabulary are skipped, as before.
sequences = [
    [word_index[tok] for tok in word_tokenize(story) if tok in word_index]
    for story in df.story
]
data = pad_sequences(sequences, maxlen=MAXLEN)

In [11]:
# Random stand-in with the same shape as embed_matrix: standard-normal entries,
# with row 0 (the '<PAD>' row) forced to all zeros.
n_rows, n_dims = embed_matrix.shape
random_matrix = np.random.standard_normal((n_rows, n_dims))
random_matrix[0] = 0.0

In [12]:
# One-hot encode the class labels; one column per class name.
onehot = pd.get_dummies(df['class'])
# Column order defines which output unit corresponds to which class.
target_labels = onehot.columns
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values is available on every version. The explicit uint8 cast keeps the
# original dtype on pandas >= 2.0, where get_dummies returns booleans.
target = onehot.values.astype('uint8')
target

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]], dtype=uint8)

In [13]:
# Positional hold-out split: the first TRAIN_SIZE rows are the training part,
# everything after them is the test part (inputs and labels sliced alike).
x_train, x_test = data[:TRAIN_SIZE], data[TRAIN_SIZE:]
y_train, y_test = target[:TRAIN_SIZE], target[TRAIN_SIZE:]

In [21]:
# 1-D convolutional text classifier over a frozen embedding layer.
#
# NOTE(review): the embedding weights come from random_matrix, not from
# embed_matrix, and the layer is not trainable -- presumably a random-embedding
# baseline; confirm this is intended.
embedding_layer = Embedding(
    len(embed_matrix),            # vocabulary size including the '<PAD>' row
    len(embed_matrix.columns),    # embedding dimension
    input_length=MAXLEN,
    weights=[random_matrix],
    trainable=False,
)

sequence_input = Input(shape=(MAXLEN,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# (The former `model = Sequential()` was dead code: it was overwritten by the
# functional Model below before ever being used.)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Dropout(0.2)(x)
x = Conv1D(128, 5, activation='relu')(x)
# True global max pooling. The previous MaxPooling1D(35) + Flatten was only
# "global" because the sequence happens to be 35 steps long here when
# MAXLEN == 1000; this yields the same (batch, 128) output for that setting and
# keeps working if MAXLEN changes.
x = keras.layers.GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
# One softmax unit per class column of the one-hot target.
output = Dense(target.shape[1], activation='softmax')(x)

model = Model(sequence_input, output)

opt = keras.optimizers.Adam(lr=1e-3, decay=1e-5)

model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['acc'])
# NOTE(review): training uses all of data/target with a 20% validation split;
# the x_train/y_train/x_test/y_test slices are not used here, so x_test is not
# a clean hold-out set for this model.
model.fit(data, target, validation_split=0.2, epochs=2, batch_size=128)

Train on 10007 samples, validate on 2502 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x15735f828>

In [None]:
# model.summary()