In [1]:
import glob
from dalab import read_pickle
import pandas as pd
import numpy as np
from langdetect import detect
from collections import Counter
from nltk import word_tokenize

import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
import spacy
from time import time
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Dense, Conv1D, MaxPooling1D, Flatten, Dropout, SimpleRNN, GRU, LSTM
from keras.layers import Input, Dense
from keras.models import Model
from keras.models import Sequential
from matplotlib import pyplot as plt

%matplotlib inline

Using TensorFlow backend.


In [2]:
df = read_pickle('data/20_newsgroup/dataframes/raw_news.pickle')
df = df.sample(frac=1)
df = df.drop_duplicates(subset='text')
df.head()

Unnamed: 0,label,text
19394,misc,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....
7501,crypt,Xref: cantaloupe.srv.cs.cmu.edu sci.crypt:1571...
19374,misc,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...
14804,electronics,Newsgroups: sci.electronics\nPath: cantaloupe....
11602,x,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...


In [3]:
MAXLEN = 1000
VOCAB_SIZE = 20000
TRAIN_SIZE = 15000

In [4]:
nlp = spacy.load('en_core_web_sm')

In [5]:
# Preprocessing here:
# Lower, remove unwanted chars, decide if is going to keep punctuations, lemmas, so on.

In [6]:
all_words = word_tokenize(' '.join(df.text.tolist()))
word_counts = Counter(all_words).most_common(VOCAB_SIZE)
words = [w[0] for w in word_counts]

In [7]:
embed_dic = {}
for index, word in enumerate(words):
    if index % 500 == 0: print(index)
    token = nlp(word)
    embed_dic[token.text] = token.vector

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
19500


In [8]:
embed_words = pd.DataFrame(embed_dic).T
embed_words.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
,-0.917688,1.59407,6.749008,0.536356,-1.675298,3.894897,-3.105777,2.663857,1.218641,-1.035807,...,0.425276,1.230748,0.118247,-0.219932,0.352065,-0.254422,-0.038656,-1.410054,0.657522,-0.292479
!,0.67114,-0.475781,1.225882,-0.533356,1.413614,2.528172,-0.030113,0.486537,3.412096,1.299003,...,0.361494,0.078374,-0.094767,-0.08782,-0.176552,0.149058,0.22498,-0.329079,0.187947,-0.189483
#,1.499199,-0.151666,2.150062,1.835209,1.904099,2.142193,-1.108657,-1.281631,2.732129,2.948512,...,0.079698,-0.464941,1.290173,0.061074,-0.257399,-0.752442,0.01962,0.132082,-0.44015,-0.476223
$,0.630779,1.138584,2.530838,0.166183,3.076835,0.542186,-0.858887,0.884039,2.754835,-0.390936,...,0.297402,-0.254304,1.426802,-0.010306,-0.657113,-0.627469,0.097199,-0.183204,-0.21361,-0.170229
%,2.040906,0.173398,2.365521,-1.138491,0.034594,2.351219,-2.068765,-0.857941,0.967327,2.1263,...,-0.527834,-0.229152,-0.059828,0.299519,-0.925737,-0.175775,0.280792,0.260768,0.674299,0.6732
&,-0.362176,-1.536422,0.681592,-0.254282,-0.020795,2.54908,1.063519,1.30645,1.050382,2.485573,...,-0.43189,-0.154748,-0.647066,-0.048509,0.02391,-0.560396,0.427393,0.6424,0.882393,-0.388471
',-1.620776,2.052795,0.476201,-0.31558,0.532586,-0.45127,-1.238636,0.606207,-0.797014,0.126661,...,0.054849,-0.057004,0.090347,-0.235629,-0.785747,-0.306805,0.57614,0.273453,0.878228,0.013317
'',-1.88885,-0.329437,2.229829,-0.024572,0.612867,1.830826,-2.658098,1.066225,-0.894128,0.677597,...,-0.093398,-0.030841,-0.173364,-0.138874,-0.683034,-0.094501,0.465635,0.371119,0.759352,0.149494
'*,1.111745,1.290676,-0.382141,0.069083,-0.239147,0.952505,-1.702639,1.894596,1.396065,0.78517,...,-0.027173,0.091146,-0.310856,-0.363376,-0.392699,0.008236,-0.072186,0.011815,0.666233,-0.253421
'+,0.697193,0.72743,0.795636,0.136264,0.777219,0.521605,-0.41175,0.238003,-0.009105,0.776705,...,0.075038,0.04853,0.151208,-0.007558,-0.555188,-0.12478,0.504512,0.432797,1.108591,0.201013


In [9]:
padding = pd.DataFrame({'<PAD>': np.zeros(shape=[1,embed_words.shape[1]])[0]}).T
embed_matrix = padding.append(embed_words)
embed_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
<PAD>,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
,-0.917688,1.59407,6.749008,0.536356,-1.675298,3.894897,-3.105777,2.663857,1.218641,-1.035807,...,0.425276,1.230748,0.118247,-0.219932,0.352065,-0.254422,-0.038656,-1.410054,0.657522,-0.292479
!,0.67114,-0.475781,1.225882,-0.533356,1.413614,2.528172,-0.030113,0.486537,3.412096,1.299003,...,0.361494,0.078374,-0.094767,-0.08782,-0.176552,0.149058,0.22498,-0.329079,0.187947,-0.189483
#,1.499199,-0.151666,2.150062,1.835209,1.904099,2.142193,-1.108657,-1.281631,2.732129,2.948512,...,0.079698,-0.464941,1.290173,0.061074,-0.257399,-0.752442,0.01962,0.132082,-0.44015,-0.476223
$,0.630779,1.138584,2.530838,0.166183,3.076835,0.542186,-0.858887,0.884039,2.754835,-0.390936,...,0.297402,-0.254304,1.426802,-0.010306,-0.657113,-0.627469,0.097199,-0.183204,-0.21361,-0.170229


In [10]:
random_matrix = np.random.randn(embed_matrix.shape[0], embed_matrix.shape[1])
random_matrix[0] = np.zeros([1, embed_matrix.shape[1]])

In [11]:
word_index = {j:i+1 for i,j in enumerate(embed_matrix.index.tolist()[1:])}

In [12]:
tokenizer = Tokenizer()
tokenizer.word_index = word_index
sequences = tokenizer.texts_to_sequences(df.text)
data = pad_sequences(sequences, maxlen=MAXLEN)

In [13]:
data = data.reshape(-1, data.shape[1], 1)
data = data/VOCAB_SIZE
data[0]

array([[0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.     ],
       [0.

In [14]:
onehot = pd.get_dummies(df['label'])
target_labels = onehot.columns
target = onehot.as_matrix()
target

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [15]:
x_train = data[:TRAIN_SIZE]
x_test = data[TRAIN_SIZE:]

y_train = target[:TRAIN_SIZE]
y_test = target[TRAIN_SIZE:]

In [16]:
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=data.shape[1:]))
model.add(Dense(128, activation='relu'))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Dropout(0.2))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(35))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(target.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

In [17]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1000, 128)         256       
_________________________________________________________________
dense_2 (Dense)              (None, 1000, 128)         16512     
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 996, 128)          82048     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 199, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 195, 128)          82048     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 39, 128)           0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 39, 128)           0         
__________

In [18]:
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=2, batch_size=128)

Train on 15000 samples, validate on 4393 samples
Epoch 1/2

KeyboardInterrupt: 

In [19]:
embedding_layer = Embedding(len(embed_matrix), len(embed_matrix.columns), weights=[random_matrix],
                            input_length=data.shape[1:], trainable=False)

sequence_input = Input(shape=(MAXLEN,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Dropout(0.2)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(35)(x)  # global max pooling
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
output = Dense(target.shape[1], activation='softmax')(x)

model = Model(sequence_input, output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=2, batch_size=128)

IndexError: tuple index out of range

In [None]:
model.summary()

In [None]:
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=2, batch_size=128)

In [None]:
model = Sequential()
model.add(LSTM(128, activation='relu', input_shape=data.shape[1:], return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dense(target.shape[1], activation='softmax'))

opt = keras.optimizers.Adam(lr=1e-3, decay=1e-5)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=2, batch_size=128)