In [0]:
# Install the pinned TensorFlow 2.0 GPU release candidate that the rest of this notebook is written against.
!pip install tensorflow-gpu==2.0.0-rc1

In [0]:
!mkdir data

In [0]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import nltk
import re

In [0]:
# Fetch the NLTK stop-word corpus used when cleaning titles below.
nltk.download('stopwords')

In [0]:
# Load the pre-split dataset from the local data/ directory.
# NOTE: allow_pickle=True lets np.load unpickle object arrays; only use it on files you trust.
X_train = np.load('data/X_train.npy', allow_pickle=True)
y_train = np.load('data/y_train.npy', allow_pickle=True)
X_test = np.load('data/X_test.npy', allow_pickle=True)
y_test = np.load('data/y_test.npy', allow_pickle=True)

# Quick sanity check of the split sizes.
X_train.shape, X_test.shape

In [0]:
# Tokenizer that splits text into alphabetic and punctuation runs.
wpt = nltk.WordPunctTokenizer()
# Kept as a set: clean_title tests membership once per token, and a set lookup
# is O(1) whereas scanning the original list is O(len(stop_words)).
stop_words = set(nltk.corpus.stopwords.words('english'))

def clean_title(title):
    """
    Normalise a single title string for tokenization.

    Steps: drop every character that is not an ASCII letter or whitespace,
    lower-case, strip surrounding whitespace, tokenize with the module-level
    `wpt` tokenizer, remove tokens found in `stop_words`, and join the
    remaining tokens with single spaces.

    Parameters
    ----------
    title : str
        Raw title text.

    Returns
    -------
    str
        The cleaned title (may be empty if every token was filtered out).
    """
    # The flags MUST be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so passing re.I|re.A positionally silently limited the
    # substitution to the first 258 matches and applied no flags at all.
    title = re.sub(r'[^a-zA-Z\s]', '', title, flags=re.I | re.A)
    title = title.lower().strip()
    # tokenize document
    tokens = wpt.tokenize(title)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    return ' '.join(filtered_tokens)

# Rebind the name to a vectorised wrapper so clean_title can be applied to a whole array of titles at once.
clean_title = np.vectorize(clean_title)

In [0]:
# Clean both splits with the vectorised clean_title; the raw titles are overwritten in place.
X_train, X_test = clean_title(X_train), clean_title(X_test)

In [0]:
# Peek at the first ten cleaned training titles.
X_train[:10]

In [0]:
maxlen = 180      # sequence length passed to pad_sequences in get_features
max_words = 5000  # vocabulary cap for the tokenizer and the Embedding input dimension

# Build the word index from the cleaned training titles only.
tokenizer = Tokenizer(lower=True, num_words=max_words)
tokenizer.fit_on_texts(X_train)

In [0]:
def get_features(text_sequence):
    """
    Encode a collection of texts as fixed-length integer sequences.

    Each text is mapped to word indices by the fitted module-level
    `tokenizer`, and the resulting sequences are passed through
    `pad_sequences` with length `maxlen`.
    """
    return pad_sequences(tokenizer.texts_to_sequences(text_sequence), maxlen=maxlen)

# Encode both splits with the tokenizer that was fitted on the training titles.
train_features, test_features = get_features(X_train), get_features(X_test)

In [0]:
# Sanity check of the encoded feature matrices' shapes.
train_features.shape, test_features.shape

In [0]:
mlb = MultiLabelBinarizer()
# NOTE(review): wrapping y_train in a list hands the binarizer a single sample whose
# "labels" are the entries of y_train, so each distinct entry becomes its own class.
# This is probably not the intended per-title label set — confirm the label format.
mlb.fit([y_train])
# NOTE(review): the transformed matrix is only displayed, never assigned; the
# model.fit call later in the notebook still receives the raw y_train.
mlb.transform([y_train])

In [0]:
# Peek at the first ten raw training labels.
y_train[:10]

In [0]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import *
import config

In [0]:
# Despite the name, this is passed as the first argument of Conv1D below,
# i.e. the number of convolution filters (the kernel size there is 3).
filter_length = 300

In [0]:
# 1-D convolutional text classifier: embedding -> dropout -> conv -> global max pool -> sigmoid outputs.
model = Sequential([
    Embedding(input_dim=max_words, output_dim=20, input_length=maxlen),
    Dropout(rate=0.1),
    Conv1D(filters=filter_length, kernel_size=3, strides=1, padding='valid', activation='relu'),
    GlobalMaxPool1D(),
    # 100 independent sigmoid units, one per output label.
    Dense(units=100, activation='sigmoid'),
])


In [0]:
# The output layer uses independent sigmoid units trained with binary cross-entropy,
# so accuracy is measured per label with binary_accuracy. categorical_accuracy only
# compares the arg-max of target and prediction, which suits single-label softmax outputs.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy'])
model.summary()

In [0]:
# All three callbacks watch the validation loss (their default, spelled out here).
# ReduceLROnPlateau's default patience is 10 epochs, which is longer than the
# EarlyStopping patience of 4: training would normally stop before the learning
# rate was ever reduced. A patience of 2 lets the reduction happen first.
callbacks = [
    ReduceLROnPlateau(monitor='val_loss', patience=2),
    EarlyStopping(monitor='val_loss', patience=4),
    # Keep only the checkpoint with the best validation loss seen so far.
    ModelCheckpoint(filepath='model-conv1d.h5', monitor='val_loss', save_best_only=True),
]

In [0]:
# Train for up to 20 epochs, holding out a 0.1 fraction of the training data for
# validation; class weights come from the project-local config module.
# NOTE(review): y_train is passed exactly as loaded from disk — confirm it is already
# in the numeric multi-hot form the 100-unit sigmoid output expects.
history = model.fit(
    train_features,
    y_train,
    batch_size=32,
    epochs=20,
    validation_split=0.1,
    class_weight=config.class_weights,
    callbacks=callbacks,
)

In [0]:
# Scratch check: compares y_train against one exact label string.
# NOTE(review): this suggests the labels are stored as bracketed strings rather than lists — confirm.
y_train=='[cs.IT, cs.LG, math.IT, stat.ML]'