In [10]:
import pandas as pd
import re
import string
from nltk.corpus import stopwords
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from numpy import asarray
from numpy import zeros
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer

In [2]:
# The CSV files carry no header row, so the column names are supplied here.
column_names = ['class', 'title', 'content']

data_train = pd.read_csv('train.csv', header=None)
data_test = pd.read_csv('test.csv', header=None)

data_train.columns = column_names
data_test.columns = column_names

In [3]:
# Per-column flag: True if that column of the training frame has any missing value.
data_train.isnull().any(axis=0)

class      False
title      False
content    False
dtype: bool

In [4]:
# Clean the training documents into space-joined strings of normalized tokens.
#
# The regexes and the stopword set are the same for every document, so they are
# built once here instead of once per row.
re_punc = re.compile('[%s]' % re.escape(string.punctuation))
re_print = re.compile('[^%s]' % re.escape(string.printable))
stop_words = set(stopwords.words('english'))

content_train = []
for document in data_train['content']:
    tokens = []
    for raw_word in document.split():
        # remove punctuation, then non-printable characters, then lower-case
        word = re_print.sub('', re_punc.sub('', raw_word)).lower()
        # keep only alphabetic tokens that are not stopwords and are longer
        # than one character
        if word.isalpha() and word not in stop_words and len(word) > 1:
            tokens.append(word)
    content_train.append(" ".join(tokens))

In [5]:
# Clean the test documents into space-joined strings of normalized tokens,
# using the same steps as for the training documents.
#
# The regexes and the stopword set are the same for every document, so they are
# built once here instead of once per row.
re_punc = re.compile('[%s]' % re.escape(string.punctuation))
re_print = re.compile('[^%s]' % re.escape(string.printable))
stop_words = set(stopwords.words('english'))

content_test = []
for document in data_test['content']:
    tokens = []
    for raw_word in document.split():
        # remove punctuation, then non-printable characters, then lower-case
        word = re_print.sub('', re_punc.sub('', raw_word)).lower()
        # keep only alphabetic tokens that are not stopwords and are longer
        # than one character
        if word.isalpha() and word not in stop_words and len(word) > 1:
            tokens.append(word)
    content_test.append(" ".join(tokens))

In [6]:
# Target labels: the 'class' column of each split.
class_train, class_test = data_train['class'], data_test['class']

In [7]:
# load embedding as a dict
def load_embedding(filename):
    """Load a text-format word2vec file into a dict of word -> vector.

    The first line of the file (the header) is skipped. Every following
    non-blank line is split on whitespace: the first field is the word and
    the remaining fields are parsed as a float32 numpy array.

    Parameters
    ----------
    filename : str
        Path to the text embedding file.

    Returns
    -------
    dict
        Maps each word (str) to its vector (numpy float32 array).
    """
    embedding = dict()
    # `with` closes the file even if parsing fails; read as UTF-8 explicitly
    # rather than relying on the platform default encoding.
    with open(filename, 'r', encoding='utf-8') as file:
        # skip the header line (no-op on an empty file)
        next(file, None)
        for line in file:
            parts = line.split()
            # tolerate blank lines (e.g. a trailing newline at end of file)
            if not parts:
                continue
            # key is string word, value is numpy array for vector
            embedding[parts[0]] = asarray(parts[1:], dtype='float32')
    return embedding

In [8]:
# create a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(embedding, vocab, dim=100):
    """Build the weight matrix for an Embedding layer from a loaded embedding.

    Row ``i`` of the result holds the vector of the word whose index in
    ``vocab`` is ``i``. Row 0 and the rows of words that have no vector in
    ``embedding`` are left as all zeros.

    Parameters
    ----------
    embedding : dict
        Maps word -> vector of length ``dim``.
    vocab : dict
        Maps word -> integer row index (e.g. a Tokenizer's ``word_index``).
    dim : int, optional
        Length of each embedding vector. Defaults to 100.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(vocab) + 1, dim)``.
    """
    # total vocabulary size plus 0 for unknown words
    vocab_size = len(vocab) + 1
    # define weight matrix dimensions with all 0
    weight_matrix = zeros((vocab_size, dim))
    # step vocab, store vectors using the Tokenizer's integer mapping
    for word, i in vocab.items():
        vector = embedding.get(word)
        # Words missing from the embedding return None; assigning None into
        # the float matrix would yield NaN (or raise), so keep the zero row.
        if vector is not None:
            weight_matrix[i] = vector
    return weight_matrix

In [11]:
# Build the word -> integer index mapping from the training documents
tokenizer = Tokenizer()
tokenizer.fit_on_texts(content_train)

# The longest training document (counted in whitespace-separated tokens)
# sets the length every sequence is padded to
max_length = max(len(doc.split()) for doc in content_train)

# Integer-encode the training documents and pad them at the end
encoded_docs = tokenizer.texts_to_sequences(content_train)
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

# One-hot encode the training labels
ytrain = to_categorical(class_train)

In [12]:
# Encode the test split with the tokenizer fitted on the training documents
encoded_docs = tokenizer.texts_to_sequences(content_test)
# Bring every test sequence to the training max_length, padding at the end
Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

# One-hot encode the test labels
ytest = to_categorical(class_test)


In [13]:
from gensim.models import Word2Vec
# Word2Vec expects every sentence as a list of tokens. The cleaned documents
# are space-joined strings, and iterating a string yields single characters,
# so passing them directly would train character vectors instead of word
# vectors. Split each document back into its tokens first.
sentences_train = [doc.split() for doc in content_train]
# train word2vec model: 100-dimensional vectors, context window of 2,
# words occurring fewer than 2 times are ignored
model_train = Word2Vec(sentences_train, size=100, window=2, workers=4, min_count=2)

# save the vectors in text word2vec format
filename = 'embedding_word2vec.txt'
model_train.wv.save_word2vec_format(filename, binary=False)

unable to import 'smart_open.gcs', disabling that module


In [14]:
# Read the saved word2vec vectors back as a word -> vector dict
raw_embedding = load_embedding('embedding_word2vec.txt')

# One row per tokenizer index, plus row 0 (the tokenizer's indices start at 1)
vocab_size = len(tokenizer.word_index) + 1

# Arrange the vectors so that row i belongs to the word with tokenizer index i
embedding_vectors = get_weight_matrix(raw_embedding, tokenizer.word_index)

# Embedding layer initialised with those vectors and excluded from training
embedding_layer = Embedding(
    vocab_size,
    100,
    weights=[embedding_vectors],
    input_length=max_length,
    trainable=False,
)


In [None]:
# define model: frozen embedding -> 1D convolution -> max pooling -> dense classifier
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
# One output unit per one-hot label column. Each document has exactly one
# class, so the outputs must form a probability distribution (softmax) rather
# than independent per-class sigmoids.
model.add(Dense(ytrain.shape[1], activation='softmax'))
# summary() prints the table itself and returns None; wrapping it in print()
# only adds a stray "None" line to the output
model.summary()
# compile network: categorical cross-entropy matches one-hot single-label
# targets. With binary_crossentropy, Keras reports a per-output binary
# accuracy, which overstates multi-class accuracy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(Xtrain, ytrain, epochs=10, verbose=2)
# evaluate
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 85, 100)           4979900   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 81, 128)           64128     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 40, 128)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 5120)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 25605     
Total params: 5,069,633
Trainable params: 89,733
Non-trainable params: 4,979,900
_________________________________________________________________
None
Instructions for updating:
Use tf.cast instead.
Ep