In [1]:
import numpy as np
import pandas as pd
import cPickle
from collections import defaultdict
import re

from bs4 import BeautifulSoup

import sys
from sklearn.model_selection import train_test_split
import os

os.environ['KERAS_BACKEND']='cntk'

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout
from keras.layers import GlobalMaxPooling1D
from keras.models import Model
from keras.layers import merge
import pandas as pd
import numpy as np
import glob

# Length every token sequence is padded/truncated to; also the model's input length.
MAX_SEQUENCE_LENGTH = 1000
# Vocabulary size limit handed to the Keras Tokenizer.
MAX_NB_WORDS = 20000
# Width of each word-embedding vector.
EMBEDDING_DIM = 100
# Fraction of the prepared training data held out as the validation set.
VALIDATION_SPLIT = 0.2

  from ._conv import register_converters as _register_converters
Using CNTK backend


In [2]:
def clean_str(string):
    """
    Normalise a raw text string.

    Removes every backslash, single quote and double quote, then trims
    leading/trailing whitespace and lower-cases the result.
    """
    # One character class covers the three characters that are dropped.
    without_quotes = re.sub(r"[\\'\"]", "", string)
    trimmed = without_quotes.strip()
    return trimmed.lower()

In [3]:
# Root folder holding one sub-folder of *.txt documents per class.
# Defaults to the original author's location; set KETTLE_DATA_DIR to run elsewhere.
BASE_PATH = os.environ.get("KETTLE_DATA_DIR",
                           "C:/Users/pranjal/Desktop/kettle/model_data/")
SCIENCE_PATH = os.path.join(BASE_PATH, "science", "*.txt")
TECH_PATH = os.path.join(BASE_PATH, "technology", "*.txt")


def _read_documents(file_names):
    """Return the content of each file in ``file_names`` as one string per file.

    Files are read in binary mode and their lines are joined with a single
    space, so every document becomes one string.
    """
    documents = []
    for file_name in file_names:
        with open(file_name, "rb") as f:
            documents.append(" ".join(f.readlines()))
    return documents


# glob returns matches in arbitrary order; sorting keeps the seeded
# train/test split below identical from run to run and machine to machine.
science_files = sorted(glob.glob(SCIENCE_PATH))
tech_files = sorted(glob.glob(TECH_PATH))
science_file_list = _read_documents(science_files)
tech_file_list = _read_documents(tech_files)

# Label 0 = science, label 1 = technology.
X = science_file_list + tech_file_list
y = ([0] * len(science_file_list)) + ([1] * len(tech_file_list))
if not X:
    # Fail with a clear message instead of an obscure error from the splitter.
    raise ValueError("No .txt documents found under %s" % BASE_PATH)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [4]:
# Only the training split is used to fit the tokenizer and the model.
texts, labels = X_train, y_train

In [5]:
# Fit the vocabulary on the training texts and map each text to a list of
# integer word ids. `num_words` is the Keras 2 name of the deprecated
# `nb_words` argument.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)



In [6]:
# Token -> integer id mapping learned by the tokenizer.
word_index = tokenizer.word_index
unique_token_count = len(word_index)
print('Found %s unique tokens.' % unique_token_count)

Found 48880 unique tokens.


In [7]:
# Pad/truncate every id sequence to MAX_SEQUENCE_LENGTH and one-hot encode
# the integer class labels.
# NOTE: `labels` is overwritten in place, so re-running this cell without
# re-running the cell that sets `labels = y_train` would encode it twice.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))
# Format explicitly: under Python 2 `print('..', x)` prints a tuple repr.
print('Shape of data tensor: %s' % (data.shape,))
print('Shape of label tensor: %s' % (labels.shape,))

('Shape of data tensor:', (879L, 1000L))
('Shape of label tensor:', (879L, 2L))


In [8]:
# Shuffle documents and labels with the same seeded permutation so the
# train/validation split is reproducible across kernel restarts.
rng = np.random.RandomState(42)
indices = rng.permutation(data.shape[0])
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
# Split on an explicit index: negative slicing breaks when the validation
# count is 0 (`data[:-0]` is empty and `data[-0:]` is everything).
nb_train_samples = data.shape[0] - nb_validation_samples

x_train = data[:nb_train_samples]
y_train = labels[:nb_train_samples]
x_val = data[nb_train_samples:]
y_val = labels[nb_train_samples:]

In [9]:
# Labels are one-hot rows, so summing over axis 0 gives per-class counts:
# column 0 = science (label 0), column 1 = technology (label 1).
print('Number of science and technology documents in training and validation set')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))

Number of positive and negative reviews in traing and validation set 
[384. 320.]
[93. 82.]


In [10]:
# Random initial embedding weights: one EMBEDDING_DIM-wide row per entry of
# word_index plus one extra row (presumably for padding id 0 -- confirm).
embedding_shape = (len(word_index) + 1, EMBEDDING_DIM)
embedding_matrix = np.random.random(embedding_shape)

In [11]:
# Trainable embedding lookup initialised from `embedding_matrix`.
vocab_rows = len(word_index) + 1
embedding_layer = Embedding(input_dim=vocab_rows,
                            output_dim=EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

In [12]:
# 1-D CNN text classifier: embedded token ids go through three
# Conv1D(128 filters, width 5) stages, then a dense layer and a 2-way softmax.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
l_cov1 = Conv1D(128, 5, activation='relu')(embedded_sequences)
l_pool1 = MaxPooling1D(5)(l_cov1)
l_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(5)(l_cov2)
l_cov3 = Conv1D(128, 5, activation='relu')(l_pool2)
# Global max pooling over the remaining time steps. The previous
# MaxPooling1D(35) + Flatten gave the same result only because 35 steps are
# left when MAX_SEQUENCE_LENGTH is 1000; this form works for any length.
l_pool3 = GlobalMaxPooling1D()(l_cov3)
l_dense = Dense(128, activation='relu')(l_pool3)
preds = Dense(2, activation='softmax')(l_dense)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])


In [13]:
# `epochs` is the Keras 2 name of the deprecated `nb_epoch` argument.
# The History object is kept so the per-epoch metrics can be inspected later.
history = model.fit(x_train, y_train, validation_data=(x_val, y_val),
                    epochs=20, batch_size=128)

  


Train on 704 samples, validate on 175 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0xbe7f4a8>

In [16]:
# NOTE: this scores the *training* data, so it measures fit, not generalisation.
# A document counts as a hit when the probability predicted for its true
# class is at least HIT_THRESHOLD.
HIT_THRESHOLD = 0.3
pred = model.predict(x_train)
# y_train rows are one-hot, so the row-wise product keeps only the
# probability assigned to the true class.
true_class_prob = (pred * y_train).sum(axis=1)
cnt = int((true_class_prob >= HIT_THRESHOLD).sum())

print('%d %d %s' % (cnt, len(pred), float(cnt) / len(pred)))

652 704 0.926136363636


In [17]:
# Collapse the one-hot training labels back to class ids (0 when column 0 is
# set, otherwise 1) and keep the predicted score of class 0 for each row.
y_actual = [0 if one_hot[0] == 1 else 1 for one_hot in y_train]
y_model = [scores[0] for scores in pred]

In [19]:
import numpy as np
from sklearn import metrics
# ROC curve with class 0 as the positive class; y_model holds the score
# predicted for class 0. The last expression displays the area under the curve.
fpr, tpr, thresholds = metrics.roc_curve(y_true=y_actual,
                                         y_score=y_model,
                                         pos_label=0)
metrics.auc(x=fpr, y=tpr)

0.9876627604166667

In [1]:
# Scratch check that `+` concatenates two lists.
# NOTE(review): this rebinds `y`, which the data-loading cell uses for the
# class labels -- consider deleting this cell from the final notebook.
x = [1, 2, 3]
y = [14, 5]
print(x + y)

[1, 2, 3, 14, 5]
