# Annex-A
This is part of the final project of the Applied Data Science course. In this piece of work, the <b>TITLE</b> column was transformed using word2vec, and the model was trained using a Convolutional Neural Network (CNN).

### Required Imports

In [1]:
from keras.layers import Dense, Dropout, SpatialDropout1D
from keras.layers.convolutional import Conv1D
from keras.layers.embeddings import Embedding
from keras.layers.pooling import GlobalMaxPooling1D
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from sklearn.model_selection import train_test_split 
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import collections
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import nltk 
from keras.preprocessing.sequence import pad_sequences
from ipywidgets import widgets
from IPython.display import display
from IPython.display import clear_output
import pickle
from lime.lime_text import LimeTextExplainer
from keras.models import load_model

np.random.seed(42)

Using TensorFlow backend.


In [2]:
# You may need to download the NLTK 'punkt' tokenizer data first:
# nltk.download('punkt')

#### Helper Functions

In [3]:
def class_desc(p_class):
    '''
    Return the human-readable description of a CATEGORY code.

    Parameters
    ----------
    p_class : str
        One-letter category code: 'b', 't', 'e' or 'm'.

    Returns
    -------
    str
        The category description, or an empty string when the code is
        not one of the four known categories.
    '''
    descriptions = {
        'b': 'Business',
        't': 'Science and Technology',
        'e': 'Entertainment',
        'm': 'Health',
    }
    # Unknown codes fall back to '' rather than raising UnboundLocalError,
    # which is what the previous if/elif chain did in its else branch.
    return descriptions.get(p_class, '')

### Get the Data

In [4]:
# Load the news-aggregator CSV into a DataFrame; the path is relative to the
# notebook's working directory.
news = pd.read_csv("uci-news-aggregator.csv")

### Data Preprocessing

In [5]:
# Work on a copy that holds only the label and headline columns.
dfnews = news[['CATEGORY', 'TITLE']].copy()

# Replace each one-letter category code with its integer class id.
category_ids = {"e": 0, "b": 1, "t": 2, "m": 3}
for code, class_id in category_ids.items():
    dfnews.loc[dfnews["CATEGORY"] == code, "CATEGORY"] = class_id

In [6]:
# Count token frequencies over all headlines and track the length (in tokens)
# of the longest headline.
counter = collections.Counter()
maxlen = 0
# Iterate the column directly: positional row lookups (.iloc[i]) build a
# Series for every row and are far slower over the whole frame.
for sent in dfnews['TITLE']:
    words = [x.lower() for x in nltk.word_tokenize(sent)]
    maxlen = max(maxlen, len(words))
    counter.update(words)


### Data Representation

In [7]:
# Number of most frequent tokens that receive their own id in the vocabulary.
VOCAB_SIZE=10000

In [8]:
# Rank tokens by frequency: the most common token gets id 1, the next id 2, ...
# Keys that are absent resolve to 0 through the defaultdict's int() factory.
ranked_tokens = counter.most_common(VOCAB_SIZE)
word2index = collections.defaultdict(
    int,
    {token: rank for rank, (token, _freq) in enumerate(ranked_tokens, start=1)},
)
# One extra slot on top of the ranked tokens (ids start at 1, so id 0 is free).
vocab_sz = len(word2index) + 1
# Reverse lookup: id -> token.
index2word = dict(zip(word2index.values(), word2index.keys()))

In [9]:
# Save word2index to a pickle file.
# The context manager closes the file even if pickle.dump raises.
with open('word2index.pkl', 'wb') as output:
    pickle.dump(word2index, output)

In [10]:
# Load word2index back from the pickle file.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('word2index.pkl', 'rb') as pkl_file:
    word2index = pickle.load(pkl_file)

In [11]:
# Prepare data for the CNN model: one list of word ids and one integer label
# per headline.
xs, ys = [], []
# Walk the two columns in lockstep instead of per-row .iloc lookups, which
# build a Series for every row and are very slow at this size.
for sent, label in zip(dfnews['TITLE'], dfnews['CATEGORY']):
    ys.append(int(label))
    words = [x.lower() for x in nltk.word_tokenize(sent)]
    # .get(..., 0) maps out-of-vocabulary tokens to id 0 without inserting
    # them into the defaultdict as a side effect of the lookup.
    wids = [word2index.get(word, 0) for word in words]
    xs.append(wids)

In [12]:
# Fixed sequence length used for padding and for the embedding layer's input;
# this replaces the maxlen value measured while counting tokens earlier.
maxlen=64

### Data Split

In [13]:
# Bring every id sequence to length `maxlen` and one-hot encode the labels
# over the four classes.
X = pad_sequences(xs, maxlen=maxlen)
y = np_utils.to_categorical(ys, num_classes=4)

# Hold out 30% of the samples as the test set (fixed seed for repeatability).
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.3, random_state=42)

# Report the shapes of the four resulting arrays on one line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(295693, 64) (126726, 64) (295693, 4) (126726, 4)


### CNN Model

In [14]:
# Model parameters
EMBED_SIZE=100    # output dimension passed to the Embedding layer
NUM_WORDS=5       # kernel_size passed to Conv1D
NUM_FILTERS=256   # filters passed to Conv1D
NUM_EPOCHS=1      # epochs passed to model.fit
BATCH_SIZE=300    # batch_size passed to model.fit


#### Model Training

In [15]:
# Build the network from a single layer list: embedding -> spatial dropout ->
# 1-D convolution -> global max pooling -> 4-way softmax output.
model = Sequential([
    Embedding(vocab_sz, EMBED_SIZE, input_length=maxlen),
    SpatialDropout1D(0.2),
    Conv1D(filters=NUM_FILTERS, kernel_size=NUM_WORDS, activation="relu"),
    GlobalMaxPooling1D(),
    Dense(4, activation="softmax"),
])

In [16]:
# The targets are one-hot vectors over four mutually exclusive classes and the
# output layer is a softmax, so the matching loss is categorical cross-entropy.
# With "binary_crossentropy" Keras scores each of the four outputs separately
# and reports binary accuracy, which overstates the real classification accuracy.
model.compile(optimizer="adam", loss="categorical_crossentropy",
              metrics=["accuracy"])
# NOTE(review): the test split doubles as validation data here, so the
# validation numbers are not independent of the final test evaluation.
history = model.fit(X_train, y_train, batch_size=BATCH_SIZE, verbose=True,
                    epochs=NUM_EPOCHS,
                    validation_data=(X_test, y_test))

Train on 295693 samples, validate on 126726 samples
Epoch 1/1


#### Model Evaluation

In [17]:
# Evaluate on the held-out split; with the single compiled metric, `score`
# holds the loss followed by the accuracy.
score = model.evaluate(X_test, y_test, verbose=True)
print("Test score: {:.3f}, accuracy: {:.3f}".format(*score))



In [18]:
# Save the trained model to disk, overwriting any existing file and including
# the optimizer state.
model.save('news_cnn.mdl',overwrite=True,include_optimizer=True)

### Model Deployment

In [19]:
# Load the saved model back from disk; the prediction helper below uses `mymodel`.
mymodel =  load_model('news_cnn.mdl')

In [71]:
def pred_cnn(p_title):
    '''
    Predict the category of a headline with the loaded model and print it.

    Parameters
    ----------
    p_title : str
        Headline text to classify.

    Prints the headline followed by the predicted category description.
    Returns None.
    '''
    words = [x.lower() for x in nltk.word_tokenize(p_title)]
    # .get(..., 0) sends unseen tokens to id 0 without adding them to the
    # defaultdict, so repeated predictions do not grow the vocabulary.
    wids = [word2index.get(word, 0) for word in words]
    # Take the sequence length from the loaded model instead of repeating the
    # literal 64, so the padding always matches what the model expects.
    seq_len = mymodel.input_shape[1]
    X = pad_sequences([wids], maxlen=seq_len)
    # A single forward pass; argmax over the softmax outputs gives the class
    # id (what predict_classes did, without relying on that deprecated call).
    probs = mymodel.predict(X)
    pred = int(np.argmax(probs[0]))
    # Class ids in the order assigned during preprocessing: 0=e, 1=b, 2=t, 3=m.
    lst = ['e', 'b', 't', 'm']
    print(p_title)
    print('Category: ', class_desc(lst[pred]))

In [72]:
def handle_submit(sender):
    '''
    Click handler: clear the current output, then classify the text
    currently held in the text area.
    '''
    headline = text.value
    clear_output()
    pred_cnn(headline)


# Build the three widgets, wire the button to the handler, then show them.
caption = widgets.Label('Enter an article here:')
text = widgets.Textarea()
button = widgets.Button(description="Classify")
button.on_click(handle_submit)
display(caption, text, button)

Child’s Play Is Good for All of Us
Category:  Entertainment


Testing Examples:
    - IBM sales has grown 33% in the last 5 years
    - Pearson shareholders reject chief executive's £1.5m pay package 
    - World Bank transfers $5.6 million to PA
    - Child’s Play Is Good for All of Us
    - Why Kids Shouldn’t Sit Still in Class
    - Why Deep Breathing May Keep Us Calm
    - In Rare Unity, Hospitals, Doctors and Insurers Criticize Health Bill
    - Scientists can now count birds from space
    - What to expect with the iPhone 8
    - See Mark Hamill prank 'Star Wars' superfans
    - Lyft and Waymo Reach Deal to Collaborate on Self-Driving Cars