# Text Classification Tutorial
based on https://www.opencodez.com/python/text-classification-using-keras.htm

- Goal: classify news articles by their author (50 authors in total)
- Dataset: Reuter_50_50 (http://archive.ics.uci.edu/ml/datasets/Reuter_50_50), which contains a training folder and a test folder

In [2]:
import pandas as pd
import numpy as np
import pickle
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras import metrics
from keras.layers import Activation, Dense, Dropout
from sklearn.preprocessing import LabelBinarizer
import sklearn.datasets as skds
from pathlib import Path

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Retrieve Data

In [77]:
# Seed NumPy's global random generator so repeated runs behave the same.
np.random.seed(1237)
 
# Directory holding the training split of the C50 corpus.
path_train = "./resources/C50/C50train"
 
# load_content=False: only collect file paths and labels here; the article
# text itself is read from disk in a later cell.
files_train = skds.load_files(path_train,load_content=False)

In [78]:
# Inspect the container type returned by load_files.
type(files_train)

sklearn.utils.Bunch

In [79]:
# Unpack the fields of the loaded Bunch that the following cells use.
file_paths = files_train.filenames  # array with the path to every file

label_names = files_train.target_names  # all label names

labelled_files_index = files_train.target  # array with the numerical representation of the label for each file

 



In [89]:
# Deliberately index one past the end to confirm the dataset size: the lookup
# fails because valid indices stop at len(labelled_files_index) - 1.
# The error is caught and printed so the cell still shows the message without
# aborting a "Restart Kernel -> Run All".
try:
    labelled_files_index[2500]
except IndexError as err:
    print("IndexError:", err)

IndexError: index 2500 is out of bounds for axis 0 with size 2500

In [91]:
data_tags = ["filename", "category", "article"]

# Walk the file paths and their numeric labels in lockstep, producing one
# (path, label name, article text) record per file.
data_list = [
    (file_path, label_names[label_idx], Path(file_path).read_text())
    for file_path, label_idx in zip(file_paths, labelled_files_index)
]

# Tabular view of the training data: one row per article with the columns
# filename, category and article.
data = pd.DataFrame.from_records(data_list, columns=data_tags)

In [92]:
# Preview the first rows to check that paths, labels and texts line up.
data.head()

Unnamed: 0,filename,category,article
0,./resources/C50/C50train\AlanCrosby\104278news...,AlanCrosby,The Czech Republic and Spain played to a score...
1,./resources/C50/C50train\TimFarrand\234885news...,TimFarrand,"British brewer-to-leisure group Bass Plc, said..."
2,./resources/C50/C50train\TheresePoletti\136917...,TheresePoletti,Unisys Corp. posted improved results in the th...
3,./resources/C50/C50train\JaneMacartney\247009n...,JaneMacartney,Intense haggling among the world's trade minis...
4,./resources/C50/C50train\JoeOrtiz\100554newsML...,JoeOrtiz,Channel tunnel operator Eurotunnel on Monday a...


## Prepare Data

- we split the training data into 80/20
- each element contains content, tag(category) and file name

### Preprocessing
- tokenization (keras Tokenizer) of the content of each document
- the tokenizer transforms each text into a vector using TF-IDF weighting
- encoding of tags

In [93]:
# The first 80% of the rows are used for training, the remaining 20% for testing.
train_size = int(len(data) * 0.8)

# Slice the frame once per split, then pick the individual columns.
train_rows = data.iloc[:train_size]
test_rows = data.iloc[train_size:]

train_posts = train_rows['article']
train_tags = train_rows['category']
train_files_names = train_rows['filename']

test_posts = test_rows['article']
test_tags = test_rows['category']
test_files_names = test_rows['filename']

In [None]:
# Hyper-parameters used by the cells below.
num_labels = 50  # number of target classes (one per author); an earlier value of 20 is left over from a newsgroups setup
vocab_size = 15000  # vocabulary is restricted to 15000 words
batch_size = 100  # batch size passed to model.fit and model.evaluate
 
# Define the Tokenizer with the vocabulary size and build its word index from
# the training articles only (the test articles are not used for fitting).
tokenizer = Tokenizer(num_words=vocab_size)  # keeps the [vocab_size] most frequent words
tokenizer.fit_on_texts(train_posts)

In [116]:
# Document frequency: for each word, the number of fitted texts it occurs in.
# NOTE(review): this displays the whole mapping; consider showing only a sample.
tokenizer.word_docs #doc_frequency

defaultdict(int,
            {'big': 408,
             'teams': 21,
             'early': 469,
             'over': 1016,
             'keep': 212,
             'our': 567,
             'euro': 6,
             'attacker': 3,
             'martinez': 10,
             'last': 1051,
             "midfielder's": 1,
             'hapless': 3,
             '8': 474,
             'to': 1998,
             'since': 610,
             'have': 1621,
             'campaigns': 16,
             'barjuan': 3,
             'nedved': 4,
             'amor': 4,
             'turnaround': 43,
             'robbed': 2,
             'was': 1685,
             'its': 1633,
             'teenage': 5,
             'in': 1997,
             'added': 579,
             'coach': 5,
             'jiri': 15,
             'could': 893,
             'jan': 39,
             'too': 269,
             'guerrero': 3,
             'karel': 7,
             'clear': 198,
             'and': 1993,
             'strongly': 64,
  

### Vectorization

In [100]:
# Turn every article into a fixed-width TF-IDF vector using the tokenizer
# fitted on the training articles; the same tokenizer is applied to both splits.
x_train = tokenizer.texts_to_matrix(train_posts, mode='tfidf')
x_test = tokenizer.texts_to_matrix(test_posts, mode='tfidf')

In [128]:
# One-hot encode the author names. The class list is learned from the training
# tags and then reused unchanged for the test tags.
encoder = LabelBinarizer()
y_train = encoder.fit_transform(train_tags)
y_test = encoder.transform(test_tags)

# Show the learned class order and the encoded vector of the first training row.
print(encoder.classes_)
print(y_train[0])

['AaronPressman' 'AlanCrosby' 'AlexanderSmith' 'BenjaminKangLim'
 'BernardHickey' 'BradDorfman' 'DarrenSchuettler' 'DavidLawder'
 'EdnaFernandes' 'EricAuchard' 'FumikoFujisaki' 'GrahamEarnshaw'
 'HeatherScoffield' 'JanLopatka' 'JaneMacartney' 'JimGilchrist'
 'JoWinterbottom' 'JoeOrtiz' 'JohnMastrini' 'JonathanBirt' 'KarlPenhaul'
 'KeithWeir' 'KevinDrawbaugh' 'KevinMorrison' 'KirstinRidley'
 'KouroshKarimkhany' 'LydiaZajc' "LynneO'Donnell" 'LynnleyBrowning'
 'MarcelMichelson' 'MarkBendeich' 'MartinWolk' 'MatthewBunce'
 'MichaelConnor' 'MureDickie' 'NickLouth' 'PatriciaCommins'
 'PeterHumphrey' 'PierreTran' 'RobinSidel' 'RogerFillion' 'SamuelPerry'
 'SarahDavison' 'ScottHillis' 'SimonCowell' 'TanEeLyn' 'TheresePoletti'
 'TimFarrand' 'ToddNissen' 'WilliamKazer']
[0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]


## Model Data

- input: TF-IDF vector of size vocab_size
- first hidden layer: Dense (nodes = 512, activation = sigmoid, dropout = 0.3)
- second hidden layer: Dense (nodes = 512, activation = sigmoid, dropout = 0.3)
- output layer: Dense (nodes = num_labels = 50, activation = softmax, no dropout)

The model is then fitted on the training features (x_train) and their one-hot encoded labels (y_train).

In [131]:
# Feed-forward classifier over the TF-IDF vectors:
#   vocab_size inputs -> 512 sigmoid units -> 512 sigmoid units -> num_labels softmax outputs,
# with 30% dropout after each of the two hidden layers.
model = Sequential([
    Dense(512, input_shape=(vocab_size,)),
    Activation('sigmoid'),
    Dropout(0.3),
    Dense(512),
    Activation('sigmoid'),
    Dropout(0.3),
    Dense(num_labels),
    Activation('softmax'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_16 (Dense)             (None, 512)               7680512   
_________________________________________________________________
activation_16 (Activation)   (None, 512)               0         
_________________________________________________________________
dropout_11 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_17 (Dense)             (None, 512)               262656    
_________________________________________________________________
activation_17 (Activation)   (None, 512)               0         
_________________________________________________________________
dropout_12 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_18 (Dense)             (None, 50)                25650     
__________

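The model is compiled with the Adam optimizer and categorical cross-entropy loss, tracking accuracy and categorical cross-entropy as metrics. (TODO: explain these choices in more detail.)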

In [139]:
# Configure training: categorical cross-entropy loss with the Adam optimizer.
# Accuracy and categorical cross-entropy are reported as additional metrics.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy', metrics.categorical_crossentropy])
 
# Train for 10 epochs. validation_split=0.1 holds out 10% of the training
# samples for validation; the returned History object is kept in `history`.
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=10,
                    verbose=1, #verbosity mode
                    validation_split=0.1)

Train on 1800 samples, validate on 200 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2200886b908>

## Evaluation

In [140]:
# Score the trained model on the held-out 20% split. The result is a list with
# the loss followed by the metrics passed to model.compile.
model.evaluate(x_test, y_test,
                    batch_size=batch_size,
                    verbose=1)



[0.7395891904830932, 0.8580000042915344, 0.7395891904830932]

In [141]:
# Names for the values returned by model.evaluate, in the same order.
model.metrics_names

['loss', 'acc', 'categorical_crossentropy']

## Predicting some unseen documents

- files are taken from the test folder
- steps: content tokenization, prediction, comparison with actual tag

In [149]:
# A handful of documents from the separate test folder; the author name is the
# parent directory of each file.
test_files = ["./resources/C50/C50test/LydiaZajc/45801newsML.txt",
             "./resources/C50/C50test/LydiaZajc/377881newsML.txt",
             "./resources/C50/C50test/LydiaZajc/416661newsML.txt",
             "./resources/C50/C50test/FumikoFujisaki/416452newsML.txt",
             "./resources/C50/C50test/KouroshKarimkhany/357767newsML.txt"]

# Read the raw article text of every selected file.
x_data = [Path(t_f).read_text() for t_f in test_files]

# Vectorize with the tokenizer fitted on the training articles so the feature
# columns match what the model was trained on.
x_data_series = pd.Series(x_data)
x_tokenized = tokenizer.texts_to_matrix(x_data_series, mode='tfidf')

print(x_data_series)

# Predict all documents in a single batched call instead of one call per row.
predictions = model.predict(x_tokenized)

# zip keeps each file aligned with its prediction without a manual counter.
for test_file, class_probabilities in zip(test_files, predictions):
    # The most probable class index maps back to the author name.
    predicted_label = encoder.classes_[np.argmax(class_probabilities)]
    print("File ->", test_file, "Predicted label: " + predicted_label)
    print("********************************")

0    The Toronto Stock Exchange's move to decimal t...
1    The Toronto stock market ended mixed in heavy ...
2    Software maker Cognos Inc sees another strong ...
3    Japanese workers are facing another year of ma...
4    Apple Computer Inc. said Tuesday it will conso...
dtype: object
File -> ./resources/C50/C50test/LydiaZajc/45801newsML.txt Predicted label: LydiaZajc
********************************
File -> ./resources/C50/C50test/LydiaZajc/377881newsML.txt Predicted label: LydiaZajc
********************************
File -> ./resources/C50/C50test/LydiaZajc/416661newsML.txt Predicted label: KouroshKarimkhany
********************************
File -> ./resources/C50/C50test/FumikoFujisaki/416452newsML.txt Predicted label: FumikoFujisaki
********************************
File -> ./resources/C50/C50test/KouroshKarimkhany/357767newsML.txt Predicted label: KouroshKarimkhany
********************************
