# Introduction to Neural Networks: Classification

Author: Pierre Nugues

## A Dataset: *Salammbô* in French and English

### Understanding the Dataset

The dataset consists of the counts of all the letters and of the letter _A_, broken down by chapter, in *Salammbô* in French and in its English translation. The $\mathbf{X}$ matrix contains the counts and $\mathbf{y}$ the language; French: 1 and English: 0.

In [None]:
import numpy as np
np.random.seed(0)

# One row per chapter: column 0 is the letter count, column 1 the count of A.
# Rows 0-14 are the English chapters, rows 15-29 the French ones.
X = np.array([
    [35680, 2217],
    [42514, 2761],
    [15162, 990],
    [35298, 2274],
    [29800, 1865],
    [40255, 2606],
    [74532, 4805],
    [37464, 2396],
    [31030, 1993],
    [24843, 1627],
    [36172, 2375],
    [39552, 2560],
    [72545, 4597],
    [75352, 4871],
    [18031, 1119],
    [36961, 2503],
    [43621, 2992],
    [15694, 1042],
    [36231, 2487],
    [29945, 2014],
    [40588, 2805],
    [75255, 5062],
    [37709, 2643],
    [30899, 2126],
    [25486, 1784],
    [37497, 2641],
    [40398, 2766],
    [74105, 5047],
    [76725, 5312],
    [18317, 1215],
], dtype='float64')

# Labels aligned with the rows of X: 0 for English, 1 for French.
y = np.array([0] * 15 + [1] * 15)

### Visualizing the Data

In [None]:
import matplotlib.pyplot as plt

# The first half of the rows holds the English chapters, the second half
# the French ones.
X_english, X_french = np.split(X, 2)

# One series per language; transposing yields the two columns
# (letter count, A count) as the x and y coordinates.
en = plt.scatter(*X_english.T, c='r', marker='x')
fr = plt.scatter(*X_french.T, c='b', marker='x')

plt.title("Salammbô")
plt.xlabel("Letter count")
plt.ylabel("$A$ count")
plt.legend(
    [en, fr],
    ['English', 'French'],
    loc='lower right',
    scatterpoints=1,
)
plt.show()

It is probably possible to determine the language using the counts.

## Building a Larger Dataset

Let us extend the dataset to cover all the paragraphs in the novel, French and English, where we will count all the letters. A paragraph will be delimited by the newline character: `\n`.<br/> Structure of the dataset:<br/>

|Obs.#   | $\mathbf{X}$                              | $\mathbf{y}$  |
|-------|----------------------------------|--------|
|para_1 | a_counts, b_counts, c_counts, ...| English|
|para_2 |a_counts, b_counts, c_counts, ... |English |
|...    |...                               |...    |
|para_1000| a_counts, b_counts, c_counts, ...|French|
|para_1001| a_counts, b_counts, c_counts, ...| French|
|...      |...                               |...     |

### A Function to Count Characters

In [None]:
def count_chars(string):
    """Count the alphabetic characters of a string.

    Args:
        string: The text to analyze.

    Returns:
        A dict mapping every character for which ``isalpha()`` is true to
        its number of occurrences, keyed in order of first appearance.
        Characters are not case-folded here. The dict is empty when the
        text contains no alphabetic character.
    """
    # Imported locally so that the function is self-contained.
    from collections import Counter

    # Counter does the tallying; convert back to a plain dict so that the
    # return type is unchanged for callers.
    return dict(Counter(char for char in string if char.isalpha()))

### We Extract the Counts

In [None]:
# Read both versions of the novel, stripped of surrounding whitespace and
# lowercased. The context managers close the files once they are read.
# NOTE(review): the corpus files are assumed to be UTF-8 encoded; the
# explicit encoding avoids relying on the platform default for the accented
# French text -- confirm against the actual files.
with open('../../EDAN20/programs/corpus/Salammbo/salammbo_en.txt',
          encoding='utf-8') as f_en:
    salammbo_en = f_en.read().strip().lower()
with open('../../EDAN20/programs/corpus/Salammbo/salammbo.txt',
          encoding='utf-8') as f_fr:
    salammbo_fr = f_fr.read().strip().lower()

# A paragraph is a non-empty line. Lists (rather than one-shot filter
# iterators) can be traversed again if a later cell is re-run.
para_en = [para for para in salammbo_en.split('\n') if para]
para_fr = [para for para in salammbo_fr.split('\n') if para]

In [None]:
# Preview: the first ten non-empty lines of the French text.
[para for para in salammbo_fr.split('\n') if para][:10]

In [None]:
# Character counts per paragraph; paragraphs that yield an empty dict
# (no alphabetic character) are dropped.
counts_en = [char_counts for char_counts in map(count_chars, para_en) if char_counts]
counts_fr = [char_counts for char_counts in map(count_chars, para_fr) if char_counts]
counts_fr[:5]

In [None]:
# All the observations: the English paragraphs first, then the French ones.
counts = [*counts_en, *counts_fr]

### Formatting the Dataset

We associate the characters with indices

In [None]:
from sklearn.feature_extraction import DictVectorizer

# Learn the character-to-column mapping from all the paragraphs, English and
# French; sparse=False makes transform() return dense arrays.
dict_vec = DictVectorizer(sparse=False).fit(counts)
dict_vec.vocabulary_

We convert the dictionaries into matrices

In [None]:
# Turn each list of count dictionaries into a matrix with one row per
# paragraph, using the fitted vectorizer.
X_en, X_fr = (dict_vec.transform(para_counts)
              for para_counts in (counts_en, counts_fr))
print(X_en.shape, X_fr.shape, sep='\n')

We stack the French and English datasets

In [None]:
# English rows on top of the French rows, with matching labels:
# 0 for each English paragraph, 1 for each French one.
X = np.concatenate([X_en, X_fr], axis=0)
y = len(counts_en) * [0] + len(counts_fr) * [1]
X[:5]

In [None]:
# Display the first five labels.
y[:5]

### Training and Validation Sets

#### We shuffle the indices

In [None]:
# Draw a random ordering of the row numbers and apply the same ordering to
# the observations and to their labels, so that they stay aligned.
indices = [*range(X.shape[0])]
np.random.shuffle(indices)
print(indices[:10])
X, y = X[indices, :], np.array(y)[indices]

#### We split the dataset

In [None]:
# The first 80% of the rows form the training set, the rest the validation set.
training_examples = int(0.8 * X.shape[0])

X_train, X_val = X[:training_examples, :], X[training_examples:, :]
y_train, y_val = y[:training_examples], y[training_examples:]

### Standardizing the Dataset

We standardize the dataset. This is very significant for the final result

In [None]:
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training set only; the same fitted
# transformation is then applied to both the training and validation sets.
std = StandardScaler().fit(X_train)

X_train, X_val = std.transform(X_train), std.transform(X_val)
X_train[:3]

## A Simple Feed-Forward Network

### The Model

We create a simple architecture. This is simply logistic regression

In [None]:
from tensorflow.keras import models
from tensorflow.keras import layers

# A single sigmoid unit connected to all the input features.
# The input size is declared with an explicit Input object: passing
# `input_dim` to the first layer is deprecated in recent Keras versions.
model = models.Sequential([
    layers.Input(shape=(X.shape[1],)),
    layers.Dense(1, activation='sigmoid')])

# Configuring the training: loss, optimizer, and reported metric
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
model.summary()

We fit the model

In [None]:
# Train for 100 epochs with batches of 32 examples; the validation set is
# passed to fit() so that the history also records validation metrics.
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=100,
    verbose=1,
)

### The Model Weights

In [None]:
# Display the weights of the fitted model.
model.get_weights()

### Visualizing The Training Process

In [None]:
import matplotlib.pyplot as plt

# Per-epoch curves recorded during training.
acc, val_acc, loss, val_loss = (
    history.history[key]
    for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
)
epochs = range(1, 1 + len(acc))

# Accuracy curves: dots for training, a line for validation.
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

# Loss curves, drawn in a new figure.
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()


### Predicting and Evaluating

In [None]:
# Model outputs for the validation set, shown next to the true labels.
y_predicted = model.predict(X_val)
print(y_predicted[:10])
print(y_val[:10])

# evaluate the model
scores = model.evaluate(X_val, y_val)
print('Scores:', scores)
# Printed explicitly: as a bare expression, the formatted string is discarded
# when run as a script and only echoed as a repr at the end of a notebook cell.
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))

### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
# Threshold the model outputs at 0.5 (below 0.5 -> class 0, otherwise
# class 1) and flatten them into a 1-D label vector before comparing them
# with the true validation labels.
y_pred = np.where(y_predicted < 0.5, 0, 1).ravel()
confusion_matrix(y_val, y_pred)

## A More Complex Model

We build a more complex model with two layers

In [None]:
# A hidden layer of 20 ReLU units, dropout with rate 0.5, and a sigmoid
# output unit. The input size is declared with an explicit Input object:
# passing `input_dim` to the first layer is deprecated in recent Keras versions.
model = models.Sequential([
    layers.Input(shape=(X.shape[1],)),
    layers.Dense(20, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid')
])

# Configuring the training: loss, optimizer, and reported metric
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
model.summary()

And we fit it

In [None]:
# Train silently (verbose=0) for 20 epochs with batches of 32 examples; the
# validation set is passed to fit() so that validation metrics are recorded.
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=20,
    verbose=0,
)

### The Model Weights

In [None]:
# Display the weights of the fitted model.
model.get_weights()

### Visualizing The Training Process

In [None]:
import matplotlib.pyplot as plt

# Per-epoch curves recorded during training.
acc, val_acc, loss, val_loss = (
    history.history[key]
    for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
)
epochs = range(1, 1 + len(acc))

# Accuracy curves: dots for training, a line for validation.
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

# Loss curves, drawn in a new figure.
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()

### Predicting

In [None]:
# Model outputs for the validation set.
y_predicted = model.predict(X_val)
print(y_predicted[:10])

# evaluate the model
scores = model.evaluate(X_val, y_val)
print(scores)
# Printed explicitly: as a bare expression, the formatted string is discarded
# when run as a script and only echoed as a repr at the end of a notebook cell.
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))

### Confusion Matrix

In [None]:
# Threshold the model outputs at 0.5 (below 0.5 -> class 0, otherwise
# class 1) and flatten them into a 1-D label vector before comparing them
# with the true validation labels.
y_pred = np.where(y_predicted < 0.5, 0, 1).ravel()
confusion_matrix(y_val, y_pred)

### For a real system, see the _Compact Language Detector v3_: https://github.com/google/cld3
and a reimplementation: https://github.com/pnugues/language-detector