# Preprocessing

## Functions used in this section

In [None]:
from sklearn.model_selection import train_test_split

# Register every character bigram that occurs in a text as a key of a
# (character bigram : frequency) dictionary, with its frequency set to 0.
# Calling this repeatedly with the same dictionary builds a multi-lingual
# bigram vocabulary.
# Parameters:
#   text        - Language text to analyse.
#   bigramFreq  - An input dictionary that is extended in place.
# Returns:
#   The same dictionary object that was passed in as bigramFreq.
def makeFreqDict(text, bigramFreq):
    # Pair every element with its successor to walk over adjacent pairs.
    for left, right in zip(text, text[1:]):
        # Bigrams already present are reset to 0 as well, not skipped.
        bigramFreq[str(left + right)] = 0
    return bigramFreq


# Count how often each known character bigram occurs in a text.
# Bigrams that are not keys of the template dictionary are ignored.
# Parameters:
#   text          - Language text to count the frequencies of.
#   templateDict  - The multi-lingual dictionary created via makeFreqDict.
#                   It is copied, so the caller's dictionary is not modified.
# Returns:
#   A list of counts, in the key order of templateDict. Each count starts
#   from the value stored in templateDict for that bigram.
def fillFreqDict(text, templateDict):
    counts = templateDict.copy()

    # Pair every element with its successor to walk over adjacent pairs.
    for left, right in zip(text, text[1:]):
        pair = str(left + right)
        if pair in counts:
            counts[pair] += 1

    return list(counts.values())



# Split the data into consecutive, non-overlapping samples of splitSize
# elements and count the frequency of bigrams within each sample.
# A trailing remainder shorter than splitSize is discarded, so every
# sample has exactly splitSize elements.
# Parameters:
#   data      - Data of large volume that will be split.
#   freqDict  - Multi-lingual template dictionary to count frequencies using.
#   label     - Label detailing the origin language of the data.
#   splitSize - The size of data samples to be taken from data.
# Returns:
#   A list with one entry per sample; each entry is the list of bigram
#   counts returned by fillFreqDict with the label appended as last element.
def itemizeData(data, freqDict, label, splitSize):
    # The "+ 1" makes the last full-size sample reachable: without it the
    # final sample is dropped whenever len(data) is an exact multiple of
    # splitSize (and nothing is returned when len(data) == splitSize).
    stop = len(data) - splitSize + 1

    items = []
    for start in range(0, stop, splitSize):
        row = fillFreqDict(data[start:start + splitSize], freqDict)
        row.append(label)
        items.append(row)
    return items



# Split the data into distinct training and testing sets, with 30% of the
# samples going to the testing set. The split is reproducible because the
# random state is fixed.
# Parameters:
#   data - Language data to be split.
# Returns:
#   A (train, test) pair as produced by train_test_split.
def ttSplit(data):
    # Split the argument itself; "dataSplits" is not defined in this scope.
    # NOTE(review): shuffle=True permutes individual samples. If data is a
    # raw string this presumably shuffles single characters and so breaks
    # the original bigram adjacency - confirm that this is intended.
    train, test = train_test_split(data, test_size=0.3, random_state=42, shuffle=True)
    return train, test

## Reading the data

In [None]:
import re
import numpy as np

# Regex that finds xml tags in the language corpora. It matches an opening
# tag (<tag>), a closing tag (</tag>), a tag with a single attribute
# (<tag name=value>) or a newline. A raw string is used so that "\w" is not
# treated as an (invalid) string escape; "\n" is then resolved by the regex
# engine and still matches a newline.
regexCond = r"<[\w]+>|</[\w]+>|<+[\w]+ \w+=[^<^>]+>|\n"

# Each corpus is read completely into memory. The "with" blocks close the
# files even if reading fails.
# NOTE(review): english.txt is opened with the platform default encoding -
# confirm whether an explicit encoding should be given.
with open("english.txt", "r") as file:                      # Reading in the english corpus.
    engData = file.read()

with open("czech.txt", "r", encoding="utf-16") as file:     # Reading in the czech corpus.
    czechData = file.read()

with open("combined.txt", "r", encoding="utf-16") as file:  # Reading in the igbo corpus.
    igboData = file.read()

## Using regex to remove XML

In [None]:
# Delete every match of regexCond (xml tags and newlines) from the english,
# czech and igbo corpora.
engData, czechData, igboData = (
    re.sub(regexCond, "", corpus)
    for corpus in (engData, czechData, igboData)
)

# Report the combined number of characters left in the three corpora.
print(sum(len(corpus) for corpus in (engData, czechData, igboData)))

13695123


## Find all potential bigrams

In [None]:
# Build the multi-lingual bigram dictionary. makeFreqDict adds keys to the
# dictionary it is given and returns it, so each call extends the result of
# the previous one: english first, then czech, then igbo.
engFreq = makeFreqDict(engData, {})
engCzechFreq = makeFreqDict(czechData, engFreq)
fullFreq = makeFreqDict(igboData, engCzechFreq)

# Split every corpus into a training and a testing part.
engTrain, engTest = ttSplit(engData)
czechTrain, czechTest = ttSplit(czechData)
igboTrain, igboTest = ttSplit(igboData)

# Training data: one row of bigram frequencies per language, with the label
# (0 = english, 1 = czech, 2 = igbo) as the last element of the row.
engTrain = fillFreqDict(engTrain, fullFreq) + [0]
czechTrain = fillFreqDict(czechTrain, fullFreq) + [1]
igboTrain = fillFreqDict(igboTrain, fullFreq) + [2]

# Testing data: bigram frequencies of each 500 element sample, labelled
# in the same way as the training rows.
engTest = itemizeData(engTest, fullFreq, 0, 500)
czechTest = itemizeData(czechTest, fullFreq, 1, 500)
igboTest = itemizeData(igboTest, fullFreq, 2, 500)

# Stack the training rows in the order english, czech, igbo, then move the
# label column out of the matrix into its own one-dimensional array.
trainData = np.vstack([engTrain, czechTrain, igboTrain])
trainLabels = trainData[:, -1].reshape(-1)
trainData = np.delete(trainData, -1, axis=1)

# Same for the testing rows: english samples first, then czech, then igbo.
testData = np.vstack([engTest, czechTest, igboTest])
testLabels = testData[:, -1].reshape(-1)
testData = np.delete(testData, -1, axis=1)

print(testData.shape)


(5477, 6950)


# Training

## Creating the model

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import mode 


# Classify test samples by finding, for each one, the training sample with
# the highest cosine similarity.
# Parameters:
#   test  - The test data to be classified, one sample per row.
#   train - The training data to be used for comparison, one sample per row.
# Returns:
#   A list with one entry per test sample: the row index of the most similar
#   training sample. The index only equals the label when training row i
#   holds the data for label i.
def similarityTest(test, train):
  # Row i of the matrix holds the similarity between test sample i and
  # every training sample.
  scores = cosine_similarity(test, train)
  return [np.argmax(sampleScores, axis=0) for sampleScores in scores]


# Predict a label for every test sample, then score the predictions
# against the known correct labels.
predictions = similarityTest(testData, trainData)

# Fraction of test samples whose prediction matches the correct label.
acc = accuracy_score(y_true=testLabels, y_pred=predictions)
print(acc)

# F1 score, macro averaged over the labels.
f1 = f1_score(y_true=testLabels, y_pred=predictions, average="macro")
print(f1)

1.0
1.0
