In [1]:
import matplotlib
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import matplotlib.pyplot as plt
plt.style.use('ggplot')

import numpy as np
import string

from collections import defaultdict

from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

import joblib
import pickle as pkl

from helper_code import *

In [4]:
# Load previously saved artefacts from disk.
# NOTE(review): `vectorizer` is rebound to a fresh CountVectorizer further down and
# `model` is not referenced again in the visible part of this notebook -- confirm
# whether this cell is still needed.
model = joblib.load('Data/Model/final_model.joblib')
vectorizer = joblib.load('Data/Vectorizer/final_model.joblib')



### Data Exploration and Analysis

In [None]:
def open_file(filename, encoding='utf-8'):
    '''
    Reads a text file and returns its lines as a list of strings.

    filename: path of the file to read.
    encoding: text encoding used to decode the file. Defaults to UTF-8 so that
              non-ASCII characters are decoded the same way on every platform
              instead of depending on the locale default.
              NOTE(review): assumes the sentence files are UTF-8 -- confirm.

    Returns the list produced by readlines(), i.e. line endings are kept.
    '''
    with open(filename, 'r', encoding=encoding) as f:
        return f.readlines()

In [None]:
# Raw training sentences, keyed by language code (insertion order: sk, cs, en).
data_raw = {
    language: open_file(path)
    for language, path in (
        ('sk', 'Data/Sentences/train_sentences.sk'),
        ('cs', 'Data/Sentences/train_sentences.cs'),
        ('en', 'Data/Sentences/train_sentences.en'),
    )
}

In [None]:
def show_statistics(data):
    '''
    Prints summary statistics for every language in `data`.

    data: mapping of language code -> list of sentence strings.

    For each language the sentence count, the number of whitespace-separated
    words, the number of distinct words and the first seven words of the first
    sentence are printed. Nothing is returned.
    '''
    for language, sentences in data.items():
        # Flatten the sentences into a single list of whitespace-separated words.
        word_list = ' '.join(sentences).split()

        number_of_sentences = len(sentences)
        number_of_words = len(word_list)
        number_of_unique_words = len(set(word_list))
        # A language without sentences has no first sentence to sample from;
        # fall back to an empty extract instead of raising IndexError.
        sample_extract = ' '.join(sentences[0].split()[0:7]) if sentences else ''

        print(f'Language: {language}')
        print('-----------------------')
        print(f'Number of sentences\t:\t {number_of_sentences}')
        print(f'Number of words\t\t:\t {number_of_words}')
        print(f'Number of unique words\t:\t {number_of_unique_words}')
        print(f'Sample extract\t\t:\t {sample_extract}...\n')

In [None]:
# Print the per-language statistics for the raw (unprocessed) training sentences.
show_statistics(data_raw)

In [None]:
# NOTE(review): do_law_of_zipf is not defined in this notebook; presumably it is
# provided by the `helper_code` star import -- confirm what it computes/plots.
do_law_of_zipf(data_raw)

In [None]:
def preprocess(text):
    r'''
    Normalises a string before vectorisation.

    Lowercases the text, turns hyphens and newline characters (\n) into spaces
    (so a hyphenated word becomes two words) and deletes ASCII punctuation and
    digits. Returns the cleaned string.
    '''
    # A single table does both remaining jobs: map '\n' to ' ' and drop
    # every punctuation/digit character.
    cleanup_table = str.maketrans('\n', ' ', string.punctuation + string.digits)
    return text.lower().replace('-', ' ').translate(cleanup_table)

In [None]:
# Apply preprocess() to every sentence, keeping the language -> sentences layout.
data_preprocessed = {
    language: [preprocess(line) for line in lines]
    for language, lines in data_raw.items()
}

In [None]:
# Show the statistics before and after preprocessing, one after the other.
print('raw')
show_statistics(data_raw)
print('\nPreProcessed')
show_statistics(data_preprocessed)

### Naive Bayes Model

In [None]:
# Flatten the per-language dict into two parallel lists:
# the sentences and, at the same index, the language code used as label.
sentences_train, y_train = [], []
for language, sentences in data_preprocessed.items():
    sentences_train.extend(sentences)
    y_train.extend([language] * len(sentences))

In [None]:
# Fresh CountVectorizer with default settings; this rebinds the `vectorizer`
# that was loaded from disk near the top of the notebook.
vectorizer = CountVectorizer()

In [None]:
# Fit the vectorizer on the training sentences and encode them in one step.
x_train = vectorizer.fit_transform(sentences_train)

In [None]:
# Bare expression: shows the repr of the encoded training matrix as cell output.
x_train

In [None]:
# Baseline: multinomial Naive Bayes with default hyper-parameters,
# trained on the encoded training sentences and their language labels.
naive_classifier = MultinomialNB()
naive_classifier.fit(x_train, y_train)

### Vectorizing Validation Data and Evaluating Model

In [None]:
# Raw validation sentences, keyed by language code (insertion order: sk, cs, en).
data_val = {
    language: open_file(path)
    for language, path in (
        ('sk', 'Data/Sentences/val_sentences.sk'),
        ('cs', 'Data/Sentences/val_sentences.cs'),
        ('en', 'Data/Sentences/val_sentences.en'),
    )
}

# Same cleaning as for the training data, so both splits share one text format.
data_val_preprocessed = {
    language: [preprocess(line) for line in lines]
    for language, lines in data_val.items()
}

In [None]:
# Flatten the validation dict into parallel lists of sentences and language labels.
sentences_val, y_val = [], []
for language, sentences in data_val_preprocessed.items():
    sentences_val.extend(sentences)
    y_val.extend([language] * len(sentences))

In [None]:
# Encode the validation sentences with the already-fitted vectorizer
# (transform only -- the vectorizer is not refitted on validation data).
x_val = vectorizer.transform(sentences_val)

In [None]:
# Predict a language label for every encoded validation sentence.
predictions = naive_classifier.predict(x_val)

In [None]:
# NOTE(review): plot_confusion_matrix is not defined in this notebook; presumably
# it comes from the `helper_code` star import -- confirm its signature.
plot_confusion_matrix(y_val, predictions, ['sk','cs','en'])

In [None]:
# F1 score of the baseline predictions, combined across languages with
# average='weighted'; shown as the cell output.
f1_score(y_val, predictions, average='weighted')

### Model Improvement

In [None]:
# Second attempt: same features, but with a much smaller smoothing parameter
# (alpha=0.0001) and fit_prior=False, i.e. class priors are not estimated from
# the training labels. Rebinds `naive_classifier` and `predictions`.
naive_classifier = MultinomialNB(alpha=0.0001, fit_prior = False)
naive_classifier.fit(x_train, y_train)

predictions = naive_classifier.predict(x_val)
plot_confusion_matrix(y_val, predictions, ['sk','cs','en'])

In [None]:
# F1 score (average='weighted') of the tuned classifier, for comparison with the baseline.
f1_score(y_val, predictions, average='weighted')

In [None]:
# taken from https://arxiv.org/abs/1508.07909

import re, collections
def get_stats(vocab):
    '''
    Counts adjacent symbol pairs across a vocabulary.

    vocab: mapping of space-separated symbol sequence -> frequency.

    Returns a defaultdict mapping (left_symbol, right_symbol) to the summed
    frequency of all entries in which the two symbols appear next to each other.
    '''
    pair_counts = collections.defaultdict(int)
    for entry, count in vocab.items():
        tokens = entry.split()
        # zip against the shifted list yields each neighbouring pair once.
        for left, right in zip(tokens, tokens[1:]):
            pair_counts[left, right] += count
    return pair_counts

def merge_vocab(pair, v_in):
    '''
    Fuses every occurrence of a symbol pair in the vocabulary into one symbol.

    pair: tuple (left_symbol, right_symbol) to merge.
    v_in: mapping of space-separated symbol sequence -> frequency.

    Returns a new dict in which each key has the adjacent symbols
    "left right" replaced by "leftright"; frequencies are carried over.
    '''
    merged = ''.join(pair)
    bigram = re.escape(' '.join(pair))
    # The lookarounds make sure only whole symbols match, never the tail or
    # head of a longer symbol.
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    # A callable replacement inserts `merged` literally. Passing the string
    # itself would make re treat backslashes in it as template escapes
    # (e.g. '\\' + 'n' would turn into a newline).
    return {p.sub(lambda _match: merged, word): freq for word, freq in v_in.items()}

In [None]:
def get_vocab(data):
    '''
    Builds a character-level vocabulary from a collection of sentences.

    data: iterable of sentence strings.

    Returns a defaultdict mapping each word, written as its characters
    separated by single spaces (e.g. 'l o w'), to the number of times the
    word occurs in `data`.
    '''
    vocab = defaultdict(int)
    for sentence in data:
        for token in sentence.split():
            # ' '.join on a string puts a space between its characters.
            vocab[' '.join(token)] += 1
    return vocab

In [None]:
# Character-split word frequencies over all training sentences (all languages together).
vocab = get_vocab(sentences_train)

In [None]:
# Merge loop, also taken from the paper referenced above get_stats/merge_vocab:
# each of the 100 iterations fuses the currently most frequent adjacent symbol
# pair everywhere in the vocabulary.
# NOTE: `vocab` is rebound on every iteration, so re-running this cell without
# rebuilding `vocab` first performs 100 further merges on the already merged vocabulary.
for i in range(100):
    pairs = get_stats(vocab)
    best = max(pairs, key=pairs.get)  # pair with the highest summed frequency
    vocab = merge_vocab(best, vocab)

In [None]:
# Total frequency of every multi-character symbol left in the merged vocabulary.
merges = defaultdict(int)
for entry, count in vocab.items():
    for piece in entry.split():
        if len(piece) < 2:
            # Single characters were never merged; they are not subwords.
            continue
        merges[piece] += count

In [None]:
# Subwords sorted from most to least frequent.
merge_ordered = sorted(merges, key=merges.get, reverse=True)

In [None]:
# Persist the ordered subword list. The context manager closes (and therefore
# flushes) the file deterministically instead of leaving that to garbage
# collection, so the pickle is complete before anything reads it back.
with open('Data/Auxiliary/merge_ordered.pkl', 'wb') as f:
    pkl.dump(merge_ordered, f)

In [None]:
def split_into_subwords(text):
    merges = pkl.load(open('Data/Auxiliary/merge_ordered.pkl', 'rb'))
    subwords = []
    for word in text.split():
        for subword in merges:
            subword_count = word.count(subword)
            if subword_count > 0:
                word = word.replace(subword, ' ')
                subwords.extend([subword]*subword_count)
    return ' '.join(subwords)

In [None]:
# Quick manual check of the subword splitter on a sample sentence.
split_into_subwords('hello my name is vinay')

In [None]:
# Subword version of the preprocessed training data, same language -> sentences layout.
data_processed_subwords = {
    language: [split_into_subwords(line) for line in lines]
    for language, lines in data_preprocessed.items()
}

In [None]:
# Same statistics as before, now counting subwords instead of whole words.
show_statistics(data_processed_subwords)

In [None]:
# Subword-encode the flat training sentence list (order matches y_train).
data_train_subwords = [split_into_subwords(text) for text in sentences_train]

In [None]:
# Subword-encode the flat validation sentence list (order matches y_val).
data_val_subwords = [split_into_subwords(text) for text in sentences_val]

In [None]:
# New CountVectorizer for the subword representation; rebinds `vectorizer`,
# so the word-level vectorizer fitted earlier is no longer reachable by that name.
vectorizer = CountVectorizer()

In [None]:
# Fit on the subword training text, then encode the validation text with the same fit.
# NOTE: capital-X names here are distinct from the word-level x_train / x_val above.
X_train = vectorizer.fit_transform(data_train_subwords)
X_val = vectorizer.transform(data_val_subwords)

In [None]:
# Naive Bayes on subword counts: default alpha, fit_prior=False.
# Rebinds `naive_classifier` once more.
naive_classifier = MultinomialNB(fit_prior=False)
naive_classifier.fit(X_train, y_train)

In [None]:
# Predict languages for the subword-encoded validation sentences (rebinds `predictions`).
predictions = naive_classifier.predict(X_val)

In [None]:
# Confusion matrix for the subword model.
# NOTE(review): plot_confusion_matrix presumably comes from `helper_code` -- confirm.
plot_confusion_matrix(y_val, predictions, ['sk', 'cs', 'en'])

In [None]:
# F1 score (average='weighted') of the subword model, shown as the cell output.
f1_score(y_val, predictions, average='weighted')