In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from helpers import get_Xy
from termcolor import colored

import numpy as np

Setting the random seed for the experiments.

In [3]:
# Single seed value used for the experiments in this notebook.
seed = 1

# Seeds NumPy's global RNG, so code that draws from np.random without its own
# random state is repeatable when the cells are run in order from a fresh kernel.
np.random.seed(seed)

# Split and preprocess text data

Getting data.

*processed_data_list* is a list of sentence lists. The top-level index selects a file and returns all of its sentences; the second-level index selects a particular sentence within that file.

*binary_labels_lists* has the same structure, but contains **0** if the sentence is normal and **1** if the sentence contains ads.

In [4]:
# Load per-file sentences and their per-sentence binary labels.
# NOTE(review): return_raw=False presumably requests the preprocessed text — confirm in helpers.get_Xy.
processed_data_list, binary_labels_lists = get_Xy(return_raw=False)

Checking that both lists contain the same number of files.

In [5]:
# Fail fast if sentences and labels are misaligned, instead of relying on a
# visual comparison of the displayed numbers. Later cells pair the two lists
# with zip(), which would silently truncate on a mismatch.
assert len(processed_data_list) == len(binary_labels_lists), \
    "number of sentence files and label files differ"
assert all(
    len(sentences) == len(labels)
    for sentences, labels in zip(processed_data_list, binary_labels_lists)
), "a file has a different number of sentences and labels"

# Still display both counts as the cell output.
len(processed_data_list), len(binary_labels_lists)

(37, 37)

Looking at the data. Red text is an advertisement.

Change *file_number* to look at a different file.

In [6]:
file_number = 15

# Walk through the chosen file sentence by sentence; sentences labelled 1
# (advertisement) are wrapped in red, everything else is printed unchanged.
for sent, label in zip(processed_data_list[file_number], binary_labels_lists[file_number]):
    print(colored(sent, 'red') if label == 1 else sent)

[31mno one likes to feel stuck especially by your cloud but the ibm cloud is the most open and secure public cloud for business[0m
[31mit can manage all your apps and data anywhere[0m
[31msmart loves problems[0m
[31mibm lets put smart to work[0m
[31mvisit ibm dot com slash flexible[0m
[31mhello this is ron burgundy and you are listening to my voice which commands trust and respect[0m
[31mguess what[0m
[31mmy podcast is back and thats a win for everyone[0m
[31mif youre a longtime listener to the show you probably already know the deal[0m
[31meach week i bring you hard hitting journalism and also light entertainment[0m
[31mi contain multitudes[0m
[31mfind the ron burgundy podcast on the i heart radio app apple podcast or wherever you get your podcasts[0m
[31mwelcome to step you should know a production of i heart radios how stuff works[0m
hey welcome to the podcast
im josh clark
theres charles to chuck
theres guest producer role over there
and that makes this st

Split file indexes into train/test sets

In [7]:
from sklearn.model_selection import train_test_split

# The split is done over file indexes (not individual sentences), so all
# sentences of one file end up on the same side of the split.
# random_state is passed explicitly: without it the shuffle draws from NumPy's
# global RNG, and the result would depend on whatever earlier cells consumed
# from that generator (or on re-running this cell).
train_indexes, test_indexes = train_test_split(
    np.arange(len(processed_data_list)),
    random_state=seed,
)
print("train files", train_indexes)
print("test files", test_indexes)

train files [30 17 28 34 31 26  4 14 10 33 23 32 20 18  6 13  7 36  1 16  0 15  5 11
  9  8 12]
test files [ 2 29  3 22 25 27 21 35 19 24]


In [8]:
# Pick the per-file sentences and the per-file labels for each split, keeping
# the file order given by train_indexes / test_indexes.
X_raw_train, y_train_list = (
    [per_file[i] for i in train_indexes]
    for per_file in (processed_data_list, binary_labels_lists)
)
X_raw_test, y_test_list = (
    [per_file[i] for i in test_indexes]
    for per_file in (processed_data_list, binary_labels_lists)
)

Checking the number of files with positive labels in train and test sets.

In [9]:
def number_of_files_with_ads(labels_list):
    """Count the files that contain at least one advertisement sentence.

    labels_list: iterable of per-file label sequences.
    Returns the number of sequences in which the value 1 occurs.
    """
    return sum(1 for file_labels in labels_list if 1 in file_labels)

# Each line shows: files containing at least one ad sentence, then the total
# number of files in that split (train first, test second).
print(number_of_files_with_ads(y_train_list), len(train_indexes))
print(number_of_files_with_ads(y_test_list), len(test_indexes))

23 27
10 10


# Creating fasttext embeddings

Creating a flat list with sentences from all training data.

In [10]:
from itertools import chain

# Concatenate the per-file sentence lists of the training split into a single
# list, preserving file order and sentence order.
flat_train_data = list(chain.from_iterable(X_raw_train))

In [11]:
# Total number of sentences across all training files.
len(flat_train_data)

18941

Training fasttext on our corpus.

In [12]:
from fasttext_classifier import FastTextClassifier

In [13]:
# model='skipgram' makes fit() train an unsupervised embedding model (the fit
# log reports "Train unsupervised model."); dim=100 matches the 100-column
# matrices produced by ft.transform later on.
# NOTE(review): minn/maxn presumably set the character n-gram length range and
# epoch the number of training passes — confirm in fasttext_classifier.
ft = FastTextClassifier(verbose=1, epoch=200, minn=1, maxn=6, dim=100, model='skipgram')

In [14]:
# Fit on training sentences only, so sentences from the test files do not
# influence the learned embeddings.
ft.fit(flat_train_data)

Number of sentences: 18941
Train unsupervised model.
Model trained with user parameters:
{'verbose': 1, 'epoch': 200, 'minn': 1, 'maxn': 6, 'dim': 100, 'model': 'skipgram'}.


FastTextClassifier(bucket=2000000, dim=100, epoch=200, label='__label__',
                   loss='ns', lr=0.05, lrUpdateRate=100, maxn=6, minCount=1,
                   minCountLabel=0, minn=1, model='skipgram', neg=5,
                   pretrainedVectors='', t=0.0001, thread=12, verbose=1,
                   wordNgrams=1, ws=5)

Creating sentence embeddings. Each sentence in the data set is transformed into a vector.

*X_train_list*, *X_test_list* - lists of data matrices, one per file, each of shape (number_of_sentences × number_of_ft_features)

In [15]:
from typing import List

def create_sentences_embeddings(processed_data_list: List[List[str]], indexes: List[int], ft: FastTextClassifier) -> List[np.ndarray]:
    """Embed the sentences of the selected files.

    For every index in ``indexes`` (in the given order) the sentences stored at
    ``processed_data_list[index]`` are passed to ``ft.transform``.

    Returns a list holding one ``ft.transform`` result per selected file.
    """
    return [ft.transform(processed_data_list[file_idx]) for file_idx in indexes]

In [16]:
# One embedding result per training file, in train_indexes order.
X_train_list = create_sentences_embeddings(processed_data_list, train_indexes, ft)

Looking at the shapes of the training files' matrices.

In [17]:
def print_embed_data(X_list, y_list, processed_data_list, indexes):
    """Print one summary line per entry of ``X_list``.

    Each line shows the entry's position, the ``.shape`` of its matrix, the
    number of sentences in ``processed_data_list[indexes[position]]`` and the
    length of ``y_list[position]``, so the three sizes can be compared by eye.
    """
    template = "{0}: matrix shape {1}, # sents {2}, len(y) = {3}"
    for position, matrix in enumerate(X_list):
        shape = matrix.shape
        n_sentences = len(processed_data_list[indexes[position]])
        n_labels = len(y_list[position])
        print(template.format(position, shape, n_sentences, n_labels))

# Sanity check: the row count of each matrix, the sentence count and the label
# count should agree for every training file.
print_embed_data(X_train_list, y_train_list, processed_data_list, train_indexes)

0: matrix shape (465, 100), # sents 465, len(y) = 465
1: matrix shape (173, 100), # sents 173, len(y) = 173
2: matrix shape (436, 100), # sents 436, len(y) = 436
3: matrix shape (260, 100), # sents 260, len(y) = 260
4: matrix shape (609, 100), # sents 609, len(y) = 609
5: matrix shape (557, 100), # sents 557, len(y) = 557
6: matrix shape (294, 100), # sents 294, len(y) = 294
7: matrix shape (1093, 100), # sents 1093, len(y) = 1093
8: matrix shape (78, 100), # sents 78, len(y) = 78
9: matrix shape (1538, 100), # sents 1538, len(y) = 1538
10: matrix shape (888, 100), # sents 888, len(y) = 888
11: matrix shape (237, 100), # sents 237, len(y) = 237
12: matrix shape (1133, 100), # sents 1133, len(y) = 1133
13: matrix shape (854, 100), # sents 854, len(y) = 854
14: matrix shape (921, 100), # sents 921, len(y) = 921
15: matrix shape (369, 100), # sents 369, len(y) = 369
16: matrix shape (600, 100), # sents 600, len(y) = 600
17: matrix shape (726, 100), # sents 726, len(y) = 726
18: matrix sha

In [18]:
# One embedding result per test file, in test_indexes order, produced by the
# model that was fitted on the training sentences.
X_test_list = create_sentences_embeddings(processed_data_list, test_indexes, ft)

In [19]:
# Same sanity check as for the training split, now for the test files.
print_embed_data(X_test_list, y_test_list, processed_data_list, test_indexes)

0: matrix shape (378, 100), # sents 378, len(y) = 378
1: matrix shape (304, 100), # sents 304, len(y) = 304
2: matrix shape (493, 100), # sents 493, len(y) = 493
3: matrix shape (610, 100), # sents 610, len(y) = 610
4: matrix shape (1928, 100), # sents 1928, len(y) = 1928
5: matrix shape (1005, 100), # sents 1005, len(y) = 1005
6: matrix shape (719, 100), # sents 719, len(y) = 719
7: matrix shape (852, 100), # sents 852, len(y) = 852
8: matrix shape (416, 100), # sents 416, len(y) = 416
9: matrix shape (852, 100), # sents 852, len(y) = 852


Saving data

In [20]:
import pickle
# Persist both splits as a single tuple in the current working directory.
# A loader must unpack it in this same order:
# (X_train_list, X_test_list, y_train_list, y_test_list).
with open("sent.pkl", 'wb') as f:
    pickle.dump( (X_train_list, X_test_list, y_train_list, y_test_list) , f)