In [8]:
import warnings 
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from cleantext import clean
import fasttext

### Uploading data

In [9]:
train = pd.read_csv('public_data/train_data.csv')
train_solution = pd.read_csv('public_data/train_solution.csv')
train['category'] = train_solution['category']
test = pd.read_csv('public_data/test_data.csv')
subm = pd.read_csv('public_data/sample_submission.csv')

Также загрузим дополнительные данные из телеграма (парсинг) и большой датасет из kaggle с текстами разной тематики.

In [10]:
ad = pd.read_csv('public_data/tg.csv').dropna().drop(columns=['Unnamed: 0'])
bloggers = pd.read_csv('public_data/blogtext.csv')

In [11]:
train_tg = pd.concat([train['message'], ad['message'], bloggers['text'].sample(100000)], ignore_index=True, axis=0)

### Getting word embeddings

Сохраним очищенные сообщения для обучения эмбеддингов.

In [13]:
from tqdm.auto import tqdm

with open('cleaned.txt', 'w') as f:
    for i in tqdm(train_tg):
        f.write(clean(i, no_urls=True, no_digits=True, no_punct=True, no_line_breaks=True, no_numbers=True,
             no_emoji=True))

  0%|          | 0/130374 [00:00<?, ?it/s]

Обучим fasttext на большом (относительно исходной выборки) корпусе текстов.

In [14]:
ft_model = fasttext.train_unsupervised('cleaned.txt', dim=100, verbose=2, maxn=3)

Read 20M words
Number of words:  65596
Number of labels: 0
Progress: 100.0% words/sec/thread:   48618 lr:  0.000000 avg.loss:  1.608138 ETA:   0h 0m 0s  0.9% words/sec/thread:   52692 lr:  0.049543 avg.loss:  1.903321 ETA:   0h 4m39s  4.1% words/sec/thread:   51995 lr:  0.047927 avg.loss:  1.542269 ETA:   0h 4m34s 63.5% words/sec/thread:   47894 lr:  0.018257 avg.loss:  1.952261 ETA:   0h 1m53s100.0% words/sec/thread:   48618 lr: -0.000003 avg.loss:  1.608138 ETA:   0h 0m 0s


Пара вспомогательных функций

In [28]:
sw = stopwords.words('english')

def my_tok_and_clean(x):
    a = word_tokenize(clean(x, no_urls=True, no_digits=True, no_punct=True, no_line_breaks=True, no_numbers=True,
             no_emoji=True))
    return a#[w for w in a if w not in sw]

In [29]:
train['cleaned'] = train['message'].apply(my_tok_and_clean)

In [30]:
from scipy.spatial.distance import cosine

my_future = (ft_model.get_word_vector('my') + ft_model.get_word_vector('future')) / 2
def get_similarity(x):
    l = len(x)
    vec = np.zeros(100)
    for i in x:
        vec += ft_model.get_word_vector(i)
    return 1 - cosine(vec / l, my_future)

In [31]:
train['similarity'] = train['cleaned'].apply(get_similarity)

In [32]:
train.sort_values(by=['similarity'], ascending=False)['message'][:20]

3460                                                  😏❗️
3496    (Changing this post of my mommy's dream of gra...
3        Let's continue😉. I present to you my new review 
4                         Here comes your future palette.
1621                           Is Macdonald's our future?
3785                        Clarification from my friend.
1391    In short, all my personal reasoning and my rea...
1818          Separately, I will note my favorite way of 
3475                      Our future in the end of tunnel
1316     In the meantime, it's my friend Natasha's anger.
3684    Now it's time to open the first map of my inte...
2942    I don't know about yours, and my plans for tom...
1513    I want to tell you from the bottom of my heart...
1717    Everything in this world is changing, except f...
653     In Moscow, this morning is as gray as your Sun...
449     Good afternoon, friends! Every day, during the...
1071    Let's think about how to do future classes, ne...
3394    To mak