In [2]:
import warnings 
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from cleantext import clean
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

from gensim.models import Word2Vec
import fasttext
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer

import keras
from keras import optimizers
from keras import backend as K
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D 
from keras.callbacks import EarlyStopping
from keras.wrappers.scikit_learn import KerasClassifier

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

### Uploading data

In [7]:
train = pd.read_csv('public_data/train_data.csv')
train_solution = pd.read_csv('public_data/train_solution.csv')
train['category'] = train_solution['category']
test = pd.read_csv('public_data/test_data.csv')
subm = pd.read_csv('public_data/sample_submission.csv')

In [3]:
import sys
import csv

csv.field_size_limit(sys.maxsize)

131072

Также загрузим дополнительные данные из телеграма (парсинг) и большой датасет из kaggle с текстами разной тематики.

In [10]:
ad = pd.read_csv('public_data/tg.csv').dropna().drop(columns=['Unnamed: 0'])
fashion = pd.read_csv('public_data/fashion.csv', engine='python', encoding='utf-8',
                       error_bad_lines=False)
bloggers = pd.read_csv('public_data/blogtext.csv', engine='python', encoding='utf-8',
                       error_bad_lines=False)

Skipping line 15711: field larger than field limit (131072)
Skipping line 16844: NULL byte detected. This byte cannot be processed in Python's native csv library at the moment, so please pass in engine='c' instead
Skipping line 19370: NULL byte detected. This byte cannot be processed in Python's native csv library at the moment, so please pass in engine='c' instead
Skipping line 31753: field larger than field limit (131072)
Skipping line 33676: NULL byte detected. This byte cannot be processed in Python's native csv library at the moment, so please pass in engine='c' instead
Skipping line 65976: NULL byte detected. This byte cannot be processed in Python's native csv library at the moment, so please pass in engine='c' instead
Skipping line 116130: NULL byte detected. This byte cannot be processed in Python's native csv library at the moment, so please pass in engine='c' instead
Skipping line 127080: NULL byte detected. This byte cannot be processed in Python's native csv library at the

In [11]:
train_tg = pd.concat([train, ad, fashion], ignore_index=True, axis=0)
y_train_tg = train_tg.category

Для получения более качественных эмбеддингов возьмем 50000 случайных объектов из большого датасета kaggle и добавим к основной выборке.

In [6]:
for_embs = pd.concat([train['message'], ad['message'], fashion['message'],
                      bloggers['text'].sample(50000)], ignore_index=True, axis=0)

In [7]:
for_embs.shape

(105365,)

In [8]:
train_tg.shape

(55365, 4)

Посмотрим на баланс классов

In [9]:
train_tg.category.value_counts() / train_tg.shape[0]

2.0    0.491863
0.0    0.334634
1.0    0.173503
Name: category, dtype: float64

### Text preprocessing

Предобработаем данные и подготовим их для входа в нейронную сеть.

In [10]:
def my_tok_and_clean(x):
    a = clean(x, no_urls=True, no_digits=True, no_punct=True, no_line_breaks=True, no_numbers=True,
             no_emoji=True, no_currency_symbols=True)
    return a

In [11]:
train_tg['cleaned'] = train_tg['message'].apply(my_tok_and_clean)
test['cleaned'] = test['message'].apply(my_tok_and_clean)

In [12]:
train_tg['cleaned']

0        over <cur><number> million in bitcoin was elec...
1                                  quiz thursday or friday
2        the australian revenue authority will start co...
3             lets continue i present to you my new review
4                           here comes your future palette
                               ...                        
55360    morgan luke christensen i wonder how you are d...
55361    a friend of mine came with tears i guess i kno...
55362    yuck in aint no fun going to da office when yo...
55363    two of my friends are going to get married to ...
55364    woooo havent blog for awhile its already thurs...
Name: cleaned, Length: 55365, dtype: object

Будем брать не очень длинные последовательности.

In [13]:
lens = train_tg['cleaned'].append(test['cleaned']).apply(len)
t = np.sort(train_tg['cleaned'].append(test['cleaned']).apply(len))[::-1]
t[:15]

array([277119,  90139,  53984,  38015,  31684,  30904,  30656,  30450,
        30259,  28421,  28213,  26240,  25260,  25234,  23257])

In [14]:
MAX_NB_WORDS = 100000
idx = lens <= t[15]
idx1 = train_tg['cleaned'].apply(len) <= t[15]
idx2 = test['message'].apply(len) <= t[15]
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=False, char_level=False)
tokenizer.fit_on_texts(train_tg['cleaned'].append(test['cleaned'])[idx]) 
word_seq_train = tokenizer.texts_to_sequences(train_tg['cleaned'][idx1])
word_seq_test = tokenizer.texts_to_sequences(test['cleaned'][idx2])
word_index = tokenizer.word_index

In [15]:
from keras_preprocessing.sequence import pad_sequences

word_seq_train = pad_sequences(word_seq_train, maxlen=t[15])
word_seq_test = pad_sequences(word_seq_test, maxlen=t[15])

### Getting word embeddings

Сохраним очищенные сообщения для обучения эмбеддингов.

In [16]:
from tqdm.auto import tqdm

with open('cleaned.txt', 'w') as f:
    for i in tqdm(for_embs):
        f.write(clean(i, no_urls=True, no_digits=True, no_punct=True, no_line_breaks=True, no_numbers=True,
             no_emoji=True))

  0%|          | 0/105365 [00:00<?, ?it/s]

Обучим fasttext

In [17]:
ft_model = fasttext.train_unsupervised('cleaned.txt', dim=100, verbose=2, maxn=3)

Для 2-ого задания в отчете посмотрим на ближайших соседей My future

In [26]:
ft_model.get_nearest_neighbors('my future', k=30)

[(0.8529556393623352, 'futurist'),
 (0.8383951783180237, 'future'),
 (0.8339837789535522, 'futureand'),
 (0.8027969002723694, 'futuristic'),
 (0.8005754351615906, 'futures'),
 (0.7977690100669861, 'futurei'),
 (0.7969520092010498, 'futuresentry'),
 (0.7375408411026001, 'alternate'),
 (0.7328957319259644, 'comiccon'),
 (0.720456063747406, 'counterintuitive'),
 (0.7204479575157166, 'tradingview'),
 (0.7160143256187439, 'portrays'),
 (0.7157045602798462, 'comicbook'),
 (0.7133139371871948, 'fabrication'),
 (0.7107023000717163, 'exposition'),
 (0.7091489434242249, 'promising'),
 (0.7090098857879639, 'confinement'),
 (0.7087546586990356, 'traditionalist'),
 (0.7059422135353088, 'portray'),
 (0.7045783400535583, 'fabricate'),
 (0.7043193578720093, 'premise'),
 (0.7042611241340637, 'eventual'),
 (0.7035579085350037, 'futureurllink'),
 (0.7021069526672363, 'portraits'),
 (0.7017072439193726, 'ciphertrace'),
 (0.700780987739563, 'comical'),
 (0.7006242871284485, 'ventures'),
 (0.698713064193725

In [19]:
print('preparing embedding matrix...')
words_not_found = []
nb_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((nb_words, 100))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = ft_model.get_word_vector(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

preparing embedding matrix...
number of null word embeddings: 8


In [20]:
batch_size = 256 
num_epochs = 8

num_filters = 64 
weight_decay = 1e-4

In [21]:
def baseline_model():
    model = Sequential()
    model.add(Embedding(nb_words, 100,
              weights=[embedding_matrix], input_length=word_seq_train.shape[1], trainable=False))
    model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
    model.add(MaxPooling1D(2))
    model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
    model.add(GlobalMaxPooling1D())
    model.add(Dropout(0.5))
    model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)))
    model.add(Dense(3, activation='softmax'))  

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [22]:
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1)
callbacks_list = [early_stopping]

In [23]:
nn_clf = KerasClassifier(build_fn=baseline_model, epochs=num_epochs, batch_size=batch_size, 
                          callbacks=callbacks_list, validation_split=0.1, shuffle=True, verbose=1)
nn_clf.fit(word_seq_train, y_train_tg)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f0743fc8dd0>

In [24]:
result = subm
result['category'] = nn_clf.predict(word_seq_test)
result.to_csv('my_submission.csv')

