In [1]:
import pymorphy2
import pandas as pd
from ast import literal_eval
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
# Load the pre-tokenised, pre-vectorised dataset. Column names are supplied
# explicitly (so the file is read as header-less) and 'id' becomes the index.
df = pd.read_csv(
    './data/input.tsv',
    sep='\t',
    names=['id', 'category', 'tokens', 'vector'],
    index_col=0,
    # Both list-valued columns are parsed from their Python-literal text form.
    converters={'tokens': literal_eval, 'vector': literal_eval},
)
df.head()

Unnamed: 0_level_0,category,tokens,vector
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2673515,0,"[вже, більше, місяця, під, під’їздом, будинку,...","[0.18560213, -0.3210541, -0.1666596, -0.774903..."
3301972,0,"[сміттєві, баки, знаходятся, меньше, ніж, 10, ...","[0.36674336, 0.012066948, -0.53846943, -0.2465..."
2810263,0,"[сміттєві, контейнери, постійно, знаходяться, ...","[-0.15331537, -0.8035142, -0.059553355, -0.806..."
2828943,0,"[станом, на, 26.07.2014, сміттєві, контейнери,...","[0.38238603, -0.35925356, -0.5738647, -0.57135..."
2865598,0,"[вопрос, мусорных, баков, не, решен, !!!, поче...","[0.38771498, 0.20435554, -0.58337885, -0.13807..."


In [3]:
# Dimensions of the loaded frame; 'id' is the index, so it is not counted as a column.
df.shape

(61760, 3)

### KNN with cosine distance

In [None]:
# Hold out a test split of the precomputed document vectors (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    list(df['vector']),
    list(df['category']),
    shuffle=True,
    random_state=1,
)

# 1-nearest-neighbour under cosine distance: every test vector takes the
# label of its single closest training vector.
neighbours = NearestNeighbors(metric='cosine', n_neighbors=1).fit(X_train)
y_pred = [
    y_train[nearest]
    for nearest in neighbours.kneighbors(X_test, return_distance=False)[:, 0]
]
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.24      0.38      0.30        60
           1       0.33      0.12      0.18        32
           2       0.40      0.39      0.40       168
           3       0.56      0.42      0.48       230
           4       0.46      0.32      0.37        19
           5       0.41      0.29      0.34        42
           6       0.19      0.14      0.16        29
           7       0.25      0.32      0.28        31
           8       0.29      0.34      0.31       191
           9       0.24      0.24      0.24        38
          10       0.27      0.20      0.23        20
          11       0.38      0.40      0.39       324
          12       0.14      0.19      0.16        21
          13       0.13      0.38      0.19         8
          14       0.51      0.43      0.47       215
          15       0.32      0.32      0.32        19
          16       0.36      0.20      0.26        45
          17       0.27    

### KNN + stopwords

In [4]:
# Project-local helper used below to rebuild a document vector from a token list.
# NOTE(review): get_vector is defined in prepare_data, which is not shown here —
# confirm its exact contract there.
from prepare_data import get_vector

# Work on a copy so the originally loaded frame `df` stays untouched.
df_1 = df.copy()
# Hand-written set of Ukrainian function words (pronouns, prepositions, particles, ...).
# NOTE(review): the sample rows displayed earlier also contain Russian text, which
# this list does not cover — confirm whether that is intended.
stopwords = set(["а","або","б","би","бо","був","буде","була","були","було","бути","в","вам","вами","вас","ваш","ваша","ваше","вашим","вашими","ваших","ваші","вашій","вашого","вашої","вашому","вашою","вашу","вже","ви","від","він","вона","вони","воно","всі","де","для","до","дуже","є","з","за","зі","і","із","її","їй","їм","їх","й","його","йому","ким","кого","коли","кому","лише","має","мене","мені","ми","мій","мною","мого","моє","моєї","моєму","моєю","можна","мої","моїй","моїм","моїми","моїх","мою","моя","на","нам","нами","нас","наш","наша","наше","нашим","нашими","наших","наші","нашій","нашого","нашої","нашому","нашою","нашу","неї","нею","ним","ними","них","ній","нім","ну","нього","ньому","під","після","по","при","про","саме","себе","собі","та","так","також","там","твій","твого","твоє","твоєї","твоєму","твоєю","твої","твоїй","твоїм","твоїми","твоїх","твою","твоя","те","тебе","ти","тим","тими","тих","ті","тієї","тією","тій","тільки","тім","то","тобі","тобою","того","тоді","той","тому","ту","тут","у","хто","це","цей","ці","цього","цьому","через","чи","чиє","чиєї","чиєму","чиї","чиїй","чиїм","чиїми","чиїх","чий","чийого","чийому","чим","чию","чия","чого","чому","що","щоб","щодо","щось","я","як","яка","який","які","якщо"])

def _without_stopwords(tokens):
    """Return the tokens that are non-empty and not in `stopwords`, order preserved."""
    return [word for word in tokens if word and word not in stopwords]


# Filter the token lists, then rebuild each document vector from what is left.
df_1['tokens'] = df_1['tokens'].apply(_without_stopwords)
df_1['vector'] = df_1['tokens'].apply(get_vector)

# Same split parameters as the baseline so the reports are comparable.
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    list(df_1['vector']),
    list(df_1['category']),
    shuffle=True,
    random_state=1,
)

# 1-nearest-neighbour under cosine distance on the stopword-filtered vectors.
neighbours = NearestNeighbors(metric='cosine', n_neighbors=1).fit(X_train_1)
y_pred_1 = [
    y_train_1[nearest]
    for nearest in neighbours.kneighbors(X_test_1, return_distance=False)[:, 0]
]
print(classification_report(y_test_1, y_pred_1))

              precision    recall  f1-score   support

           0       0.41      0.43      0.42        60
           1       0.25      0.09      0.14        32
           2       0.42      0.39      0.40       168
           3       0.61      0.47      0.53       230
           4       0.42      0.26      0.32        19
           5       0.52      0.31      0.39        42
           6       0.47      0.31      0.38        29
           7       0.24      0.32      0.28        31
           8       0.32      0.35      0.33       191
           9       0.33      0.18      0.24        38
          10       0.44      0.20      0.28        20
          11       0.37      0.39      0.38       324
          12       0.10      0.19      0.13        21
          13       0.12      0.25      0.17         8
          14       0.49      0.47      0.48       215
          15       0.28      0.26      0.27        19
          16       0.38      0.24      0.30        45
          17       0.33    

### KNN + stopwords + lemma

In [12]:
# pymorphy2 morphological analyzer configured for Ukrainian ('uk');
# `lemmatize` below calls its `parse` method.
morph = pymorphy2.MorphAnalyzer(lang='uk')

def lemmatize(tokens):
    """Map every non-empty token to the normal form of its first `morph.parse` result.

    Falsy tokens (such as empty strings) are dropped; input order is preserved.
    """
    lemmas = []
    for token in tokens:
        if not token:
            continue
        # Only the first parse returned by the analyzer is used.
        first_parse = morph.parse(token)[0]
        lemmas.append(first_parse.normal_form)
    return lemmas

# Start from the stopword-filtered frame and replace every token with its lemma.
df_2 = df_1.copy()
df_2['tokens'] = df_2['tokens'].apply(lemmatize)

# Rebuild the document vectors from the lemmas.
# NOTE(review): the meaning of the second positional argument (True) is defined
# in prepare_data.get_vector, which is not shown here — confirm.
df_2['vector'] = df_2['tokens'].apply(lambda lemmas: get_vector(lemmas, True))

# Same split parameters as the earlier experiments so the reports are comparable.
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    list(df_2['vector']),
    list(df_2['category']),
    shuffle=True,
    random_state=1,
)

# 1-nearest-neighbour under cosine distance on the lemma-based vectors.
neighbours = NearestNeighbors(metric='cosine', n_neighbors=1).fit(X_train_2)
y_pred_2 = [
    y_train_2[nearest]
    for nearest in neighbours.kneighbors(X_test_2, return_distance=False)[:, 0]
]
print(classification_report(y_test_2, y_pred_2))

              precision    recall  f1-score   support

           0       0.40      0.45      0.43        60
           1       0.27      0.12      0.17        32
           2       0.38      0.45      0.41       168
           3       0.62      0.47      0.54       230
           4       0.62      0.26      0.37        19
           5       0.48      0.33      0.39        42
           6       0.29      0.24      0.26        29
           7       0.19      0.35      0.25        31
           8       0.29      0.39      0.33       191
           9       0.27      0.24      0.25        38
          10       0.42      0.25      0.31        20
          11       0.40      0.38      0.39       324
          12       0.22      0.38      0.28        21
          13       0.10      0.25      0.14         8
          14       0.53      0.47      0.50       215
          15       0.29      0.26      0.28        19
          16       0.34      0.24      0.29        45
          17       0.12    

### Doc2Vec

In [2]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [3]:
def tag_doc(row):
    """Wrap one DataFrame row as a gensim ``TaggedDocument``.

    Parameters
    ----------
    row : pandas.Series
        A row whose first value is the document's token list. ``row.name``
        (the row's index label) identifies the document.

    Returns
    -------
    TaggedDocument
        ``words`` is the row's first value; ``tags`` is a one-element list
        holding the index label converted to ``str``.
    """
    words = row.values.tolist()[0]
    # Doc2Vec treats integer tags as direct positions in its doc-vector array
    # and allocates max(tag) + 1 vectors. Large, sparse integer ids would
    # therefore reserve millions of unused vectors; string tags are kept in a
    # lookup table instead, one vector per distinct tag.
    return TaggedDocument(words=words, tags=[str(row.name)])

In [None]:
# Persist the lemmatised frame as TSV; the next cell reads this file back.
df_2.to_csv('./data/input2.tsv', sep='\t')

In [4]:
# Read the lemmatised frame back from disk, parsing the list-valued columns
# from their Python-literal text form.
df_2 = pd.read_csv(
    './data/input2.tsv',
    sep='\t',
    index_col=0,
    converters={'tokens': literal_eval, 'vector': literal_eval},
)

# One tagged document per row; tag_doc receives a row holding only 'tokens'.
df_2['docs'] = df_2[['tokens']].apply(tag_doc, axis=1)

# Same split parameters as the KNN experiments above.
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(
    list(df_2['docs']),
    list(df_2['category']),
    shuffle=True,
    random_state=1,
)

In [38]:
# Train a 300-dimensional Doc2Vec model for 150 epochs.
# NOTE(review): per the gensim docs a fixed `seed` only gives a fully
# reproducible run with a single worker thread; with workers=8 results can
# vary between runs.
model = Doc2Vec(vector_size=300, window=10, min_count=5, workers=8, epochs=150, seed=1)

# NOTE(review): the vocabulary is built from train AND test documents, so test
# word frequencies influence `min_count` filtering — confirm this is intended.
model.build_vocab(X_train_3 + X_test_3)

# Only the training documents are passed to train(), so `total_examples` must
# be their count. `model.corpus_count` would also include the test documents
# seen by build_vocab and skew progress / learning-rate scheduling.
# `model.epochs` replaces the deprecated `model.iter` alias.
model.train(X_train_3, total_examples=len(X_train_3), epochs=model.epochs)

# Drop training-only state but keep what is needed for doc-vector lookup and
# for infer_vector(), then persist the model.
model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
model.save('./data/d2v.model')

  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
# Reload the Doc2Vec model saved by the training cell, so the cells below can
# run without retraining.
model = Doc2Vec.load('./data/d2v.model')

In [None]:
# Embed every document with the loaded model via infer_vector().
# The test documents are processed first and the training documents second;
# that order is kept deliberately.
test_infer_vectors = [
    model.infer_vector(tagged.words)
    for tagged in X_test_3
]
train_infer_vectors = [
    model.infer_vector(tagged.words)
    for tagged in X_train_3
]

In [None]:
# 1-nearest-neighbour under cosine distance on the inferred Doc2Vec vectors:
# every test document takes the label of its closest training document.
neighbours = NearestNeighbors(metric='cosine', n_neighbors=1).fit(train_infer_vectors)
y_pred_3 = [
    y_train_3[nearest]
    for nearest in neighbours.kneighbors(test_infer_vectors, return_distance=False)[:, 0]
]
print(classification_report(y_test_3, y_pred_3))