In [None]:
# Задание 1.

In [1]:
import warnings
# NOTE(review): this silences *every* warning for the rest of the notebook,
# which presumably also hides scikit-learn convergence / unused-parameter
# warnings raised by the pipelines below — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [2]:
import gensim
import pandas as pd
import numpy as np
from pymorphy3 import MorphAnalyzer
import pyLDAvis.gensim_models
from collections import Counter
from string import punctuation
from razdel import tokenize as razdel_tokenize
from IPython.display import Image
from IPython.core.display import HTML 
from sklearn.decomposition import TruncatedSVD, NMF, PCA, LatentDirichletAllocation
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import KFold, StratifiedKFold
from matplotlib import pyplot as plt
import seaborn as sns

In [3]:
# One analyzer instance for the whole notebook; normalize() looks it up as a global.
morph = MorphAnalyzer()

In [4]:
def normalize(text):
    """Tokenize ``text`` and return its lemmas as one space-separated string.

    Every razdel token is stripped of leading/trailing ASCII punctuation.
    Tokens that end up empty, or that are 20 or more characters long, are
    dropped. The remaining tokens are lower-cased and replaced by the normal
    form of their first pymorphy parse.
    """
    lemmas = []
    for token in razdel_tokenize(text):
        stripped = token.text.strip(punctuation)
        # Skip pure-punctuation tokens and very long strings.
        if not stripped or len(stripped) >= 20:
            continue
        best_parse = morph.parse(stripped.lower())[0]
        lemmas.append(best_parse.normal_form)
    return ' '.join(lemmas)

In [5]:
# Load the dataset; later cells read its 'description' and 'category_name' columns.
data = pd.read_csv('avito_category_classification.csv')

In [6]:
# Lemmatize every description once up front, so the pipelines below can
# tokenize the result with a plain whitespace split.
data['description_norm'] = data['description'].apply(normalize)

In [7]:
def eval_table(X, y, pipeline, N=6, random_state=None):
    """Cross-validate ``pipeline`` and return its rounded class-averaged F1.

    NOTE: a later cell of this notebook defines ``eval_table`` again; that
    definition shadows this one and returns the unrounded mean instead.

    Parameters
    ----------
    X : indexable collection of documents (pandas Series / array).
    y : indexable collection of class labels aligned with ``X``.
    pipeline : estimator exposing ``fit`` and ``predict``; it is refitted
        in place on the training part of every fold.
    N : int, number of stratified folds.
    random_state : seed for the fold shuffling. ``None`` (default) keeps the
        splits random; pass an int to get the same folds on every run.

    Returns
    -------
    float
        Per-class F1 is averaged over the folds and rounded to 2 decimals;
        the mean of those per-class values, rounded to 2 decimals again.
    """
    # The fold indices are *positions*. Indexing a pandas object with them
    # directly is a label lookup, which is only correct for a default
    # RangeIndex, so go through .iloc whenever the input provides it.
    X_rows = getattr(X, 'iloc', X)
    y_rows = getattr(y, 'iloc', y)

    labels = sorted(set(y))  # sorted -> stable row order between runs
    kfold = StratifiedKFold(n_splits=N, shuffle=True, random_state=random_state)

    fold_scores = {}
    for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
        pipeline.fit(X_rows[train_index], y_rows[train_index])
        preds = pipeline.predict(X_rows[test_index])
        fold_scores[f'f1_{i}'] = f1_score(
            y_rows[test_index], preds, labels=labels, average=None
        )

    fold_metrics = pd.DataFrame(fold_scores, index=labels)
    per_label_f1 = fold_metrics.mean(axis=1).round(2)
    return np.round(per_label_f1.mean(), 2)

In [8]:
# SGDClassifier on three reduced feature spaces (NMF / SVD / LDA).
#
# * random_state is pinned on every stochastic step so the pipelines give the
#   same result on every run.
# * tokenizer=str.split is the picklable equivalent of `lambda x: x.split()`.
# * token_pattern=None states explicitly that the default regex is unused
#   (scikit-learn warns when both a tokenizer and a token_pattern are set).

# NMF (on TF-IDF weights)
pipeline_SGD_nmf = Pipeline([
    ('bow', CountVectorizer(tokenizer=str.split, token_pattern=None, min_df=2, max_df=0.5)),
    ('tfidf', TfidfTransformer()),
    ('decomposition', NMF(n_components=100, random_state=42)),
    ('clf', SGDClassifier(alpha=1e-2, max_iter=1000, random_state=42)),
])

# SVD (on raw counts)
pipeline_SGD_svd = Pipeline([
    ('bow', CountVectorizer(tokenizer=str.split, token_pattern=None, min_df=5, max_df=0.4)),
    ('svd', TruncatedSVD(n_components=500, random_state=42)),
    ('clf', SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)),
])

# LDA (on raw counts)
pipeline_SGD_lda = Pipeline([
    ('bow', CountVectorizer(tokenizer=str.split, token_pattern=None, min_df=4, max_df=0.3)),
    ('lda', LatentDirichletAllocation(n_components=50, random_state=42)),
    ('clf', SGDClassifier(max_iter=1500, tol=1e-3, random_state=42)),
])

In [9]:
# KNeighborsClassifier on three reduced feature spaces (NMF / SVD / LDA).
#
# * random_state is pinned on the decomposition steps so the features are the
#   same on every run (the k-NN classifier itself takes no seed).
# * tokenizer=str.split is the picklable equivalent of `lambda x: x.split()`.
# * token_pattern=None states explicitly that the default regex is unused
#   (scikit-learn warns when both a tokenizer and a token_pattern are set).

# NMF (on TF-IDF weights)
pipeline_KN_nmf = Pipeline([
    ('bow', CountVectorizer(tokenizer=str.split, token_pattern=None, min_df=2, max_df=0.5)),
    ('tfidf', TfidfTransformer()),
    ('decomposition', NMF(n_components=100, random_state=42)),
    ('clf', KNeighborsClassifier(n_neighbors=5)),
])

# SVD (on raw counts)
pipeline_KN_svd = Pipeline([
    ('bow', CountVectorizer(tokenizer=str.split, token_pattern=None, min_df=5, max_df=0.4)),
    ('svd', TruncatedSVD(n_components=500, random_state=42)),
    ('clf', KNeighborsClassifier(n_neighbors=3)),
])

# LDA (on raw counts)
pipeline_KN_lda = Pipeline([
    ('bow', CountVectorizer(tokenizer=str.split, token_pattern=None, min_df=4, max_df=0.3)),
    ('lda', LatentDirichletAllocation(n_components=50, random_state=42)),
    ('clf', KNeighborsClassifier(n_neighbors=3)),
])

In [10]:
# RandomForestClassifier on three reduced feature spaces (NMF / SVD / LDA).
#
# * random_state is pinned on every stochastic step so the pipelines give the
#   same result on every run.
# * tokenizer=str.split is the picklable equivalent of `lambda x: x.split()`.
# * token_pattern=None states explicitly that the default regex is unused
#   (scikit-learn warns when both a tokenizer and a token_pattern are set).

# NMF (on TF-IDF weights)
pipeline_RF_nmf = Pipeline([
    ('bow', CountVectorizer(tokenizer=str.split, token_pattern=None, min_df=2, max_df=0.5)),
    ('tfidf', TfidfTransformer()),
    ('decomposition', NMF(n_components=100, random_state=42)),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# SVD (on raw counts)
pipeline_RF_svd = Pipeline([
    ('bow', CountVectorizer(tokenizer=str.split, token_pattern=None, min_df=5, max_df=0.4)),
    ('svd', TruncatedSVD(n_components=500, random_state=42)),
    ('clf', RandomForestClassifier(n_estimators=150, random_state=42)),
])

# LDA (on raw counts)
pipeline_RF_lda = Pipeline([
    ('bow', CountVectorizer(tokenizer=str.split, token_pattern=None, min_df=4, max_df=0.3)),
    ('lda', LatentDirichletAllocation(n_components=50, random_state=42)),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42)),
])

In [11]:
# ExtraTreesClassifier on three reduced feature spaces (NMF / SVD / LDA).
#
# * random_state is pinned on every stochastic step so the pipelines give the
#   same result on every run.
# * tokenizer=str.split is the picklable equivalent of `lambda x: x.split()`.
# * token_pattern=None states explicitly that the default regex is unused
#   (scikit-learn warns when both a tokenizer and a token_pattern are set).

# NMF (on TF-IDF weights)
pipeline_ET_nmf = Pipeline([
    ('bow', CountVectorizer(tokenizer=str.split, token_pattern=None, min_df=3, max_df=0.3)),
    ('tfidf', TfidfTransformer()),
    ('decomposition', NMF(n_components=100, random_state=42)),
    ('clf', ExtraTreesClassifier(n_estimators=100, random_state=42)),
])

# SVD (on raw counts)
pipeline_ET_svd = Pipeline([
    ('bow', CountVectorizer(tokenizer=str.split, token_pattern=None, min_df=5, max_df=0.4)),
    ('svd', TruncatedSVD(n_components=500, random_state=42)),
    ('clf', ExtraTreesClassifier(n_estimators=150, random_state=42)),
])

# LDA (on raw counts)
pipeline_ET_lda = Pipeline([
    ('bow', CountVectorizer(tokenizer=str.split, token_pattern=None, min_df=4, max_df=0.3)),
    ('lda', LatentDirichletAllocation(n_components=50, random_state=42)),
    ('clf', ExtraTreesClassifier(n_estimators=100, random_state=42)),
])

In [12]:
def eval_table(X, y, pipeline, N=6, random_state=None):
    """Cross-validate ``pipeline`` and return its class-averaged F1 score.

    Parameters
    ----------
    X : indexable collection of documents (pandas Series / array).
    y : indexable collection of class labels aligned with ``X``.
    pipeline : estimator exposing ``fit`` and ``predict``; it is refitted
        in place on the training part of every fold.
    N : int, number of stratified folds.
    random_state : seed for the fold shuffling. ``None`` (default) keeps the
        splits random; pass an int to get the same folds on every run.

    Returns
    -------
    float
        F1 is computed per class on every fold, averaged over the folds for
        each class, and then averaged over the classes (no rounding).
    """
    # The fold indices are *positions*. Indexing a pandas object with them
    # directly is a label lookup, which is only correct for a default
    # RangeIndex, so go through .iloc whenever the input provides it.
    X_rows = getattr(X, 'iloc', X)
    y_rows = getattr(y, 'iloc', y)

    labels = sorted(set(y))  # sorted -> stable row order between runs
    kfold = StratifiedKFold(n_splits=N, shuffle=True, random_state=random_state)

    fold_scores = {}
    for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
        pipeline.fit(X_rows[train_index], y_rows[train_index])
        preds = pipeline.predict(X_rows[test_index])
        fold_scores[f'f1_{i}'] = f1_score(
            y_rows[test_index], preds, labels=labels, average=None
        )

    fold_metrics = pd.DataFrame(fold_scores, index=labels)
    return fold_metrics.mean(axis=1).mean()

In [13]:
# Score the four classifiers on NMF features (evaluated in the listed order).
SGD_nmf, KN_nmf, RF_nmf, ET_nmf = [
    eval_table(data['description_norm'], data['category_name'], pipe)
    for pipe in (pipeline_SGD_nmf, pipeline_KN_nmf, pipeline_RF_nmf, pipeline_ET_nmf)
]

In [15]:
# Score the four classifiers on SVD features (evaluated in the listed order).
SGD_svd, KN_svd, RF_svd, ET_svd = [
    eval_table(data['description_norm'], data['category_name'], pipe)
    for pipe in (pipeline_SGD_svd, pipeline_KN_svd, pipeline_RF_svd, pipeline_ET_svd)
]

In [16]:
# Score the four classifiers on LDA features (evaluated in the listed order).
SGD_lda, KN_lda, RF_lda, ET_lda = [
    eval_table(data['description_norm'], data['category_name'], pipe)
    for pipe in (pipeline_SGD_lda, pipeline_KN_lda, pipeline_RF_lda, pipeline_ET_lda)
]

In [32]:
# One single-column table per decomposition method; each row is a classifier.
res_NMF = dict(SGD_nmf=SGD_nmf, KN_nmf=KN_nmf, RF_nmf=RF_nmf, ET_nmf=ET_nmf)
res_SVD = dict(SGD_svd=SGD_svd, KN_svd=KN_svd, RF_svd=RF_svd, ET_svd=ET_svd)
res_LDA = dict(SGD_lda=SGD_lda, KN_lda=KN_lda, RF_lda=RF_lda, ET_lda=ET_lda)

NMF_DF = pd.DataFrame.from_dict(res_NMF, orient='index', columns=['F1-mean'])
SVD_DF = pd.DataFrame.from_dict(res_SVD, orient='index', columns=['F1-mean'])
LDA_DF = pd.DataFrame.from_dict(res_LDA, orient='index', columns=['F1-mean'])

In [33]:
# Combined table: merge the per-method score dicts, keeping NMF, SVD, LDA order.
results_dict = {**res_NMF, **res_SVD, **res_LDA}
results = pd.DataFrame.from_dict(
    results_dict,
    orient='index',
    columns=['F1-mean'],
)

In [34]:
# Show the combined score table (one row per classifier / decomposition pair).
results

Unnamed: 0,F1-mean
SGD_nmf,0.406065
KN_nmf,0.476453
RF_nmf,0.697971
ET_nmf,0.706481
SGD_svd,0.733744
KN_svd,0.452269
RF_svd,0.507278
ET_svd,0.416894
SGD_lda,0.56509
KN_lda,0.529321


In [35]:
# Rank the configurations from lowest to highest mean F1.
results.sort_values(by='F1-mean', ascending=True)

Unnamed: 0,F1-mean
SGD_nmf,0.406065
ET_svd,0.416894
KN_svd,0.452269
KN_nmf,0.476453
RF_svd,0.507278
KN_lda,0.529321
SGD_lda,0.56509
RF_lda,0.587621
ET_lda,0.591885
RF_nmf,0.697971
