In [1]:
import os
import re
import pymorphy2
import pandas as pd
from rake_nltk import Rake
from sklearn import metrics
from sklearn.pipeline import Pipeline

import numpy as np

In [2]:
def process_corpus(dir_path: str, eval: str):
    results = []
    morph = pymorphy2.MorphAnalyzer()
    r_model = Rake(language='russian')
    for item in os.listdir(dir_path):
        path = dir_path + '/' + item
        if os.path.isfile(path):
            with open(path, encoding='utf-8') as f:
                text = f.read()
                text = re.split('<\d+>', text)
                for instance in text:
                    words = instance.split(' ')
                    lemmatized_text = [morph.parse(word)[0].normal_form for word in words]
                    lemmatized_text = ' '.join(lemmatized_text)
                    r_model.extract_keywords_from_text(instance)
                    raked_text = ' '.join(r_model.get_ranked_phrases())
                    results.append((instance, lemmatized_text, raked_text, eval))
    return results

In [3]:
def get_data_news(rel_path: str):
    path = os.path.abspath('') + rel_path
    for item in os.listdir(path):
        item_path = path + '/' + item
        if os.path.isdir(item_path):
            results = process_corpus(item_path, item)
            df = pd.DataFrame(results)
            df.to_csv('news_processed.csv', mode='a', index=False, header=False)

In [4]:
get_data_news('/corpus_news')

In [5]:
df = pd.read_csv('news_processed.csv', names=['Raw', 'Lemmatized', 'Rake', 'Evaluation'])

#df.head(-5)

In [6]:
evals = df['Evaluation']

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

raw_text = df['Raw']
lemmatized_text = df['Lemmatized']
raked_text = df['Rake']
evals = df['Evaluation']

raw_train, raw_test, raw_target_train, raw_target_test = train_test_split(raw_text, evals, train_size=0.5, random_state=45)
lemmatized_train, lemmatized_test, lemmatized_target_train, lemmatized_target_test = train_test_split(lemmatized_text, evals, train_size=0.5, random_state=45)
raked_train, raked_test, raked_target_train, raked_target_test = train_test_split(raked_text, evals, train_size=0.5, random_state=45)

#print(raw_train)

In [8]:
from sklearn.ensemble import GradientBoostingClassifier

clf = Pipeline([('vect', CountVectorizer()),
                ('tfdif', TfidfTransformer()),
                ('clf', GradientBoostingClassifier())])

clf.fit(raw_train, raw_target_train)

gbc_predicted = clf.predict(raw_test)

print(metrics.classification_report(raw_target_test, gbc_predicted))

              precision    recall  f1-score   support

     culture       0.99      1.00      1.00       831
     hi-tech       1.00      1.00      1.00      1233
    politics       1.00      1.00      1.00      2154
     science       1.00      0.99      1.00      1052

    accuracy                           1.00      5270
   macro avg       1.00      1.00      1.00      5270
weighted avg       1.00      1.00      1.00      5270



In [9]:
from sklearn.ensemble import RandomForestClassifier

clf = Pipeline([('vect', CountVectorizer()),
                ('tfdif', TfidfTransformer()),
                ('clf', RandomForestClassifier())])

clf.fit(raw_train, raw_target_train)

rfc_predicted = clf.predict(raw_test)

print(metrics.classification_report(raw_target_test, rfc_predicted))


              precision    recall  f1-score   support

     culture       0.99      1.00      1.00       831
     hi-tech       1.00      1.00      1.00      1233
    politics       1.00      1.00      1.00      2154
     science       1.00      0.99      1.00      1052

    accuracy                           1.00      5270
   macro avg       1.00      1.00      1.00      5270
weighted avg       1.00      1.00      1.00      5270



In [10]:
from sklearn.ensemble import ExtraTreesClassifier

clf = Pipeline([('vect', CountVectorizer()),
                ('tfdif', TfidfTransformer()),
                ('clf', ExtraTreesClassifier())])

clf.fit(raw_train, raw_target_train)

efc_predicted = clf.predict(raw_test)

print(metrics.classification_report(raw_target_test, efc_predicted))


              precision    recall  f1-score   support

     culture       0.99      1.00      1.00       831
     hi-tech       1.00      1.00      1.00      1233
    politics       1.00      1.00      1.00      2154
     science       1.00      0.99      1.00      1052

    accuracy                           1.00      5270
   macro avg       1.00      1.00      1.00      5270
weighted avg       1.00      1.00      1.00      5270



In [11]:
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier

clf = Pipeline([('vect', CountVectorizer()),
                ('tfdif', TfidfTransformer()),
                ('clf', BaggingClassifier(estimator=SVC()))])

clf.fit(raw_train, raw_target_train)

bc_predicted = clf.predict(raw_test)

print(metrics.classification_report(raw_target_test, bc_predicted))

              precision    recall  f1-score   support

     culture       0.99      1.00      1.00       831
     hi-tech       1.00      1.00      1.00      1233
    politics       1.00      1.00      1.00      2154
     science       1.00      0.99      1.00      1052

    accuracy                           1.00      5270
   macro avg       1.00      1.00      1.00      5270
weighted avg       1.00      1.00      1.00      5270



In [12]:
from sklearn.ensemble import AdaBoostClassifier

clf = Pipeline([('vect', CountVectorizer()),
                ('tfdif', TfidfTransformer()),
                ('clf', AdaBoostClassifier())])

clf.fit(raw_train, raw_target_train)

abc_predicted = clf.predict(raw_test)

print(metrics.classification_report(raw_target_test, abc_predicted))

              precision    recall  f1-score   support

     culture       0.62      0.34      0.44       831
     hi-tech       1.00      1.00      1.00      1233
    politics       0.75      0.97      0.85      2154
     science       0.97      0.76      0.85      1052

    accuracy                           0.83      5270
   macro avg       0.84      0.77      0.78      5270
weighted avg       0.83      0.83      0.82      5270



In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42))]

clf = Pipeline([('vect', CountVectorizer()),
                ('tfdif', TfidfTransformer()),
                ('clf', StackingClassifier(estimators=estimators, final_estimator=LogisticRegression()))])

clf.fit(raw_train, raw_target_train)

stackclass_predicted = clf.predict(raw_test)

print(metrics.classification_report(raw_target_test, stackclass_predicted))

              precision    recall  f1-score   support

     culture       0.99      1.00      1.00       831
     hi-tech       1.00      1.00      1.00      1233
    politics       1.00      1.00      1.00      2154
     science       1.00      0.99      1.00      1052

    accuracy                           1.00      5270
   macro avg       1.00      1.00      1.00      5270
weighted avg       1.00      1.00      1.00      5270



In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = MultinomialNB()


eclf1 = Pipeline([('vect', CountVectorizer()),
                ('tfdif', TfidfTransformer()),
                ('clf', VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('mnb', clf3)], voting='hard'))])

eclf1.fit(raw_train, raw_target_train)

vc1_predicted = eclf1.predict(raw_test)

print(metrics.classification_report(raw_target_test, vc1_predicted))

              precision    recall  f1-score   support

     culture       0.99      1.00      1.00       831
     hi-tech       1.00      1.00      1.00      1233
    politics       1.00      1.00      1.00      2154
     science       1.00      0.99      1.00      1052

    accuracy                           1.00      5270
   macro avg       1.00      1.00      1.00      5270
weighted avg       1.00      1.00      1.00      5270



In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = MultinomialNB()


eclf2 = Pipeline([('vect', CountVectorizer()),
                ('tfdif', TfidfTransformer()),
                ('clf', VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('mnb', clf3)], voting='soft'))])

eclf2.fit(raw_train, raw_target_train)

vc2_predicted = eclf2.predict(raw_test)

print(metrics.classification_report(raw_target_test, vc2_predicted))

              precision    recall  f1-score   support

     culture       1.00      1.00      1.00       831
     hi-tech       1.00      1.00      1.00      1233
    politics       1.00      1.00      1.00      2154
     science       1.00      1.00      1.00      1052

    accuracy                           1.00      5270
   macro avg       1.00      1.00      1.00      5270
weighted avg       1.00      1.00      1.00      5270



------

In [36]:
classifiers = {
    'AdaBoostClassifier': abc_predicted,
    'BaggingClassifier': bc_predicted,
    'ExtraTreesClassifier': efc_predicted
}

In [46]:
raw_target_test_array = raw_target_test.values

In [49]:
reports = {}
for name, predicted in classifiers.items():
    report = metrics.classification_report(raw_target_test_array, predicted, output_dict=True)
    reports[name] = report

In [50]:
reports

{'AdaBoostClassifier': {'culture': {'precision': 0.6160520607375272,
   'recall': 0.3417569193742479,
   'f1-score': 0.43962848297213625,
   'support': 831},
  'hi-tech': {'precision': 1.0,
   'recall': 1.0,
   'f1-score': 1.0,
   'support': 1233},
  'politics': {'precision': 0.7532608695652174,
   'recall': 0.9651810584958217,
   'f1-score': 0.8461538461538463,
   'support': 2154},
  'science': {'precision': 0.9742647058823529,
   'recall': 0.7557034220532319,
   'f1-score': 0.8511777301927195,
   'support': 1052},
  'accuracy': 0.8332068311195446,
  'macro avg': {'precision': 0.8358944090462744,
   'recall': 0.7656603499808254,
   'f1-score': 0.7842400148296755,
   'support': 5270},
  'weighted avg': {'precision': 0.8334705210824666,
   'recall': 0.8332068311195446,
   'f1-score': 0.8190485059066358,
   'support': 5270}},
 'BaggingClassifier': {'culture': {'precision': 0.992831541218638,
   'recall': 1.0,
   'f1-score': 0.9964028776978416,
   'support': 831},
  'hi-tech': {'precision