# Libraries


In [None]:
!pip install pymorphy2
!pip install natasha
!pip install -U sentence-transformers

Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m932.3 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting dawg-python>=0.7.1 (from pymorphy2)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4 (from pymorphy2)
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docopt>=0.6 (from pymorphy2)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13706 sha256=3bca8557825d356b3951e6458dee011304c07111a614be670a7afa0a6119e0f2
  Stored in directory: /root

In [None]:
import numpy as np
from tabulate import tabulate
import pandas as pd
import difflib
import re
from tqdm.autonotebook import tqdm
import copy
import os
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
import difflib
from sentence_transformers import SentenceTransformer
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
pd.set_option('display.max_columns', None)

  from tqdm.autonotebook import tqdm


In [None]:
# Fetch the NLTK resources needed later: the 'stopwords' corpus (read via
# stopwords.words below) and the 'punkt' data (presumably required by word_tokenize).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Preparing Topics


In [None]:
def topics(file_path):
    """Build a per-topic word table from an Excel dictionary workbook.

    Every sheet of the workbook is treated as one topic. Cells of all columns
    whose header starts with 'Базовый' are collected as base words, cells of
    all columns whose header starts with 'Продвинутый' as advanced words.
    Missing (NaN) and blank cells are skipped, so no literal "nan" entries
    end up in the word lists.

    Args:
        file_path: Path to the Excel workbook.

    Returns:
        pandas.DataFrame with one row per sheet and the columns "Topic"
        (sheet name), "Base Words" and "Advanced Words". Both word columns
        hold ', '-joined strings (an empty string when nothing was found).
    """
    def _collect(sheet, prefix):
        # Flatten all matching columns row by row and keep only real values.
        cells = np.array(sheet.loc[:, sheet.columns.str.startswith(prefix)]).flatten()
        return ', '.join(
            str(cell) for cell in cells
            if pd.notna(cell) and str(cell).strip() != ""
        )

    topic_names = []
    base_words = []
    advanced_words = []
    # The context manager closes the workbook handle once every sheet is read.
    with pd.ExcelFile(file_path) as excel:
        for sheet_name in excel.sheet_names:
            sheet = pd.read_excel(excel, sheet_name=sheet_name)
            base_words.append(_collect(sheet, 'Базовый'))
            advanced_words.append(_collect(sheet, 'Продвинутый'))
            topic_names.append(sheet_name)

    return pd.DataFrame({
        "Topic": topic_names,
        "Base Words": base_words,
        "Advanced Words": advanced_words,
    })

In [None]:
def find_important_words(df):
    """Extract the "important" words of every topic and strip the marker characters.

    The word columns (every column after the first) are expected to hold
    ', '-joined strings. A topic whose name starts with "!" treats all of its
    base and advanced words as important; otherwise only base words that start
    with "!" are important.

    Cleaning is done word by word: the markers '+' and '!' are removed,
    surrounding whitespace is trimmed, and entries that are empty or exactly
    "nan" (a stringified missing cell) are dropped. Words that merely contain
    the letters "nan" are left intact.

    Args:
        df: DataFrame with the columns "Topic", "Base Words", "Advanced Words".
            It is not modified.

    Returns:
        A new DataFrame in which every word column, plus the added "!" column
        of important words, holds a list of cleaned words. A row without words
        holds [""], which downstream code uses as the "empty" marker. Marker
        characters are also removed from "Topic".
    """
    def _strip_markers(text):
        return text.replace('+', '').replace('!', '')

    def _clean(words):
        cleaned = []
        for word in words:
            word = _strip_markers(str(word)).strip()
            if word and word != "nan":
                cleaned.append(word)
        # [""] mirrors what "".split(', ') yields and is relied upon downstream.
        return cleaned or [""]

    out = df.copy()
    word_columns = list(out.columns[1:])
    split_columns = {
        column: [str(cell).split(', ') for cell in out[column]]
        for column in word_columns
    }

    important_words = []
    rows = zip(out["Topic"], split_columns["Base Words"], split_columns["Advanced Words"])
    for topic, base, advanced in rows:
        if topic.startswith("!"):
            candidates = base + advanced
        else:
            candidates = [word for word in base if word.startswith("!")]
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        important_words.append(list(dict.fromkeys(_clean(candidates))))

    for column, rows_of_words in split_columns.items():
        out[column] = pd.Series(
            [_clean(words) for words in rows_of_words], index=out.index, dtype=object
        )
    out["!"] = pd.Series(important_words, index=out.index, dtype=object)
    out["Topic"] = [
        _strip_markers(topic) if isinstance(topic, str) else topic
        for topic in out["Topic"]
    ]
    return out

In [None]:
def del_duplicates(df):
    """Remove from each topic's base words those that are also advanced words.

    The comparison is done row by row: a word is removed from a row's
    "Base Words" only when it appears in the same row's "Advanced Words".
    Words shared with a different topic are left alone.

    Args:
        df: DataFrame whose "Base Words" and "Advanced Words" columns hold
            lists of words. The "Base Words" column is replaced in place.

    Returns:
        The same DataFrame, for call chaining.
    """
    deduplicated = []
    for base, advanced in zip(df["Base Words"], df["Advanced Words"]):
        advanced_set = set(advanced)
        deduplicated.append([word for word in base if word not in advanced_set])
    df["Base Words"] = pd.Series(deduplicated, index=df.index, dtype=object)
    return df

In [None]:
def add_weights(df, base_weight=1, advanced_weight=5):
    """Attach a positional weight to every dictionary word of every topic.

    For each row the unique base words come first, followed by the unique
    advanced words, and the weight list is built in exactly that order, so
    weights[i] always belongs to words[i]. A word present in both lists is
    counted once, as an advanced word, which keeps both lists the same length.

    Args:
        df: DataFrame with list-valued columns "Base Words", "Advanced Words"
            and "!". An advanced list that is empty or starts with "" means
            "no advanced words". The columns "Words" and "Weights" are added
            in place.
        base_weight: Weight given to base words.
        advanced_weight: Weight given to advanced words.

    Returns:
        (df, words_with_weights), where words_with_weights holds one
        [words, weights, important_words] triple per row. `words` is an
        ordered list rather than a set, so that it stays aligned with
        `weights` when iterated.
    """
    def _unique(words):
        # Order-preserving de-duplication.
        return list(dict.fromkeys(words))

    all_words = []
    all_weights = []
    for base, advanced in zip(df["Base Words"], df["Advanced Words"]):
        if len(advanced) == 0 or advanced[0] == "":
            advanced = []
        else:
            advanced = _unique(advanced)
        advanced_set = set(advanced)
        base = [word for word in _unique(base) if word not in advanced_set]

        all_words.append(base + advanced)
        all_weights.append([base_weight] * len(base) + [advanced_weight] * len(advanced))

    df["Words"] = pd.Series(all_words, index=df.index, dtype=object)
    df["Weights"] = pd.Series(all_weights, index=df.index, dtype=object)

    words_with_weights = [
        [list(words), list(weights), important]
        for words, weights, important in zip(all_words, all_weights, df["!"])
    ]
    return df, words_with_weights

In [None]:
# Build the cleaned, weighted word table of each ESG pillar in one pass:
# load workbook -> extract important words -> drop base/advanced overlaps -> add weights.
Environmental, weights_E = add_weights(del_duplicates(find_important_words(
    topics("/content/dictionaries/RU Dictionary_AI-Powered ESG Rating - Topics Environmental.xlsx")
)))
Social, weights_S = add_weights(del_duplicates(find_important_words(
    topics("/content/dictionaries/RU Dictionary_AI-Powered ESG Rating - Topics Social.xlsx")
)))
Governance, weights_G = add_weights(del_duplicates(find_important_words(
    topics("/content/dictionaries/RU Dictionary_AI-Powered ESG Rating - Topics Governance.xlsx")
)))

In [None]:
# Map every topic name to its [words, weights, important words] triple, one dict per ESG pillar.
topic_E= dict(zip(Environmental['Topic'], weights_E))
topic_S= dict(zip(Social['Topic'], weights_S))
topic_G= dict(zip(Governance['Topic'], weights_G))
# NOTE(review): this rebinds the name `topics`, which so far referred to the
# workbook-loading function defined above. Re-running the loading cell after this
# line fails because `topics` is then a list, not a callable.
topics = [topic_E, topic_S, topic_G]

# Preparing News Dataset

Companies to find

In [None]:
# Read the rating workbook (second sheet, header on the third row) and keep
# only the brand / legal-entity column as a plain array.
company_sheet = pd.read_excel(
    '/content/companies/Рейтинг ESG.xlsx',
    sheet_name=1,
    header=2,
)
companies = np.array(company_sheet["Бренд и Юрлицо (!)"])

In [None]:
def format_company_name(company):
    """Turn a raw company label into a list of search keywords.

    A few labels are mapped to hand-written keyword lists. Every other label
    is stripped of quotation marks, of everything after the first comma, of
    parenthesised remarks and of a fixed set of legal-form / descriptive
    fragments. The remainder is split on backslashes, except for the
    'ЛУГАПОРТ LUGAPORT' label, which is split on whitespace.

    Args:
        company: Raw label string.

    Returns:
        A new list of keyword strings (safe for the caller to extend).
    """
    hand_written = (
        ('"Открытие"', ['банк открытие']),
        ('«Интернет Решения (OZON.RU)»', ['ozon', 'озон']),
        ('Тинькофф Банк', ['Тинькофф', 'Tinkoff']),
        ('«Титан», группа компаний (нефтехимия)',
         '«Титан», группа компаний (нефтехимия)'.strip().split("\\")),
        ('"Титан", группа компаний (деревообработка)',
         '"Титан", группа компаний (деревообработка)'.strip().split("\\")),
    )
    for marker, keywords in hand_written:
        if marker in company:
            return list(keywords)

    for quote in ('«', '»', '"', '”', '“'):
        company = company.replace(quote, '')

    # Applied in this exact order; each pattern deletes one kind of noise.
    noise_patterns = (
        r',.*$',
        r'\s*\(.*?\)\s*',
        r'\bПАО\b',
        r'\bАО\b',
        r'\b \u200eЦентр-Инвест\b',
        r'\bЗолоторудная Компания\b',
        r'\b - Российские авиалинии\b',
        r'\bАкционерная компания \b',
    )
    for pattern in noise_patterns:
        company = re.sub(pattern, '', company)

    trimmed = company.strip()
    if 'ЛУГАПОРТ LUGAPORT' in company:
        return trimmed.split()
    return trimmed.split("\\")

# Build the lookup: raw company label -> list of search keywords
companies_dict = {company: format_company_name(company) for company in companies}

# Extra spellings and brand names that cannot be derived from the label itself
extra_aliases = {
    'Segezha Group': ["Сегежа Групп", "Сегёжа Групп"],
    'СБЕРБАНК': ["Sberbank", "Sber", "Сбер"],
    '«Яндекс»': ["Yandex"],
    'VK': ["Вконтакте", "Vkontakte"],
    '«М.Видео-Эльдорадо», группа': ["М.Видео", "Эльдорадо"],
    '«Киви Банк»': ["киви", "qiwi"],
    'Билайн (Вымпелком СЗФО)': ["Вымпелком"],
}
for label, aliases in extra_aliases.items():
    companies_dict[label].extend(aliases)

# Display the result
companies_dict

{'Адмиралтейские верфи': ['Адмиралтейские верфи'],
 'Алмаз-Антей': ['Алмаз-Антей'],
 'Армалит': ['Армалит'],
 'Аттика ': ['Аттика'],
 'Балтийский завод': ['Балтийский завод'],
 'Бронка Групп': ['Бронка Групп'],
 'Всеволожский крановый завод': ['Всеволожский крановый завод'],
 'Гидроприбор': ['Гидроприбор'],
 'Гранит-Электрон': ['Гранит-Электрон'],
 'Завод «Измерон»': ['Завод Измерон'],
 'Императорский Фарфоровый завод': ['Императорский Фарфоровый завод'],
 'Кировский завод': ['Кировский завод'],
 'Корпорация морского приборостроения': ['Корпорация морского приборостроения'],
 'Метеор лифт (бывший Отис)': ['Метеор лифт'],
 'Ситроникс': ['Ситроникс'],
 'Транспак': ['Транспак'],
 'Центр судоремонта "Звёздочка"': ['Центр судоремонта Звёздочка'],
 'Электроприбор': ['Электроприбор'],
 '«Группа ПОЛИПЛАСТИК»': ['Группа ПОЛИПЛАСТИК'],
 '«Упаковочные системы»': ['Упаковочные системы'],
 'ЭкопрофХим': ['ЭкопрофХим'],
 '«ФосАгро»': ['ФосАгро'],
 '«Уралкалий»': ['Уралкалий'],
 '«Уралхим», ОХК': ['У

Concat and prepare dataset

In [None]:
# Load every news export.
df1 = pd.read_csv("/content/news/ESG_test_dataset_first_part.csv")
df2 = pd.read_csv("/content/news/ESG_test_dataset_second_part.csv")
df3 = pd.read_csv("/content/news/ESG_test_dataset_1000.csv")
df4 = pd.read_csv("/content/news/ESG_test_dataset_first_partt.csv")
df5 = pd.read_csv("/content/news/ESG_test_dataset_second_partt.csv")
df6 = pd.read_csv("/content/news/MOEX_1.csv")
df7 = pd.read_csv("/content/news/MOEX_2.csv")
df8 = pd.read_csv("/content/news/MOEX_3.csv")

# Stack all parts in file order and renumber the rows.
df = pd.concat([df1, df2, df3, df4, df5, df6, df7, df8]).reset_index(drop=True)

# Drop the unused metadata columns, keep one row per headline, discard rows
# with missing values and renumber again.
df = (
    df.drop(columns=['source', 'data', 'visibility', 'media_index', 'metainfo', 'main_part'])
    .drop_duplicates(subset=['header'])
    .dropna()
    .reset_index(drop=True)
)
df

Unnamed: 0,header,text
0,"""Аэрофлот"" изменил схему захода на посадку в М...",Ее скорректировали при приземлении во Внукове ...
1,Абитуриентов из Татарстана приглашают пройти о...,"МТС, цифровая экосистема, и Российский институ..."
2,Масштабная конференция True Tech Day снова соб...,"ПАО ""МТС"" (MOEX: MTSS), цифровая экосистема, о..."
3,В Шереметьево и Внуково изменили схемы заходов...,"""Аэрофлот"" и Госкорпорация по аэронавигации (п..."
4,"""Газпром"" не нужен: ЕС завершил отопительный с...","Несмотря на все попытки Кремля ""заморозить Евр..."
...,...,...
5482,РУСАЛ зажигает звезды,В 2016 году основатель РУСАЛа Олег Дерипаска п...
5483,Химик и металлург: лучшего работника месяца на...,Химик и металлург в одной профессии. А еще кон...
5484,Через Владивосток отправляется первая партия с...,"Первая партия свинины, после того, как в сентя..."
5485,Самый мощный суперкомпьютер будет создан в России,На базе строящегося в Саратовской области цент...


Define companies

In [None]:
# natasha building blocks used by find_companies below.
from natasha import (
    Segmenter,
    MorphVocab,
    NewsEmbedding,
    NewsNERTagger,
    Doc
)

# Shared, module-level instances: created once and reused for every article.
segmenter = Segmenter()
emb = NewsEmbedding()
ner_tagger = NewsNERTagger(emb)  # NER tagger built on top of the news embedding
morph_vocab = MorphVocab()  # passed to span.normalize() in find_companies

def find_companies(df, companies_dict = companies_dict):
    """Tag every article with the companies it mentions.

    Step 1 (NER): the article text and header are run through the natasha
    pipeline. Every ORG span is normalized and compared, case-insensitively,
    with the keywords of each company. A company is kept only when it was
    matched more than once in the article.

    Step 2 (fallback): articles still tagged "No company" are searched with
    whole-word regular expressions. The first company for which more than one
    distinct keyword variation occurs in "header + text" is assigned.

    Args:
        df: DataFrame with string columns "text" and "header". The columns
            "company" and "combined_text" are added in place.
        companies_dict: Mapping company label -> list of keyword variations.
            The default is bound when this function is defined.

    Returns:
        The same DataFrame. "company" holds the matched labels joined by
        "; ", or "No company".
    """
    no_company = "No company"
    results = []
    # A space keeps the last word of the text from fusing with the first word of the header.
    for news in tqdm(np.array(df.text + ' ' + df.header)):
        doc = Doc(news)
        doc.segment(segmenter)
        doc.tag_ner(ner_tagger)

        found_companies = []
        for span in doc.spans:
            span.normalize(morph_vocab)
            if span.type != 'ORG':
                continue
            organisation = span.normal.lower()
            for company_name, keywords in companies_dict.items():
                for item in keywords:
                    keyword = item.lower()
                    if keyword == "газ":
                        # For this very short keyword the test is reversed: the
                        # organisation must be contained in the keyword, so longer
                        # names that merely contain "газ" do not match.
                        if organisation in keyword:
                            found_companies.append(company_name)
                    elif keyword in organisation:
                        found_companies.append(company_name)

        count = Counter(found_companies)
        filtered = [company for company in found_companies if count[company] > 1]

        if filtered:
            # dict.fromkeys de-duplicates with a deterministic order.
            results.append("; ".join(dict.fromkeys(filtered)))
        else:
            results.append(no_company)

    df["company"] = results

    df['combined_text'] = df['header'] + ' ' + df['text']

    for index, row in df.iterrows():
        # Must compare with the exact sentinel written above, otherwise the
        # fallback never runs.
        if row['company'] != no_company:
            continue
        combined_text = row['combined_text']
        for company, variations in companies_dict.items():
            # Number of distinct variations that occur at least once as a whole word.
            count = sum(
                1 for variation in variations
                if re.search(r'\b' + re.escape(variation) + r'\b', combined_text, re.IGNORECASE)
            )
            if count > 1:
                df.at[index, 'company'] = company
                break
    return df

# Tag the articles and keep only those in which a company was found.
df = (
    find_companies(df)
    .loc[lambda frame: frame.company != 'No company']
    .reset_index(drop=True)
)
df

  0%|          | 0/5487 [00:00<?, ?it/s]

Unnamed: 0,header,text,company,combined_text
0,"""Аэрофлот"" изменил схему захода на посадку в М...",Ее скорректировали при приземлении во Внукове ...,«Аэрофлот - Российские авиалинии»,"""Аэрофлот"" изменил схему захода на посадку в М..."
1,Абитуриентов из Татарстана приглашают пройти о...,"МТС, цифровая экосистема, и Российский институ...",МТС,Абитуриентов из Татарстана приглашают пройти о...
2,Масштабная конференция True Tech Day снова соб...,"ПАО ""МТС"" (MOEX: MTSS), цифровая экосистема, о...",МТС,Масштабная конференция True Tech Day снова соб...
3,"""Газпром"" не нужен: ЕС завершил отопительный с...","Несмотря на все попытки Кремля ""заморозить Евр...",«Газпром»,"""Газпром"" не нужен: ЕС завершил отопительный с..."
4,Российские авиакомпании в весенне-летний перио...,"Российские авиаперевозчики ""Победа"", ""Аэрофлот...",«Аэрофлот - Российские авиалинии»,Российские авиакомпании в весенне-летний перио...
...,...,...,...,...
3198,"""Полюс"" вернул равновесие на рынок сурьмы",Месторождение Олимпиада золотодобывающей компа...,«Полюс»,"""Полюс"" вернул равновесие на рынок сурьмы Мест..."
3199,РУСАЛ зажигает звезды,В 2016 году основатель РУСАЛа Олег Дерипаска п...,"«Русал», объединенная компания",РУСАЛ зажигает звезды В 2016 году основатель Р...
3200,Химик и металлург: лучшего работника месяца на...,Химик и металлург в одной профессии. А еще кон...,МАГНИТ,Химик и металлург: лучшего работника месяца на...
3201,Через Владивосток отправляется первая партия с...,"Первая партия свинины, после того, как в сентя...","«Русагро», группа компаний",Через Владивосток отправляется первая партия с...


Divide by paragraphs

In [None]:
import pandas as pd

# Split the text into paragraphs and expand the DataFrame
def expand_by_paragraphs(row, min_word_count=20):
    """Split one article into paragraphs of at least `min_word_count` words.

    The text is split on newlines. A paragraph shorter than the limit is not
    emitted; it is merged (space-joined) into the following paragraph until
    the limit is reached. Text still pending after the last paragraph is
    emitted as is.

    Args:
        row: Mapping with the keys 'text', 'header' and 'company'.
        min_word_count: Minimum number of whitespace-separated words per paragraph.

    Returns:
        pandas.DataFrame with the columns 'header', 'paragraph', 'company',
        one row per resulting paragraph.
    """
    merged = []
    pending = ""

    for chunk in row['text'].split('\n'):
        # Prepend the short text that is still waiting to be merged, if any.
        candidate = pending + " " + chunk if pending.split() else chunk

        if len(candidate.split()) >= min_word_count:
            # Long enough: emit it and start afresh.
            merged.append(candidate)
            pending = ""
        else:
            # Too short: carry it over to the next paragraph.
            pending = candidate

    # Emit whatever is left once the loop is over.
    if pending.split():
        merged.append(pending)

    size = len(merged)
    return pd.DataFrame({
        'header': [row['header']] * size,
        'paragraph': merged,
        'company': [row['company']] * size
    })

# Expand the source DataFrame: one row per paragraph
df_paragraph = pd.concat(
    [expand_by_paragraphs(article, min_word_count=20) for _, article in df.iterrows()],
    ignore_index=True,
)
df_paragraph

Unnamed: 0,header,paragraph,company
0,"""Аэрофлот"" изменил схему захода на посадку в М...",Ее скорректировали при приземлении во Внукове ...,«Аэрофлот - Российские авиалинии»
1,"""Аэрофлот"" изменил схему захода на посадку в М...","Такая мера сократит и ""негативное влияние угле...",«Аэрофлот - Российские авиалинии»
2,Абитуриентов из Татарстана приглашают пройти о...,"МТС, цифровая экосистема, и Российский институ...",МТС
3,Абитуриентов из Татарстана приглашают пройти о...,В этом году предварительный отбор абитуриентов...,МТС
4,Абитуриентов из Татарстана приглашают пройти о...,Желающим принять участие в отборочном туре на ...,МТС
...,...,...,...
25992,Через Владивосток отправляется первая партия с...,"Вся Находка (всянаходка.рф), Находка, 14 марта...","«Русагро», группа компаний"
25993,Самый мощный суперкомпьютер будет создан в России,На базе строящегося в Саратовской области цент...,СБЕРБАНК
25994,Самый мощный суперкомпьютер будет создан в России,В декабре 2020 года между правительством облас...,СБЕРБАНК
25995,Самый мощный суперкомпьютер будет создан в России,Площадь технологических модулей составит приме...,СБЕРБАНК


In [None]:
def check_companies(row, company_dict = None):
    """Keep only the tagged companies that are actually mentioned in this paragraph.

    The "company" field is split on "; " (the separator find_companies writes;
    the labels themselves may contain ", "). A company is kept when at least
    one of its keyword variations occurs, case-insensitively, in
    "header + paragraph".

    Args:
        row: Mapping with the string keys 'header', 'paragraph' and 'company'.
        company_dict: Mapping company label -> list of keyword variations (a
            single string is accepted too). Defaults to the module-level
            `companies_dict`, looked up at call time.

    Returns:
        The surviving labels joined by "; ", or 'No company' when none is left.
    """
    if company_dict is None:
        company_dict = companies_dict

    text = (row['header'] + ' ' + row['paragraph']).lower()
    matched_companies = []
    for company in row['company'].split("; "):
        variations = company_dict.get(company, [])
        if isinstance(variations, str):
            variations = [variations]
        # Empty variations are skipped: "" is a substring of every text.
        if any(variation and variation.lower() in text for variation in variations):
            matched_companies.append(company)

    if not matched_companies:
        return 'No company'
    return '; '.join(matched_companies)

# Re-check the company tag of every paragraph and drop paragraphs that mention none.
df_paragraph['company'] = df_paragraph.apply(check_companies, axis=1)
# .copy() detaches the filtered frame from its parent, so later column
# assignments do not operate on a slice (SettingWithCopyWarning).
df_paragraph = df_paragraph[df_paragraph.company != 'No company'].copy()

In [None]:
def check_companies(row):
    """Keep only the tagged companies whose label occurs in this paragraph.

    NOTE(review): this redefines `check_companies` from the previous cell.
    This variant matches on the company label itself instead of its keyword
    variations — confirm which of the two definitions is meant to stay.

    The "company" field is split on "; " (the separator find_companies writes;
    the labels themselves may contain ", "). A label is kept when it occurs,
    case-insensitively, in "header + paragraph".

    Args:
        row: Mapping with the string keys 'header', 'paragraph' and 'company'.

    Returns:
        The surviving labels joined by "; ", or 'No company' when none is left.
    """
    text = (row['header'] + ' ' + row['paragraph']).lower()
    matched_companies = [
        company for company in row['company'].split("; ")
        if company.lower() in text
    ]
    if not matched_companies:
        return 'No company'
    return '; '.join(matched_companies)

# Re-check the company tag of every paragraph and drop paragraphs that mention none.
df_paragraph['company'] = df_paragraph.apply(check_companies, axis=1)
# .copy() detaches the filtered frame from its parent, so later column
# assignments do not operate on a slice (SettingWithCopyWarning).
df_paragraph = df_paragraph[df_paragraph.company != 'No company'].copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_paragraph['company'] = df_paragraph.apply(check_companies, axis=1)


In [None]:
df_paragraph

Unnamed: 0,header,paragraph,company
0,"""Аэрофлот"" изменил схему захода на посадку в М...",Ее скорректировали при приземлении во Внукове ...,«Аэрофлот - Российские авиалинии»
1,"""Аэрофлот"" изменил схему захода на посадку в М...","Такая мера сократит и ""негативное влияние угле...",«Аэрофлот - Российские авиалинии»
2,Абитуриентов из Татарстана приглашают пройти о...,"МТС, цифровая экосистема, и Российский институ...",МТС
3,Абитуриентов из Татарстана приглашают пройти о...,В этом году предварительный отбор абитуриентов...,МТС
4,Абитуриентов из Татарстана приглашают пройти о...,Желающим принять участие в отборочном туре на ...,МТС
...,...,...,...
25992,Через Владивосток отправляется первая партия с...,"Вся Находка (всянаходка.рф), Находка, 14 марта...","«Русагро», группа компаний"
25993,Самый мощный суперкомпьютер будет создан в России,На базе строящегося в Саратовской области цент...,СБЕРБАНК
25994,Самый мощный суперкомпьютер будет создан в России,В декабре 2020 года между правительством облас...,СБЕРБАНК
25995,Самый мощный суперкомпьютер будет создан в России,Площадь технологических модулей составит приме...,СБЕРБАНК


In [None]:
# Persist the paragraph-level frame to the working directory (the index is written too, since index=False is not passed).
df_paragraph.to_csv("paragraphs_new.csv")

Val Dataset

In [None]:
# Raw strings keep the backslashes for the regex engine instead of relying on
# Python's handling of unknown string escapes.

# One or more Cyrillic letters. 'ё'/'Ё' are listed explicitly because they lie
# outside the а-я / А-Я code-point ranges.
RE_RUSSIAN_TEXT = re.compile(r"[а-яА-ЯёЁ]+")
# Runs of Latin letters, digits and punctuation.
EXCLUDE_PATTERNS = re.compile(r"[A-Za-z0-9!#$%&'()*+,./:;<=>?@[\]^_`{|}~—\"\-]+")
# Two word-character runs joined by a hyphen (with optional surrounding space) or by "! - ".
RE_DERIVED = re.compile(r"\w+( -|- |-|! - )\w+")

# Russian stop-word list from NLTK and a shared pymorphy2 analyzer; both are used by clean_texts.
stopwords_ru = stopwords.words("russian")
morph = MorphAnalyzer()

In [None]:
def clean_texts(df):
    """Add a "cleaned_texts" column with the lemmatised Russian words of each row.

    For every row, "header" and "paragraph" are joined with a space, fragments
    matching RE_DERIVED are removed, the rest is tokenised, and only tokens
    that start with a Cyrillic letter (RE_RUSSIAN_TEXT) are kept. Stop words
    are dropped case-insensitively, so a capitalised stop word at the start of
    a sentence is removed as well. Each remaining token is replaced by its
    first pymorphy2 normal form.

    Args:
        df: DataFrame with string columns "header" and "paragraph". It is not
            modified; rows containing missing values are excluded from the result.

    Returns:
        A new DataFrame without the incomplete rows and with the extra column
        "cleaned_texts" (space-joined lemmas).
    """
    # Work on an independent copy: no in-place changes to the caller's frame
    # and no assignment to a slice.
    df = df.dropna(axis=0).copy()
    stop_words = set(stopwords_ru)  # set membership is O(1)

    cleaned_text = []
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        report_page = RE_DERIVED.sub("", row["header"] + " " + row["paragraph"])
        tokens = []
        for word_ in filter(RE_RUSSIAN_TEXT.match, word_tokenize(report_page)):
            word_ = word_.strip()
            if not word_ or word_.lower() in stop_words:
                continue
            tokens.append(morph.normal_forms(word_)[0])
        cleaned_text.append(" ".join(tokens))

    df["cleaned_texts"] = cleaned_text
    return df

In [None]:
# Add the lemmatised "cleaned_texts" column to the paragraph-level frame.
df_paragraph = clean_texts(df_paragraph)
df_paragraph

  0%|          | 0/25997 [00:00<?, ?it/s]

Unnamed: 0,header,paragraph,company,cleaned_texts
0,"""Аэрофлот"" изменил схему захода на посадку в М...",Ее скорректировали при приземлении во Внукове ...,«Аэрофлот - Российские авиалинии»,аэрофлот изменить схема заход посадка москва о...
1,"""Аэрофлот"" изменил схему захода на посадку в М...","Такая мера сократит и ""негативное влияние угле...",«Аэрофлот - Российские авиалинии»,аэрофлот изменить схема заход посадка москва т...
2,Абитуриентов из Татарстана приглашают пройти о...,"МТС, цифровая экосистема, и Российский институ...",МТС,абитуриент татарстан приглашать пройти отбор г...
3,Абитуриентов из Татарстана приглашают пройти о...,В этом году предварительный отбор абитуриентов...,МТС,абитуриент татарстан приглашать пройти отбор г...
4,Абитуриентов из Татарстана приглашают пройти о...,Желающим принять участие в отборочном туре на ...,МТС,абитуриент татарстан приглашать пройти отбор г...
...,...,...,...,...
25992,Через Владивосток отправляется первая партия с...,"Вся Находка (всянаходка.рф), Находка, 14 марта...","«Русагро», группа компаний",через владивосток отправляться первый партия с...
25993,Самый мощный суперкомпьютер будет создан в России,На базе строящегося в Саратовской области цент...,СБЕРБАНК,самый мощный суперкомпьютер создать россия на ...
25994,Самый мощный суперкомпьютер будет создан в России,В декабре 2020 года между правительством облас...,СБЕРБАНК,самый мощный суперкомпьютер создать россия в д...
25995,Самый мощный суперкомпьютер будет создан в России,Площадь технологических модулей составит приме...,СБЕРБАНК,самый мощный суперкомпьютер создать россия пло...


In [None]:
#df = pd.read_excel("/content/true_topics.xlsx")
# NOTE(review): with the line above commented out, `df` is whatever an earlier
# cell left behind. clean_texts reads the "header" and "paragraph" columns —
# confirm that this `df` provides them before running the cell.
df = clean_texts(df)
df

  0%|          | 0/1076 [00:00<?, ?it/s]

Unnamed: 0,topic,text,cleaned_texts
0,Климат,"В рамках стратегии устойчивого развития, комп...",в рамка стратегия устойчивый развитие компания...
1,Климат,В рамках своей долгосрочной экологической стр...,в рамка свой долгосрочный экологический страте...
2,Климат,Недавние экологические катастрофы на территор...,недавний экологический катастрофа территория р...
3,Климат,Роснефть получила штраф в размере 10 миллионо...,роснефть получить штраф размер миллион доллар ...
4,Климат,Роснефть объявила о планах по развитию произв...,роснефть объявить план развитие производство б...
...,...,...,...
1124,Отношения с инвесторами,Ведущий российский интернет-ритейлер Озон объя...,ведущий российский озон объявить привлечение с...
1125,Отношения с инвесторами,Озон столкнулся с уменьшением инвестиций\n,озон столкнуться уменьшение инвестиция
1126,Отношения с инвесторами,После неудовлетворительной финансовой отчетнос...,после неудовлетворительный финансовый отчётнос...
1127,Отношения с инвесторами,Озон обнародовал планы по вторичному размещен...,озон обнародовать план вторичный размещение акция


# Define topics

Cosine Similarity

In [None]:
def weighted_cosine_similarity(vector1, vector2, weights = None):
    """Weighted cosine similarity of two equally long vectors.

    Computes
        sum(w * a * b) / (sqrt(sum(w * a**2)) * sqrt(sum(w * b**2)))
    so every component contributes according to its own weight.

    Args:
        vector1: First vector (any 1-D sequence of numbers).
        vector2: Second vector, same length as `vector1`.
        weights: Per-component weights, same length; all ones when omitted.

    Returns:
        The similarity rounded to 5 decimals, or 0 when either vector is all
        zeros or the weighted norm of either vector is zero.

    Raises:
        ValueError: If the three lengths differ.
    """
    if weights is None:
        weights = [1] * len(vector1)

    vector1 = np.asarray(vector1, dtype=float)
    vector2 = np.asarray(vector2, dtype=float)
    weights = np.asarray(weights, dtype=float)

    if len(vector1) != len(vector2) or len(vector1) != len(weights):
        raise ValueError("Размерности векторов и весов должны совпадать.")

    if np.all(vector1 == 0) or np.all(vector2 == 0):
        return 0

    # Element-wise product first, then the weighted sum.
    numerator = np.dot(weights, vector1 * vector2)
    denominator = np.sqrt(np.dot(weights, np.square(vector1)) * np.dot(weights, np.square(vector2)))
    if denominator == 0:
        # All non-zero components carry zero weight.
        return 0

    return round(float(numerator / denominator), 5)

Jaccard coefficient

In [None]:
def jaccard_coef(vector1, vector2):
    """Jaccard coefficient of the non-zero positions of two vectors.

    Args:
        vector1: First vector (any 1-D sequence of numbers).
        vector2: Second vector, same length.

    Returns:
        |A ∩ B| / |A ∪ B|, where A and B are the index sets of the non-zero
        components of `vector1` and `vector2`; 0.0 when both vectors are all
        zeros.

    Raises:
        ValueError: If the lengths differ.
    """
    if len(vector1) != len(vector2):
        raise ValueError("Размерности векторов и весов должны совпадать.")
    nonzero1 = np.nonzero(np.asarray(vector1))[0]
    nonzero2 = np.nonzero(np.asarray(vector2))[0]
    union = np.union1d(nonzero1, nonzero2)
    if len(union) == 0:
        return 0.0
    intersection = np.intersect1d(nonzero1, nonzero2)
    return len(intersection) / len(union)

Pearson correlation

In [None]:
def weighted_pearson_correlation(x, y, weights = None):
    """Absolute value of the weighted Pearson correlation of two vectors.

    Means, variances and the covariance all use the same weights.

    Args:
        x: First vector (any 1-D sequence of numbers).
        y: Second vector, same length as `x`.
        weights: Per-component weights, same length; all ones when omitted.

    Returns:
        |corr_w(x, y)|. The result is nan when either vector has zero weighted
        variance.

    Raises:
        ValueError: If the three lengths differ.
    """
    if weights is None:
        weights = [1] * len(x)
    if len(x) != len(y) or len(x) != len(weights):
        raise ValueError("Размеры векторов не совпадают")

    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    weights = np.asarray(weights, dtype=float)

    # Centre on the weighted means, not the plain ones.
    mean_x = np.average(x, weights=weights)
    mean_y = np.average(y, weights=weights)

    total_weight = np.sum(weights)
    covariance = np.sum(weights * (x - mean_x) * (y - mean_y)) / total_weight
    var_x = np.sum(weights * (x - mean_x)**2) / total_weight
    var_y = np.sum(weights * (y - mean_y)**2) / total_weight

    weighted_pearson_corr = covariance / np.sqrt(var_x * var_y)

    return abs(weighted_pearson_corr)

In [None]:
def pearson_corr(vector1, vector2):
    """Absolute value of the (unweighted) Pearson correlation of two vectors.

    Args:
        vector1: First vector.
        vector2: Second vector, same length.

    Returns:
        The absolute value of the off-diagonal entry of np.corrcoef.

    Raises:
        ValueError: If the lengths differ.
    """
    if len(vector1) == len(vector2):
        correlation_matrix = np.corrcoef(vector1, vector2)
        return abs(correlation_matrix[0, 1])
    raise ValueError("Размерности векторов и весов должны совпадать.")

Euclidean distance

In [None]:
def euclidian_dist(vector1, vector2):
    """Euclidean distance between two equally long vectors.

    Plain Python sequences are accepted as well as NumPy arrays, like in the
    other similarity helpers.

    Args:
        vector1: First vector.
        vector2: Second vector, same length.

    Returns:
        The L2 norm of the difference.

    Raises:
        ValueError: If the lengths differ.
    """
    if len(vector1) != len(vector2):
        raise ValueError("Размерности векторов и весов должны совпадать.")
    difference = np.asarray(vector1, dtype=float) - np.asarray(vector2, dtype=float)
    return np.linalg.norm(difference)

Define

In [None]:
def max_topic(df, topics):
    """Score every cleaned text against every topic of each topic group.

    Args:
        df: DataFrame with a "cleaned_texts" column. Its index is reset in place.
        topics: Sequence of dicts, one per topic group. Each dict maps a topic
            name to a triple whose element 0 is the collection of dictionary
            words, element 1 the list of weights and element 2 the collection
            of important words. The final join uses exactly the first three groups.

    Returns:
        (df_scores, result): `result` holds one DataFrame per topic group with
        a score column per topic plus "max_score" and "max_topic"; `df_scores`
        is `df` joined with the first three of these frames. Because rsuffix
        only applies to clashing column names, the first group's
        "max_score"/"max_topic" keep their plain names, while the second and
        third groups' get the suffixes "S" and "G".
    """
    result = []
    df.reset_index(drop=True, inplace=True)
    for topic in topics:
        re_dict = {}
        sheets_dict = {}
        base_dict = {}
        word_list = []
        weights = {}
        importance = {}

        for idx, row in topic.items():
            # Reference vector of the topic: every dictionary word present.
            sheets_dict[idx] = {r: 1 for r in row[0]}
            # Template of a text vector: no dictionary word present yet.
            base_dict[idx] = {r: 0 for r in row[0]}
            weights[idx] = row[1]
            # Alternation of all words; each must not be followed by a word character.
            # NOTE(review): words are inserted unescaped and there is no leading
            # boundary, so a word can also match the tail of a longer word — confirm intended.
            re_dict[idx] = re.compile("(" + "(?!\w)|".join(row[0]) + "(?!\w))")
            # NOTE(review): word_list is filled here but not read again in this function.
            word_list.extend([{"topic": idx, "word": x, "type": 1} for x in row[0]])

        topic_scores = []
        for text in tqdm(df["cleaned_texts"].tolist()):
            paragraph_dict = copy.deepcopy(base_dict)
            found_words = False
            for key, regex in re_dict.items():
                importance[key] = False
                words = regex.findall(text)
                for item in words[:]:  # Use the [:] slice to iterate over a copy of the list
                    if isinstance(item, tuple):
                        words.append(item[0])
                        words.remove(item)
                for word in words:
                    paragraph_dict[key][word] = 1
                    found_words = True
                    # A topic only counts if one of its important words occurs.
                    if word in topic[key][2]:
                        importance[key] = True


            res = {}
            if not found_words:
                # No dictionary word at all: every score is 0 and max_topic is the number 0.
                res = {key: 0 for key in sheets_dict}
                topic_scores.append(res | {"max_score": 0, "max_topic": 0})
                continue

            for key, vector in sheets_dict.items():
                if importance[key]:
                    # Weights are matched to words by position in the dict value order.
                    sim = weighted_cosine_similarity(list(vector.values()), list(paragraph_dict[key].values()), weights[key])
                    res[key] = sim
                else:
                    sim = 0
                    res[key] = sim

            topic_scores.append(res | {"max_score": max(res.values()), "max_topic": max(res, key=lambda k: res[k])})

        result.append(pd.DataFrame(topic_scores))
    df_scores = df.join(result[0], rsuffix="E").join(result[1], rsuffix="S").join(result[2], rsuffix="G")
    return df_scores, result

In [None]:
# Score every paragraph against the topic dictionaries; the per-group score frames (second return value) are not needed here.
df_cos, _ = max_topic(df_paragraph, topics)

  0%|          | 0/25997 [00:00<?, ?it/s]

  0%|          | 0/25997 [00:00<?, ?it/s]

  0%|          | 0/25997 [00:00<?, ?it/s]

In [None]:
def get_max(row):
    """Return the topic of the pillar (E, S or G) with the highest maximum score.

    Ties are resolved in the order E, S, G.

    Args:
        row: Mapping with the keys 'max_scoreE', 'max_scoreS', 'max_scoreG'
            and 'max_topicE', 'max_topicS', 'max_topicG'.

    Returns:
        The 'max_topic*' value of the winning pillar.
    """
    best = max(row['max_scoreE'], row['max_scoreS'], row['max_scoreG'])
    for score_key, topic_key in (('max_scoreE', 'max_topicE'), ('max_scoreS', 'max_topicS')):
        if row[score_key] == best:
            return row[topic_key]
    # Neither E nor S holds the maximum, so it must be G.
    return row['max_topicG']

In [None]:
def result(df):
    """Finalize a scored frame and attach the overall ESG topic.

    The columns 'max_score' and 'max_topic' are renamed to 'max_scoreE' and
    'max_topicE', the names ``get_max`` reads alongside the S and G columns.
    A 'max_topicESG' column is then filled by applying ``get_max`` to every
    row.

    Args:
        df: DataFrame holding the per-group score/topic columns.

    Returns:
        The same DataFrame object; it is modified in place and returned.
    """
    e_group_names = {'max_score': 'max_scoreE', 'max_topic': 'max_topicE'}
    df.rename(columns=e_group_names, inplace=True)
    df['max_topicESG'] = df.apply(get_max, axis=1)
    return df

In [None]:
# Rename the unsuffixed E columns and add the overall 'max_topicESG' column,
# then display the resulting frame.
df_cos = result(df_cos)
df_cos

Unnamed: 0,header,paragraph,company,cleaned_texts,Экология в целом,Климат,Энергия,Воздух,Вода,Отходы и циклическая экономика,Биоразнообразие,Рекультивация земель,Экологичность продукта,max_scoreE,max_topicE,Персонал в целом,Обучение и развитие,Сотрудники. Вовлеченность и мот,Оплата труда,Сотрудники. Здоровье и благопол,Сотрудники. Наем и увольнение,Сотрудники. Корпоративная культ,Сотрудники. Безопасность и охра,Сотрудники. Профсоюз и Коллекти,Потребители. Доступность,Потребители. Сервис и коммуника,Потребители. Персональные данны,Потребители. Здоровье и благопо,"Потребители. Маркетинг, продажи",Потребители. Удовлетворенность,Потребители. Качество и безопас,Потребители. Ценовая политика,Поставщики в целом,Малый и локальный бизнес,Поставщики. Работники,Поставщики. Экология,Закупки и антикоррупция,Заинтересованные стороны,Коренные народы и местные сообщ,Сотрудники. Волонтерство,Социальные инвестиции и благотв,max_scoreS,max_topicS,Отчетность и прозрачность,Отношения с инвесторами,Инновации,Кибербезопасность,Права человека,Лидерство,Риски,Этика и антикоррупция,Корпоративное управление,Устойчивое развитие,max_scoreG,max_topicG,max_topicESG
0,"""Аэрофлот"" изменил схему захода на посадку в М...",Ее скорректировали при приземлении во Внукове ...,«Аэрофлот - Российские авиалинии»,аэрофлот изменить схема заход посадка москва о...,0.0,0.0,0.0,0.0,0.00000,0.08111,0.00000,0.0,0.00000,0.08111,Отходы и циклическая экономика,0.27735,0.00000,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.27735,Персонал в целом,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000,Отчетность и прозрачность,Персонал в целом
1,"""Аэрофлот"" изменил схему захода на посадку в М...","Такая мера сократит и ""негативное влияние угле...",«Аэрофлот - Российские авиалинии»,аэрофлот изменить схема заход посадка москва т...,0.0,0.0,0.0,0.0,0.00000,0.00000,0.00000,0.0,0.00000,0.00000,Экология в целом,0.00000,0.00000,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,Персонал в целом,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000,Отчетность и прозрачность,Экология в целом
2,Абитуриентов из Татарстана приглашают пройти о...,"МТС, цифровая экосистема, и Российский институ...",МТС,абитуриент татарстан приглашать пройти отбор г...,0.0,0.0,0.0,0.0,0.00000,0.00000,0.16984,0.0,0.00000,0.16984,Биоразнообразие,0.00000,0.00000,0.0,0.0,0.0,0.15076,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15250,Потребители. Удовлетворенность,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000,Отчетность и прозрачность,Биоразнообразие
3,Абитуриентов из Татарстана приглашают пройти о...,В этом году предварительный отбор абитуриентов...,МТС,абитуриент татарстан приглашать пройти отбор г...,0.0,0.0,0.0,0.0,0.00000,0.00000,0.00000,0.0,0.00000,0.00000,0,0.00000,0.00000,0.0,0.0,0.0,0.15076,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15076,Сотрудники. Наем и увольнение,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.106,0.106,Устойчивое развитие,Сотрудники. Наем и увольнение
4,Абитуриентов из Татарстана приглашают пройти о...,Желающим принять участие в отборочном туре на ...,МТС,абитуриент татарстан приглашать пройти отбор г...,0.0,0.0,0.0,0.0,0.00000,0.00000,0.00000,0.0,0.00000,0.00000,Экология в целом,0.00000,0.09713,0.0,0.0,0.0,0.15076,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15076,Сотрудники. Наем и увольнение,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000,Отчетность и прозрачность,Сотрудники. Наем и увольнение
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25992,Через Владивосток отправляется первая партия с...,"Вся Находка (всянаходка.рф), Находка, 14 марта...","«Русагро», группа компаний",через владивосток отправляться первый партия с...,0.0,0.0,0.0,0.0,0.00000,0.00000,0.00000,0.0,0.00000,0.00000,0,0.00000,0.00000,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000,0,0
25993,Самый мощный суперкомпьютер будет создан в России,На базе строящегося в Саратовской области цент...,СБЕРБАНК,самый мощный суперкомпьютер создать россия на ...,0.0,0.0,0.0,0.0,0.00000,0.00000,0.00000,0.0,0.00000,0.00000,Экология в целом,0.00000,0.00000,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,Персонал в целом,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000,Отчетность и прозрачность,Экология в целом
25994,Самый мощный суперкомпьютер будет создан в России,В декабре 2020 года между правительством облас...,СБЕРБАНК,самый мощный суперкомпьютер создать россия в д...,0.0,0.0,0.0,0.0,0.00000,0.00000,0.00000,0.0,0.00000,0.00000,Экология в целом,0.00000,0.00000,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,Персонал в целом,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000,Отчетность и прозрачность,Экология в целом
25995,Самый мощный суперкомпьютер будет создан в России,Площадь технологических модулей составит приме...,СБЕРБАНК,самый мощный суперкомпьютер создать россия пло...,0.0,0.0,0.0,0.0,0.12356,0.11471,0.00000,0.0,0.18257,0.18257,Экологичность продукта,0.00000,0.00000,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.18677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.18677,Потребители. Удовлетворенность,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000,Отчетность и прозрачность,Потребители. Удовлетворенность


In [None]:
# Build a per-company view: drop the header/cleaned-text columns and the
# per-group maxima, split multi-company cells on '; ' into one row per company,
# then order the rows by company name with a fresh index.
df1 = (
    df_cos
    .drop(
        columns=[
            "header",
            "cleaned_texts",
            "max_scoreE",
            "max_topicE",
            "max_scoreS",
            "max_topicS",
            "max_scoreG",
            "max_topicG",
        ]
    )
    .assign(company=lambda frame: frame["company"].str.split('; '))
    .explode("company")
    .sort_values("company")
    .reset_index(drop=True)
)
df1

Unnamed: 0,paragraph,company,Экология в целом,Климат,Энергия,Воздух,Вода,Отходы и циклическая экономика,Биоразнообразие,Рекультивация земель,Экологичность продукта,Персонал в целом,Обучение и развитие,Сотрудники. Вовлеченность и мот,Оплата труда,Сотрудники. Здоровье и благопол,Сотрудники. Наем и увольнение,Сотрудники. Корпоративная культ,Сотрудники. Безопасность и охра,Сотрудники. Профсоюз и Коллекти,Потребители. Доступность,Потребители. Сервис и коммуника,Потребители. Персональные данны,Потребители. Здоровье и благопо,"Потребители. Маркетинг, продажи",Потребители. Удовлетворенность,Потребители. Качество и безопас,Потребители. Ценовая политика,Поставщики в целом,Малый и локальный бизнес,Поставщики. Работники,Поставщики. Экология,Закупки и антикоррупция,Заинтересованные стороны,Коренные народы и местные сообщ,Сотрудники. Волонтерство,Социальные инвестиции и благотв,Отчетность и прозрачность,Отношения с инвесторами,Инновации,Кибербезопасность,Права человека,Лидерство,Риски,Этика и антикоррупция,Корпоративное управление,Устойчивое развитие,max_topicESG
0,"3 М. Дивиденды на каждую акцию - $0,7. Дата вы...","""Альфа-Банк""",0.0,0.0,0.0000,0.0,0.00000,0.00000,0.0,0.0,0.00000,0.00000,0.0,0.1543,0.0,0.00000,0.0,0.14003,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.1525,0.00000,0.0000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.15713,0.00000,0.000,0.24495,0.00000,0.00000,0.0,0.14535,0.00000,0.0,0.12217,0.106,Отношения с инвесторами
1,"Исследование проводилось по 20 критериям, кот...","""Альфа-Банк""",0.0,0.0,0.0000,0.0,0.00000,0.00000,0.0,0.0,0.00000,0.27735,0.0,0.0000,0.0,0.00000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0000,0.00000,0.0000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.00000,0.14022,0.000,0.00000,0.32444,0.00000,0.0,0.00000,0.00000,0.0,0.00000,0.000,Инновации
2,Альфа-Банк впервые с 2019 года занял лидирующу...,"""Альфа-Банк""",0.0,0.0,0.0000,0.0,0.00000,0.00000,0.0,0.0,0.00000,0.00000,0.0,0.0000,0.0,0.00000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0000,0.00000,0.0000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.00000,0.00000,0.000,0.00000,0.22942,0.00000,0.0,0.00000,0.00000,0.0,0.00000,0.000,Инновации
3,- Во главе угля - Производство мяса для шашлык...,"""Альфа-Банк""",0.0,0.0,0.0000,0.0,0.00000,0.00000,0.0,0.0,0.00000,0.00000,0.0,0.0000,0.0,0.14286,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0000,0.00000,0.1849,0.0,0.17961,0.0,0.0,0.0,0.0,0.0,0.00000,0.00000,0.000,0.00000,0.00000,0.37796,0.0,0.00000,0.07538,0.0,0.00000,0.000,Кибербезопасность
4,- Все на выручку - Что будет с курсом рубля по...,"""Альфа-Банк""",0.0,0.0,0.0000,0.0,0.00000,0.00000,0.0,0.0,0.00000,0.00000,0.0,0.0000,0.0,0.00000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0000,0.00000,0.0000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.00000,0.00000,0.000,0.00000,0.00000,0.00000,0.0,0.00000,0.10660,0.0,0.00000,0.000,Риски
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36768,Красноборское г.п.: 25.04.2024 года с 11:00 до...,“Россети Ленэнерго”,0.0,0.0,0.0000,0.0,0.00000,0.00000,0.0,0.0,0.00000,0.27735,0.0,0.0000,0.0,0.00000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.15554,0.0000,0.04880,0.0000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.00000,0.00000,0.106,0.00000,0.00000,0.00000,0.0,0.00000,0.00000,0.0,0.00000,0.000,Персонал в целом
36769,Тосненское г.п.: 26.04.2024 года с 00:00 до 01...,“Россети Ленэнерго”,0.0,0.0,0.0000,0.0,0.00000,0.00000,0.0,0.0,0.00000,0.27735,0.0,0.0000,0.0,0.00000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.15554,0.0000,0.06776,0.0000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.00000,0.00000,0.106,0.00000,0.00000,0.00000,0.0,0.00000,0.00000,0.0,0.00000,0.000,Персонал в целом
36770,Тосненское г.п.: 26.04.2024 года с 00:00 до 01...,“Россети Ленэнерго”,0.0,0.0,0.0000,0.0,0.00000,0.00000,0.0,0.0,0.00000,0.27735,0.0,0.0000,0.0,0.00000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.15554,0.0000,0.06776,0.0000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.00000,0.00000,0.106,0.00000,0.00000,0.00000,0.0,0.00000,0.00000,0.0,0.00000,0.000,Персонал в целом
36771,"Фотографии предоставила пресс-служба ""Россети ...",“Россети Ленэнерго”,0.0,0.0,0.2037,0.0,0.00000,0.00000,0.0,0.0,0.18257,0.00000,0.0,0.0000,0.0,0.00000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0000,0.00000,0.0000,0.0,0.17961,0.0,0.0,0.0,0.0,0.0,0.00000,0.00000,0.000,0.00000,0.00000,0.37796,0.0,0.00000,0.07538,0.0,0.00000,0.000,Кибербезопасность


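# Accuracy

Share of rows whose predicted topic (`ESG HSE`) matches the reference topic (`Корректный топик ESG HSE`).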

In [None]:
# LabelEncoder and accuracy_score are already imported in the notebook's top
# import cell, so they are not re-imported here.
le = LabelEncoder()
# Fit on both columns together so every label that appears in either the
# predictions or the reference gets an integer code.
le.fit(pd.concat([df['ESG HSE'], df['Корректный топик ESG HSE']]))
pred = le.transform(df['ESG HSE'])
true = le.transform(df['Корректный топик ESG HSE'])

# Fraction of rows where the predicted code equals the reference code.
accuracy_score(true, pred)
#df.to_excel('results.xlsx', index=False)

0.7432432432432432