# Imports

In [1]:
!pip install pymorphy3

Collecting pymorphy3
  Downloading pymorphy3-2.0.3-py3-none-any.whl.metadata (1.9 kB)
Collecting dawg2-python>=0.8.0 (from pymorphy3)
  Downloading dawg2_python-0.9.0-py3-none-any.whl.metadata (7.5 kB)
Collecting pymorphy3-dicts-ru (from pymorphy3)
  Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl.metadata (2.0 kB)
Downloading pymorphy3-2.0.3-py3-none-any.whl (53 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dawg2_python-0.9.0-py3-none-any.whl (9.3 kB)
Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl (8.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m00:01[0m:00:01[0m
[?25hInstalling collected packages: pymorphy3-dicts-ru, dawg2-python, pymorphy3
Successfully installed dawg2-python-0.9.0 pymorphy3-2.0.3 pymorphy3-dicts-ru-2.4.417150.4580142


In [2]:
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from pymorphy3 import MorphAnalyzer
import pymorphy3
import json
import numpy as np


In [None]:

with open("/kaggle/input/vkr-data/diary_entries_final (1).json", "r", encoding="utf-8") as file:
    data = json.load(file)

In [None]:
def remove_old_keys(json_data):
    """Return a new dict without the entries whose key is a year before 1900.

    Keys are converted with ``int`` only for the comparison; the returned
    dict keeps the original key objects and their insertion order.
    """
    recent_entries = {}
    for year_key in json_data:
        if int(year_key) < 1900:
            continue
        recent_entries[year_key] = json_data[year_key]
    return recent_entries

filtered_data = remove_old_keys(data)


In [None]:
rows = []
for year, texts in filtered_data.items():
    for text in texts:
        rows.append({"year": int(year), "text": text})

df = pd.DataFrame(rows)



In [None]:
df

In [None]:
df.info()

In [None]:
df["decade"] = (df["year"] // 10) * 10
decade_counts = df["decade"].value_counts().sort_index()
decade_counts


In [None]:
df

# Register analysis

In [None]:
df_filtered_copy = df.copy()

In [None]:
import re
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')
morph = MorphAnalyzer()
russian_stopwords = stopwords.words('russian') + ['это', 'весь', 'который']

def preprocess(text):
    """Turn a raw Russian text into a list of content-word lemmas.

    The text is lower-cased, every character outside the Cyrillic alphabet is
    replaced by a space, and the remaining words are lemmatized with the
    module-level pymorphy analyzer ``morph`` (first parse only).

    Stop words are filtered twice: on the surface form (which also avoids
    parsing the word) and on the lemma. The lemma check is what makes the
    lemma-form entries of ``russian_stopwords`` ('весь', 'который', ...)
    effective for inflected forms such as 'всех' or 'которого'.

    Args:
        text: the raw document string.

    Returns:
        list[str]: lemmas of the words longer than two characters that are
        not stop words, in document order.
    """
    # Keep Cyrillic letters only; digits, punctuation and Latin become separators.
    text = re.sub(r'[^а-яёА-ЯЁ]', ' ', text.lower())

    # Set lookup instead of scanning the stop-word list for every word.
    stop_words = set(russian_stopwords)

    tokens = []
    for word in text.split():
        if len(word) <= 2 or word in stop_words:
            continue
        lemma = morph.parse(word)[0].normal_form
        if lemma in stop_words:
            continue
        tokens.append(lemma)

    return tokens

df_filtered_copy['processed'] = df_filtered_copy['text'].apply(preprocess)
df_filtered_copy

In [None]:
from gensim import corpora, models
import pyLDAvis.gensim_models

# Создание словаря и корпуса
dictionary = corpora.Dictionary(df_filtered_copy['processed'])
corpus = [dictionary.doc2bow(text) for text in df_filtered_copy['processed']]

# Обучение LDA модели
lda_model = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=15,  # Экспериментируйте с количеством тем
    passes=10,
    alpha='auto'
)

# Визуализация
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)

In [None]:
from gensim.models import CoherenceModel

# Расчет когерентности
coherence_model = CoherenceModel(
    model=lda_model,
    texts=df_filtered_copy['processed'],
    dictionary=dictionary,
    coherence='c_v'
)

coherence_score = coherence_model.get_coherence()
print(f"Coherence Score (C_v): {coherence_score:.3f}")

In [None]:
from collections import defaultdict

# Per-year topic mentions: a topic is attributed to a document when the LDA
# model gives it a probability above this threshold.
TOPIC_PROB_THRESHOLD = 0.3

yearly_topics = defaultdict(list)
for year, doc_tokens in zip(df_filtered_copy['year'], df_filtered_copy['processed']):
    bow = dictionary.doc2bow(doc_tokens)
    topics = lda_model.get_document_topics(bow)
    yearly_topics[year].extend(
        topic_id for topic_id, prob in topics if prob > TOPIC_PROB_THRESHOLD
    )

# Trend plot: one line per topic, x = year, y = number of documents of that
# year attributed to the topic.
years = sorted(yearly_topics)
topic_counts_by_year = {year: Counter(yearly_topics[year]) for year in years}

plt.figure(figsize=(15, 8))
# Take the number of topics from the model so the plot stays in sync with it.
for topic_id in range(lda_model.num_topics):
    counts = [topic_counts_by_year[year][topic_id] for year in years]
    plt.plot(years, counts, label=f"Topic {topic_id}")

plt.xlabel("Year")
plt.ylabel("Documents attributed to the topic")
plt.legend()
plt.show()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF matrix from the lemmatized documents (tokens re-joined with spaces).
tfidf = TfidfVectorizer(max_features=1000)
X = tfidf.fit_transform(df_filtered_copy['processed'].apply(' '.join))

# Number of top-scoring words to report; the slice and the printed header
# both use it so they cannot drift apart.
TOP_N_WORDS = 20

feature_names = tfidf.get_feature_names_out()
# Column sums as a flat array; np.asarray works for both matrix and array results.
tfidf_scores = np.asarray(X.sum(axis=0)).ravel()
top_words = sorted(
    zip(feature_names, tfidf_scores), key=lambda x: x[1], reverse=True
)[:TOP_N_WORDS]

print(f"Топ-{TOP_N_WORDS} ключевых слов:")
for word, score in top_words:
    print(f"{word}: {score:.2f}")

In [None]:
!pip install natasha

In [None]:
from natasha import (
    Doc,
    Segmenter,
    MorphVocab,
    NewsEmbedding,
    NewsMorphTagger,
    NewsNERTagger
)

# Инициализация компонентов
segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
ner_tagger = NewsNERTagger(emb)

def extract_entities(text):
    """Run the Natasha pipeline on ``text`` and return its named entities.

    The document is segmented, morphologically tagged and NER-tagged with the
    module-level ``segmenter``, ``morph_tagger`` and ``ner_tagger``. Every
    found span is normalized with ``morph_vocab``.

    Returns:
        list[tuple]: one ``(normalized span text, span type)`` pair per span,
        in the order the spans appear in ``doc.spans``.
    """
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.tag_ner(ner_tagger)

    def _as_pair(span):
        # normalize() fills span.normal in place
        span.normalize(morph_vocab)
        return (span.normal, span.type)

    return [_as_pair(span) for span in doc.spans]

# Применяем функцию к данным
df_filtered_copy['entities'] = df_filtered_copy['text'].apply(extract_entities)

# Извлекаем все сущности
all_entities = [ent for sublist in df_filtered_copy['entities'] for ent in sublist]

# Фильтруем персоны и локации
persons = [ent[0] for ent in all_entities if ent[1] == 'PER']
locations = [ent[0] for ent in all_entities if ent[1] == 'LOC']

print("Топ-10 персон:", Counter(persons).most_common(10))
print("Топ-10 локаций:", Counter(locations).most_common(10))

In [None]:
from razdel import sentenize

def extract_questions(text):
    """Return the sentences of ``text`` that end with a question mark.

    Sentences come from ``razdel.sentenize``; each one is stripped of
    surrounding whitespace before the check and in the returned list.
    """
    questions = []
    for sentence in sentenize(text):
        stripped = sentence.text.strip()
        if stripped.endswith('?'):
            questions.append(stripped)
    return questions

# Создаем список словарей с вопросами и исходными текстами
result = []
for _, row in df_filtered_copy.iterrows():
    questions = extract_questions(row['text'])
    for question in questions:
        result.append({
            'original_text': row['text'],
            'question': question,
            'year': row.get('year', None)  # если есть год
        })

# Создаем новый DataFrame
questions_df = pd.DataFrame(result)

# Просмотр результатов
print(f"Найдено вопросов: {len(questions_df)}")
questions_df.head(1005) 

# Linguistic features extraction

In [None]:
df_ling = df.copy()

In [None]:
df_ling["text"] = df_ling["text"].str.replace(r'\n|\t|\r|</p>|<p>', ' ', regex=True)

df_ling


In [None]:

# Словарь для хранения DataFrame
decade_dfs = {}

# Группировка и сохранение
for decade, group in df_ling.groupby('decade'):
    decade_dfs[decade] = group[["text", "year"]].copy()

    # Вывод информации
    print(f"\nДекада {decade}-{decade+9} ({len(decade_dfs[decade])} записей)")

In [None]:
decade_dfs[1900]

In [None]:
!python -m spacy download ru_core_news_lg

In [None]:
!pip install stanza

In [None]:
!nvcc --version
!pip install cupy-cuda12x

In [None]:
import stanza
import spacy
import cupy

# Инициализация моделей
if cupy.is_available():
    print("GPU доступен")
    spacy.prefer_gpu()
nlp_spacy = spacy.load("ru_core_news_lg")
stanza.download('ru')
nlp_stanza = stanza.Pipeline('ru', use_gpu=True)


In [None]:
# Lemmas treated as indefinite pronouns/determiners when counting features.
indef_list = {
    'некто', 'нечто', 'некоторый', 'несколько', 'некий',
    'кое-кто', 'кое-что', 'кое-какой', 'кое-чей',
    'кто-то', 'что-то', 'какой-то', 'чей-то',
    'кто-нибудь', 'что-нибудь', 'какой-нибудь', 'чей-нибудь',
    'кто-либо', 'что-либо', 'какой-либо', 'чей-либо',
}

# Lemmas treated as place adverbials when counting features.
place_adv_list = {
    'вблизи', 'вверху', 'вдалеке', 'вдали', 'взаперти', 'вне', 'внизу',
    'внутри', 'вовне', 'возле', 'вокруг', 'впереди', 'всюду', 'высоко',
    'где', 'далеко', 'далёко', 'изнутри', 'навстречу', 'наособицу',
    'невдалеке', 'недалеко', 'недалечко', 'неподалёку', 'низом', 'одаль',
    'одесную', 'около', 'окрест', 'откуда', 'отсюда', 'передом',
    'поблизости', 'повсюду', 'поодаль', 'посередине', 'посерёдке',
    'посреди', 'посредине', 'прочь', 'рядом', 'сверху', 'сзади', 'слева',
    'снаружи', 'снизу', 'спереди', 'справа', 'там',
    'вверх', 'вниз', 'доселе', 'досель', 'дотуда', 'дотудова', 'изовсюду',
    'кое-куда', 'кое-откуда', 'кой-куда', 'куда', 'куда угодно',
    'куда-либо', 'куда-нибудь', 'куда-то', 'никуда', 'нигде', 'отколь',
    'откуда-либо', 'откуда-то', 'откудова', 'отовсюду', 'отселе', 'отсель',
    'отсюдова', 'отсюду', 'оттелева', 'оттель', 'оттоль', 'оттуда',
    'оттудова', 'сюда', 'туда', 'тут', 'туда-обратно', 'туда-сюда',
}

# Lemmas treated as time adverbials when counting features.
# Every entry ends with a comma (including the last one): without it Python
# silently concatenates adjacent string literals into a single wrong entry.
# NOTE(review): 'вдалеке' and 'вдали' look spatial rather than temporal and
# are also listed in place_adv_list — confirm they belong here.
time_adv_list = {
    'анадысь', 'ввек', 'вдалеке', 'вдали', 'весной', 'весною',
    'вечером', 'вечор', 'вовремя', 'впоследствии', 'впредь', 'встарь',
    'вчера', 'вчерась', 'давеча', 'давно', 'дальше', 'днём',
    'днесь', 'днями', 'доднесь', 'долго', 'доле', 'долее',
    'доныне', 'досветла', 'дотемна', 'древле', 'ежедневно', 'ежеквартально',
    'ежемесячно', 'еженочно', 'еженощно', 'заблаговременно', 'завременно', 'завтра',
    'задолго', 'зараз', 'заранее', 'засветло', 'засим', 'затем',
    'затемно', 'зимой', 'зимою', 'издревле', 'иногда', 'испокон',
    'каждодневно', 'каждомесячно', 'когда-либо', 'летом', 'навсегда', 'надолго',
    'надысь', 'накануне', 'намедни', 'насовсем', 'наутро', 'невовремя',
    'недавно', 'незадолго', 'несвоевременно', 'нонеча', 'нонче', 'ночию',
    'ночью', 'ныне', 'нынече', 'нынче', 'однажды', 'отныне',
    'первоначально', 'поднесь', 'подоле', 'подчас', 'позднее', 'поздно',
    'позже', 'пока', 'покамест', 'покуда', 'поначалу', 'поныне',
    'порой', 'после', 'послезавтра', 'прежде', 'ранее', 'рано',
    'раньше', 'редко', 'сегодня', 'сейгод', 'сейчас', 'скоро',
    'смальства', 'смолоду', 'сперва', 'спокон', 'сразу', 'стемна',
    'сыздетства', 'сызмала', 'сызмалу', 'сызмальства', 'теперича', 'теперь',
    'третёвось', 'третьёвось', 'утром', 'часто', 'ща', 'щас',
}

In [None]:
# Загрузка данных 
abstraction_df_noun = pd.read_csv('/kaggle/input/vkr-data/Slovar.r.ya..s.indeksom.konkretnosti.slov.csv')
abstraction_df_adj = pd.read_csv ('/kaggle/input/vkr-data/Slovar.r.ya..s.indeksom.konkretnosti.slov_.csv')


In [None]:
abstraction_df_noun

In [None]:
abstraction_df_adj

In [None]:
def _normalize_by_pos(word, pos):
    """Lower-case and strip ``word``; lemmatize it when its top parse has POS ``pos``.

    The module-level pymorphy analyzer ``morph`` is used and only its first
    parse is consulted. Normalization is best-effort: if the analyzer fails
    for any reason, the lower-cased, stripped input is returned instead.
    """
    fallback = str(word).lower().strip()
    try:
        parsed = morph.parse(str(word))[0]
        if pos in parsed.tag:
            return parsed.normal_form.lower().strip()
    except Exception:
        # Deliberate best-effort fallback. `Exception` (not a bare except)
        # so that KeyboardInterrupt / SystemExit still propagate.
        pass
    return fallback


def normalize_noun(word):
    """Return the lemma of ``word`` if pymorphy tags it as a noun (NOUN).

    Otherwise, or when the analysis fails, return ``str(word)`` lower-cased
    and stripped.
    """
    return _normalize_by_pos(word, 'NOUN')


def normalize_adj(word):
    """Return the lemma of ``word`` if pymorphy tags it as a full adjective (ADJF).

    Otherwise, or when the analysis fails, return ``str(word)`` lower-cased
    and stripped.
    """
    return _normalize_by_pos(word, 'ADJF')

# Обработка существительных
normalized_dict_noun = defaultdict(list)

for raw_word, score in zip(abstraction_df_noun['word'], abstraction_df_noun['Индекс С/A']):
    normalized = normalize_noun(raw_word)
    normalized_dict_noun[normalized].append(float(score))  # Конвертация в float

abstraction_dict_noun = {}
for lemma, scores in normalized_dict_noun.items():
    abstraction_dict_noun[lemma] = sum(scores) / len(scores)  # Ручной расчет среднего

print({k: v for k, v in list(abstraction_dict_noun.items())[:5]})

# Обработка прилагательных
abstraction_df_adj = abstraction_df_adj.copy()
abstraction_df_adj['Индекс С/A'] = abstraction_df_adj['Индекс С/A'].astype(float)
abstraction_df_adj['normalized'] = abstraction_df_adj['w'].apply(normalize_adj)

abstraction_dict_adj = (
    abstraction_df_adj
    .groupby('normalized')['Индекс С/A']
    .mean()
    .to_dict()
)

print({k: float(v) for k, v in list(abstraction_dict_adj.items())[:5]})


In [None]:
import re
# Предварительная настройка
# One compiled pattern per suffix (presumably diminutive suffixes, judging by
# the name): the suffix followed by any run of letters in the range а-я up to
# the end of the string.
dim_patterns = list(map(
    lambda suffix: re.compile('(' + suffix + ')[а-я]*$'),
    (
        'ик', 'ек', 'к', 'ок', 'ёк', 'ец', 'иц',
        'очк', 'ечк', 'оньк', 'еньк', 'ышк', 'ишк', 'ушк', 'юшк',
    ),
))

In [None]:
from collections import deque
class RussianTextAnalyzer:
    def __init__(self, text):
        """Parse ``text`` with both pipelines and prepare an empty feature dict.

        Args:
            text: the document to analyse; it is passed to the module-level
                ``nlp_spacy`` and ``nlp_stanza`` pipelines.
        """
        self.text = text
        self.text_len = len(text)

        # Both parses are kept: the feature extractors read from each of them.
        self.spacy_doc = nlp_spacy(text)
        self.stanza_doc = nlp_stanza(text)

        # Token texts of the spaCy parse, punctuation excluded.
        self.words = []
        for token in self.spacy_doc:
            if token.is_punct:
                continue
            self.words.append(token.text)

        # Per-instance copies of the abstractness dictionaries with float values.
        self.abstraction_dict_noun = dict(
            (lemma, float(score)) for lemma, score in abstraction_dict_noun.items()
        )
        self.abstraction_dict_adj = dict(
            (lemma, float(score)) for lemma, score in abstraction_dict_adj.items()
        )

        # Filled by the feature-extraction methods.
        self.features = dict()

    def analyze(self):
        self._lexical_features()
        self._syntactic_features()
        self._morphological_features()
        return self.features

    def _lexical_features(self):
        # Лексические признаки
        pos_counts = Counter(token.pos_ for token in self.spacy_doc)
        # morph_features_count = Counter(token.morph.to_dict() for token in self.spacy_doc)

        gram_spacy = {}
        for token in self.spacy_doc:
          if token.pos_ not in gram_spacy:
            gram_spacy[token.pos_] = Counter(token.morph.to_dict().values())
          else:
            gram_spacy[token.pos_] += Counter(token.morph.to_dict().values())

        gram_stanza = {}
        for sent in self.stanza_doc.sentences:
          for word in sent.words:
            if word.upos not in gram_stanza:
              gram_stanza[word.upos] = Counter(dict(item.split('=') for item in word.feats.split('|')).values() if word.feats else {})
            else:
              gram_stanza[word.upos] += Counter(dict(item.split('=') for item in word.feats.split('|')).values() if word.feats else {})



        def analyze_abstr():
            upos_lemmas = [
                (word.upos, word.lemma.lower().strip())
                for sent in self.stanza_doc.sentences
                for word in sent.words
            ]

            scores_noun = []
            scores_adj = []
            total_nouns = 0
            total_adj = 0

            for upos, lemma in upos_lemmas:
                if upos == 'NOUN':
                    total_nouns += 1
                    if lemma in self.abstraction_dict_noun:
                        scores_noun.append(float(self.abstraction_dict_noun[lemma]))  # Явное преобразование
                if upos == 'ADJ':
                    total_adj += 1
                    if lemma in self.abstraction_dict_adj:
                        scores_adj.append(float(self.abstraction_dict_adj[lemma]))

            # Расчет статистики
            mean_noun = sum(scores_noun)/len(scores_noun) if scores_noun else None
            mean_adj = sum(scores_adj)/len(scores_adj) if scores_adj else None
            sorted_scores_noun = sorted(scores_noun)
            sorted_scores_adj = sorted(scores_adj)
            n_noun = len(sorted_scores_noun)
            n_adj = len(sorted_scores_adj)
            median_noun = (
                sorted_scores_noun[n_noun//2]
                if n_noun % 2 else
                (sorted_scores_noun[n_noun//2-1] + sorted_scores_noun[n_noun//2])/2
            ) if scores_noun else None
            median_adj = (
                sorted_scores_adj[n_adj//2]
                if n_adj % 2 else
                (sorted_scores_adj[n_adj//2-1] + sorted_scores_adj[n_adj//2])/2
            ) if scores_adj else None

            return {
                'mean_score_noun': mean_noun,
                'mean_score_adj': mean_adj,
                'median_score_noun': median_noun,
                'median_score_adj': median_adj,
                'coverage_noun': len(scores_noun)/total_nouns if total_nouns else 0.0,
                'coverage_adj': len(scores_adj)/total_adj if total_adj else 0.0,
                'total_nouns': total_nouns,
                'total_adj': total_adj
            }

        abstr_pos = analyze_abstr()
        self.features.update({
            'first_person_pronouns_sing': sum(
                                                1
                                                for t in self.spacy_doc
                                                if t.pos_ == 'PRON'
                                                and 'Person=First' in t.morph
                                                and 'Number=Sing' in t.morph
                                              ),
            'first_person_pronouns_plur': sum(
                                                1
                                                for t in self.spacy_doc
                                                if t.pos_ == 'PRON'
                                                and 'Person=First' in t.morph
                                                and 'Number=Plur' in t.morph
                                              ),
            'second_person_pronouns_sing': sum(
                                                1
                                                for t in self.spacy_doc
                                                if t.pos_ == 'PRON'
                                                and 'Person=Second' in t.morph
                                                and 'Number=Sing' in t.morph
                                              ),
            'second_person_pronouns_plur': sum(
                                                1
                                                for t in self.spacy_doc
                                                if t.pos_ == 'PRON'
                                                and 'Person=Second' in t.morph
                                                and 'Number=Plur' in t.morph
                                              ),
            'third_person_pronouns_masc': sum(
                                                1
                                                for t in self.spacy_doc
                                                if t.pos_ == 'PRON'
                                                and 'Person=Third' in t.morph
                                                and 'Gender=Masc' in t.morph
                                              ),
            'third_person_pronouns_fem': sum(
                                                1
                                                for t in self.spacy_doc
                                                if t.pos_ == 'PRON'
                                                and 'Person=Third' in t.morph
                                                and 'Gender=Fem' in t.morph
                                            ),
            'third_person_pronouns_neut': sum(
                                                1
                                                for t in self.spacy_doc
                                                if t.pos_ == 'PRON'
                                                and 'Person=Third' in t.morph
                                                and 'Gender=Neut' in t.morph
                                             ),
            'third_person_pronouns_plur': sum(
                                                1
                                                for t in self.spacy_doc
                                                if t.pos_ == 'PRON'
                                                and 'Person=Third' in t.morph
                                                and 'Number=Plur' in t.morph
                                             ),
            'demonstrative_pronouns': gram_stanza.get('DET', Counter()).get('Dem',0),
            'prepositions': pos_counts.get('ADP', 0),
            'coordinationg_conjunctions': pos_counts.get('CCONJ', 0),
            'indefinite_pronouns': sum(
                                        1
                                        for sent in self.stanza_doc.sentences
                                        for word in sent.words
                                        if word.lemma.lower() in indef_list
                                        and word.upos in {'DET', 'PRON'}
                                    ),
            'place_adverbials': sum(
                                        1
                                        for sent in self.stanza_doc.sentences
                                        for word in sent.words
                                        if word.lemma.lower() in place_adv_list
                                        and word.upos in {'ADV'}
                                    ),
            'time_adverbials': sum(
                                        1
                                        for sent in self.stanza_doc.sentences
                                        for word in sent.words
                                        if word.lemma.lower() in time_adv_list
                                        and word.upos in {'ADV'}
                                    ),
            'noun_anim': sum(
                                  1
                                  for t in self.spacy_doc
                                  if t.pos_ == 'NOUN'
                                  and 'Animacy=Anim' in t.morph
                                ),
            'noun_inan': sum(
                              1
                              for t in self.spacy_doc
                              if t.pos_ == 'NOUN'
                              and 'Animacy=Inan' in t.morph
                            ),
            'noun_abstr_index': abstr_pos['mean_score_noun'],
            'adj_abstr_index': abstr_pos['mean_score_adj'],
            'latin_letters': sum(1
                                    for token in self.spacy_doc
                                    if any('LATN' in parse.tag for parse in morph.parse(token.text))
                                  ),
            'propr_name': sum(1
                                    for token in self.spacy_doc
                                    if any('Name' in parse.tag for parse in morph.parse(token.text))
                                  ),
            'patr_name': sum(1
                                    for token in self.spacy_doc
                                    if any('Patr' in parse.tag for parse in morph.parse(token.text))
                                  ),
            'sur_name': sum(1
                                    for token in self.spacy_doc
                                    if any('Surn' in parse.tag for parse in morph.parse(token.text))
                                  ),
            'praedicative': sum(1
                                    for token in self.spacy_doc
                                    if any('PRED' in parse.tag for parse in morph.parse(token.text)) or any('Prdx' in parse.tag for parse in morph.parse(token.text))
                                  ),
            'geo_name': sum(
                              1
                              for token in self.spacy_doc
                              if token.pos_ == "PROPN"
                              if any('Geox' in parse.tag for parse in morph.parse(token.text))
                              ),
            'intj': sum(1
                                    for token in self.spacy_doc
                                    if any('INTJ' in parse.tag for parse in morph.parse(token.text))
                                  )
        })


    def _syntactic_features(self):
        # Синтаксические признаки
        sentence_lengths = [len(sent.text.split()) for sent in self.spacy_doc.sents]

        def is_minor_constituent(word, sentence):
            """Определяет тип сочинительной конструкции"""
            if word.head == 0:
                return None

            head = sentence.words[word.head - 1]

            # Определяем тип конструкции
            if word.deprel == 'conj':
                if head.upos == 'NOUN' and word.upos == 'NOUN':
                    return 'NOUN'
                elif head.upos == 'ADJ' and word.upos == 'ADJ':
                    return 'ADJ'
                elif head.upos == 'VERB' and word.upos == 'VERB':
                    return 'VERB'
                elif head.upos == 'ADV' and word.upos == 'ADV':
                    return 'ADV'
                elif head.deprel in {'obj', 'nsubj', 'nmod'}:
                    return 'NOUN'  # Для именных дополнений
                elif head.deprel == 'amod':
                    return 'ADJ'   # Для определений
                elif head.deprel == 'advmod':
                    return 'ADV'   # Для обстоятельств

            return None

        def extract_minor_coordinations(text):
            doc = self.stanza_doc
            coordination_counts = {
                'NOUN': 0,
                'ADJ': 0,
                'VERB': 0,
                'ADV': 0,
                'OTHER': 0
            }

            for sentence in doc.sentences:
                for word in sentence.words:
                    if word.deprel == 'conj':
                        const_type = is_minor_constituent(word, sentence)
                        if const_type:
                            if const_type in coordination_counts:
                                coordination_counts[const_type] += 1
                            else:
                                coordination_counts['OTHER'] += 1

            return coordination_counts

        # Pass the analysed text explicitly: a bare `text` is not defined in this
        # method and would only resolve if a global of that name happened to exist.
        coordination_counts = extract_minor_coordinations(self.text)

        def analyze_tree(text):
            doc = self.stanza_doc
            features = {
                'max_tree_depth': 0,
                'avg_np_length': 0.0,
                'avg_vp_length': 0.0,
                'inversion_count': 0,
                'ellipsis_count': 0
            }

            all_depths = []
            np_lengths = []
            vp_lengths = []

            for sentence in doc.sentences:
                # 2.1. Глубина дерева
                depths = _calculate_depths(sentence)
                all_depths.extend(depths)

                # 2.2. Длина NP/VP
                nps = _extract_phrases(sentence, 'NP')
                vps = _extract_phrases(sentence, 'VP')
                np_lengths.extend([len(np) for np in nps])
                vp_lengths.extend([len(vp) for vp in vps])

                # 2.3. Инверсии
                features['inversion_count'] += _count_inversions(sentence)

                # 2.4. Эллипсис
                features['ellipsis_count'] += sum(1 for word in sentence.words
                                                if word.deprel == 'orphan')

            # Расчет итоговых значений
            if all_depths:
                features['max_tree_depth'] = max(all_depths)
            if np_lengths:
                features['avg_np_length'] = sum(np_lengths)/len(np_lengths)
            if vp_lengths:
                features['avg_vp_length'] = sum(vp_lengths)/len(vp_lengths)

            return features

        def _calculate_depths(sentence):
            """Улучшенный расчет глубины дерева с использованием BFS"""
            depths = []
            root = next((word for word in sentence.words if word.head == 0), None)
            if not root:
                return []

            queue = deque([(root, 0)])
            visited = set()

            while queue:
                word, depth = queue.popleft()
                if word.id in visited:
                    continue
                visited.add(word.id)
                depths.append(depth)

                # Добавляем дочерние узлы
                children = [w for w in sentence.words if w.head == word.id]
                for child in children:
                    queue.append((child, depth + 1))

            return depths

        def _extract_phrases(sentence, phrase_type):
            """Улучшенное извлечение фраз с фильтрацией"""
            phrases = []
            targets = {
                'NP': ['NOUN', 'PROPN', 'PRON'],
                'VP': ['VERB', 'AUX']
            }

            for word in sentence.words:
                if word.upos in targets[phrase_type]:
                    phrase = _get_phrase(sentence, word.id)
                    if _is_valid_phrase(phrase_type, sentence, phrase):
                        phrases.append(phrase)

            return phrases

        def _get_phrase(sentence, head_id):
            """Поиск в ширину для более точного определения границ фразы"""
            phrase = []
            queue = deque([head_id])
            visited = set()

            while queue:
                current_id = queue.popleft()
                if current_id in visited:
                    continue
                visited.add(current_id)

                phrase.append(current_id)
                current_word = sentence.words[current_id-1]

                # Добавляем только непосредственные зависимые
                children = [w.id for w in sentence.words
                          if w.head == current_id
                          and w.deprel not in ['punct', 'cc', 'mark']]
                queue.extend(children)

            return sorted(phrase)

        def _is_valid_phrase(phrase_type, sentence, phrase_ids):
            """Проверка валидности извлеченной фразы"""
            if len(phrase_ids) < 1:
                return False

            main_word = sentence.words[phrase_ids[0]-1]

            if phrase_type == 'NP':
                return main_word.upos in ['NOUN', 'PROPN', 'PRON']
            elif phrase_type == 'VP':
                return main_word.upos in ['VERB', 'AUX']
            return False

        def _count_inversions(sentence):
            """Улучшенный подсчет инверсий"""
            inversions = 0
            for word in sentence.words:
                if word.deprel == 'nsubj':
                    verb = sentence.words[word.head-1]
                    # Более гибкое условие для русского языка
                    if word.id > verb.id and (word.id - verb.id) >= 1:
                        context = sentence.words[verb.id-1:word.id]
                        if not any(w.deprel == 'advmod' for w in context):
                            inversions += 1
            return inversions

        # Pass the analysed text explicitly: a bare `text` is not defined in this
        # method and would only resolve if a global of that name happened to exist.
        tree_analysis = analyze_tree(self.text)

        def count_syllables_ru(word):
            """Return the number of syllables in a Russian word (never less than 1).

            In Russian every vowel letter forms its own syllable, so adjacent
            vowels are counted separately ('поэт' has 2 syllables, 'аэропорт'
            has 4). Strings without vowels (e.g. 'в', digits) count as 1.
            """
            vowels = 'аеёиоуыэюя'
            count = sum(1 for char in word.lower() if char in vowels)

            # At least one syllable, so vowel-less tokens still contribute.
            return max(1, count)

        def flesch_kincaid_russian(text):
            """Compute a Flesch-style readability score clamped to the range 0-100.

            The statistics are taken from ``self.stanza_doc``; the ``text``
            argument itself is not read. Every item of ``sent.words`` counts as
            a word, and syllables come from ``count_syllables_ru``.

            Args:
                text: accepted for interface compatibility, currently unused.

            Returns:
                The score rounded to two decimals and limited to 0..100, or 0.0
                when there are no sentences or words, or when any error occurs
                (the error is printed, not raised).
            """
            try:
                # Gather the raw counts
                doc_sentences = self.stanza_doc.sentences
                sentence_total = len(doc_sentences)

                tokens = []
                for sent in doc_sentences:
                    for word in sent.words:
                        tokens.append(word.text)
                token_total = len(tokens)

                if sentence_total == 0 or token_total == 0:
                    return 0.0

                # Syllables over the whole document
                syllable_total = 0
                for token in tokens:
                    syllable_total += count_syllables_ru(token)

                avg_sentence_length = token_total / sentence_total
                avg_syllables_per_word = syllable_total / token_total

                # Apply the formula
                raw_score = 206.835 - 1.52 * avg_sentence_length - 65.14 * avg_syllables_per_word

                # Clamp to the 0-100 range
                capped = min(100, round(raw_score, 2))
                return max(0, capped)

            except Exception as e:
                print(f"Ошибка при обработке текста: {e}")
                return 0.0

        flesch_kincaid_index = flesch_kincaid_russian(self.text)


        # Publish the syntactic and readability measurements computed above.
        # Both ratios over self.words fall back to 0 for an empty word list, so
        # a text without words no longer raises ZeroDivisionError here.
        word_count = len(self.words)
        self.features.update({
            'mean_sentence_length': sum(sentence_lengths)/len(sentence_lengths) if sentence_lengths else 0,
            # Tokens that are either tagged SCONJ or attached with the `mark` relation
            'subordinate_clauses': sum(1 for token in self.spacy_doc if token.dep_ == 'mark' or token.pos_ == 'SCONJ'),
            # Share of distinct word forms, expressed in percent
            'type-token ratio': len(set(self.words)) / word_count * 100 if word_count else 0,
            # Mean word length in characters
            'word length': sum(len(word) for word in self.words) / word_count if word_count else 0,
            'noun_coordination': coordination_counts['NOUN'],
            'adj_coordination': coordination_counts['ADJ'],
            'verb_coordination': coordination_counts['VERB'],
            'adv_coordination': coordination_counts['ADV'],
            'other_coordination': coordination_counts['OTHER'],
            'max_tree_depth': tree_analysis['max_tree_depth'],
            'avg_np_length': tree_analysis['avg_np_length'],
            'avg_vp_length': tree_analysis['avg_vp_length'],
            'inversion_count': tree_analysis['inversion_count'],
            'ellipsis_count': tree_analysis['ellipsis_count'],
            'flesch_kincaid_index': flesch_kincaid_index
        })

    def _morphological_features(self):
        # Морфологические признаки

        self.features.update({
            'perfect_aspect': sum(1 for token in self.spacy_doc if  token.pos_ == 'VERB' and 'Aspect=Perf' in token.morph),
            'imperfect_aspect': sum(1 for token in self.spacy_doc if token.pos_ == 'VERB' and 'Aspect=Imp' in token.morph),
            'past_tense': sum(1 for token in self.spacy_doc if token.pos_ == 'VERB' and 'Tense=Past' in token.morph),
            'present_tense': sum(1 for token in self.spacy_doc if token.pos_ == 'VERB' and 'Tense=Pres' in token.morph),
            'fut_tense': sum(1 for token in self.spacy_doc if token.pos_ == 'VERB' and 'Tense=Fut' in token.morph),
            'ind_mood_verb': sum(1 for token in self.spacy_doc if token.pos_ == 'VERB' and 'Mood=Ind' in token.morph),
            'imp_mood_verb': sum(1 for token in self.spacy_doc if token.pos_ == 'VERB' and 'Mood=Imp' in token.morph),
            'cnd_mood_verb': sum(1 for token in self.spacy_doc if token.pos_ == 'AUX' and 'Mood=Cnd' in token.morph),
            'gerunds': sum(1 for token in self.spacy_doc if token.pos_ == 'VERB' and 'VerbForm=Conv' in token.morph),
            'participles': sum(1 for token in self.spacy_doc if token.pos_ == 'VERB' and 'VerbForm=Part' in token.morph),
            'infinitives': sum(1 for token in self.spacy_doc if token.pos_ == 'VERB' and 'VerbForm=Inf' in token.morph),
            'finite_verbs': sum(1 for token in self.spacy_doc if token.pos_ == 'VERB' and 'VerbForm=Fin' in token.morph),
            'passive_voice': sum(1 for token in self.spacy_doc if token.pos_ == 'VERB' and 'Voice=Pass' in token.morph),
            'active_voice': sum(1 for token in self.spacy_doc if token.pos_ == 'VERB' and 'Voice=Act' in token.morph),
            'middle_voice': sum(1 for token in self.spacy_doc if token.pos_ == 'VERB' and 'Voice=Mid' in token.morph),
            'neg_polarity': sum(1 for token in self.spacy_doc if token.pos_ == 'PART' and 'Polarity=Neg' in token.morph),
            'first_pers_verb_sing': sum(1 for token in self.spacy_doc if token.pos_ == 'VERB' and ('Number=Sing' and 'Person=First') in token.morph),
            'second_pers_verb_sing': sum(1 for token in self.spacy_doc if token.pos_ == 'VERB' and ('Number=Sing' and 'Person=Second') in token.morph),
            'third_pers_verb_sing': sum(1 for token in self.spacy_doc if token.pos_ == 'VERB' and ('Number=Sing' and 'Person=Third') in token.morph),
            'first_pers_verb_plur': sum(1 for token in self.spacy_doc if token.pos_ == 'VERB' and ('Number=Plur' and 'Person=First') in token.morph),
            'second_pers_verb_plur': sum(1 for token in self.spacy_doc if token.pos_ == 'VERB' and ('Number=Plur' and 'Person=Second') in token.morph),
            'third_pers_verb_plur': sum(1 for token in self.spacy_doc if token.pos_ == 'VERB' and ('Number=Plur' and 'Person=Third') in token.morph),
            'trans_verb': sum(
                              1
                              for token in self.spacy_doc
                              if token.pos_ == "VERB"
                              if any('tran' in parse.tag for parse in morph.parse(token.text))
                              ),
            'intr_verb': sum(
                              1
                              for token in self.spacy_doc
                              if token.pos_ == "VERB"
                              if any('intr' in parse.tag for parse in morph.parse(token.text))
                              ),
            'not_inv_verb': sum(
                              1
                              for token in self.spacy_doc
                              if token.pos_ == "VERB"
                              if any('excl' in parse.tag for parse in morph.parse(token.text))
                              ),
            'sing_noun': sum(1 for token in self.spacy_doc if token.pos_ == 'NOUN' and 'Number=Sing' in token.morph),
            'plur_noun': sum(1 for token in self.spacy_doc if token.pos_ == 'NOUN' and 'Number=Plur' in token.morph),
            'plr_tantum_noun': sum(
                                    1
                                    for token in self.spacy_doc
                                    if token.pos_ == "NOUN"
                                    if any('Pltm' in parse.tag for parse in morph.parse(token.text))
                                   ),
            'sing_tantum_noun': sum(
                                    1
                                    for token in self.spacy_doc
                                    if token.pos_ == "NOUN"
                                    if any('Sgtm' in parse.tag for parse in morph.parse(token.text))
                                   ),
            'noun_fem': sum(1 for token in self.spacy_doc if token.pos_ == 'NOUN' and 'Gender=Fem' in token.morph),
            'noun_masc': sum(1 for token in self.spacy_doc if token.pos_ == 'NOUN' and 'Gender=Masc' in token.morph),
            'noun_neut': sum(1 for token in self.spacy_doc if token.pos_ == 'NOUN' and 'Gender=Neut' in token.morph),
            'noun_case_nom': sum(1 for token in self.spacy_doc if token.pos_ == 'NOUN' and 'Case=Nom' in token.morph),
            'noun_case_gen': sum(1 for token in self.spacy_doc if token.pos_ == 'NOUN' and 'Case=Gen' in token.morph),
            'noun_case_dat': sum(1 for token in self.spacy_doc if token.pos_ == 'NOUN' and 'Case=Dat' in token.morph),
            'noun_case_acc': sum(1 for token in self.spacy_doc if token.pos_ == 'NOUN' and 'Case=Acc' in token.morph),
            'noun_case_loc': sum(1 for token in self.spacy_doc if token.pos_ == 'NOUN' and 'Case=Loc' in token.morph),
            'noun_case_ins': sum(1 for token in self.spacy_doc if token.pos_ == 'NOUN' and 'Case=Ins' in token.morph),
            'noun_case_voc': sum(
                                    1
                                    for token in self.spacy_doc
                                    if token.pos_ == "NOUN"
                                    if any('voct' in parse.tag for parse in morph.parse(token.text))
                                ),
            'fixed_noun': sum(
                                    1
                                    for token in self.spacy_doc
                                    if token.pos_ == "NOUN"
                                    if any('Fixd' in parse.tag for parse in morph.parse(token.text))
                                ),
            'sing_adj': sum(1 for token in self.spacy_doc if token.pos_ == 'ADJ' and 'Number=Sing' in token.morph),
            'plur_adj': sum(1 for token in self.spacy_doc if token.pos_ == 'ADJ' and 'Number=Plur' in token.morph),
            'adj_fem': sum(1 for token in self.spacy_doc if token.pos_ == 'ADJ' and 'Gender=Fem' in token.morph),
            'adj_masc': sum(1 for token in self.spacy_doc if token.pos_ == 'ADJ' and 'Gender=Masc' in token.morph),
            'adj_neut': sum(1 for token in self.spacy_doc if token.pos_ == 'ADJ' and 'Gender=Neut' in token.morph),
            'adj_case_nom': sum(1 for token in self.spacy_doc if token.pos_ == 'ADJ' and 'Case=Nom' in token.morph),
            'adj_case_gen': sum(1 for token in self.spacy_doc if token.pos_ == 'ADJ' and 'Case=Gen' in token.morph),
            'adj_case_dat': sum(1 for token in self.spacy_doc if token.pos_ == 'ADJ' and 'Case=Dat' in token.morph),
            'adj_case_acc': sum(1 for token in self.spacy_doc if token.pos_ == 'ADJ' and 'Case=Acc' in token.morph),
            'adj_case_loc': sum(1 for token in self.spacy_doc if token.pos_ == 'ADJ' and 'Case=Loc' in token.morph),
            'adj_case_ins': sum(1 for token in self.spacy_doc if token.pos_ == 'ADJ' and 'Case=Ins' in token.morph),
            'adj_case_voc': sum(
                                    1
                                    for token in self.spacy_doc
                                    if token.pos_ == "ADJ"
                                    if any('voct' in parse.tag for parse in morph.parse(token.text))
                                ),
            'adj_degree_pos': sum(1 for token in self.spacy_doc if token.pos_ == 'ADJ' and 'Degree=Pos' in token.morph),
            'adj_degree_comp': sum(1 for token in self.spacy_doc if token.pos_ == 'ADJ' and 'Degree=Cmp' in token.morph),
            'adj_degree_super': sum(1
                                    for token in self.spacy_doc
                                    if token.pos_ == 'ADJ'
                                    if any('Supr' in parse.tag for parse in morph.parse(token.text))
                                  ),
            'full_adj': sum(1 for token in self.spacy_doc if any('ADJF' in parse.tag for parse in morph.parse(token.text))),
            'shrot_adj': sum(1 for token in self.spacy_doc if any('ADJS' in parse.tag for parse in morph.parse(token.text)) and 'StyleVariant=Short' in token.morph),
            'adv_degree_pos': sum(1 for token in self.spacy_doc if token.pos_ == 'ADV' and 'Degree=Pos' in token.morph),
            'adv_degree_comp': sum(1 for token in self.spacy_doc if token.pos_ == 'ADV' and 'Degree=Cmp' in token.morph),
            'quant_num': sum(1 for token in self.spacy_doc if token.pos_ == 'NUM'),
            'anum_num': sum(1
                                    for token in self.spacy_doc
                                    if token.pos_ == 'ADJ'
                                    if any('Anum' in parse.tag for parse in morph.parse(token.text))
                                  ),
            'dim_nouns': sum(1 for t in self.spacy_doc  if t.pos_ == 'NOUN' and any(p.search(t.lemma_.lower()) for p in dim_patterns)),
            'dim_adj': sum(1 for t in self.spacy_doc  if t.pos_ == 'ADJ' and any(p.search(t.lemma_.lower()) for p in dim_patterns))
        })

In [None]:
total_rows = len(df_ling)  # total number of rows
processed_rows = 0  # rows handled so far, including skipped and failed ones
features_list = []

# Analyse every text. Each row contributes exactly one dict, so features_list
# stays positionally aligned with df_ling.
for text in df_ling['text']:
    features = {}  # stays empty for missing/blank texts and failed analyses

    # Only non-blank strings are analysed. NaN/None and any other non-string
    # value become an empty row instead of crashing on .strip().
    if isinstance(text, str) and text.strip():
        try:
            features = RussianTextAnalyzer(text).analyze()
        except Exception as e:
            # Best effort: report the failure and continue with an empty row
            print(f"Ошибка при обработке текста: {e}")

    features_list.append(features)
    processed_rows += 1

    # Progress report every 100 rows; skipped rows are reported as well
    if processed_rows % 100 == 0:
        percentage_complete = (processed_rows / total_rows) * 100
        print(f"Обработано строк: {processed_rows}/{total_rows} ({percentage_complete:.2f}%)")


# Temporary frame with the results; every feature column gets the "_abs" suffix
temp_df = pd.DataFrame(features_list).add_suffix('_abs')

# Drop feature columns that already exist, so re-running the cell does not
# produce duplicated column names
existing_cols = df_ling.columns.intersection(temp_df.columns)
df_ling = df_ling.drop(columns=existing_cols)

# Join side by side; both indexes are reset so rows are matched by position
df_ling = pd.concat([df_ling.reset_index(drop=True),
                    temp_df.reset_index(drop=True)], axis=1)

print(f"Всего обработано строк: {processed_rows}")

# Save the enriched frame to a CSV file
df_ling.to_csv('output_with_params_prozhito.csv', index=False, encoding='utf-8')

print(f"DataFrame сохранен в файл: output_with_params_prozhito.csv")




In [None]:
# Write df_ling to output_with_params.csv (UTF-8, index column omitted).
df_ling.to_csv('output_with_params.csv', index=False, encoding='utf-8')


In [None]:
# Print a summary of df_ling: its columns, non-null counts and dtypes.
df_ling.info()