In [8]:
!pip install pingouin

Collecting pingouin
  Downloading pingouin-0.5.5-py3-none-any.whl.metadata (19 kB)
Collecting pandas-flavor (from pingouin)
  Downloading pandas_flavor-0.7.0-py3-none-any.whl.metadata (6.7 kB)
Downloading pingouin-0.5.5-py3-none-any.whl (204 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m204.4/204.4 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading pandas_flavor-0.7.0-py3-none-any.whl (8.4 kB)
Installing collected packages: pandas-flavor, pingouin
Successfully installed pandas-flavor-0.7.0 pingouin-0.5.5


In [2]:
import pandas as pd
import numpy as np

# Load the normalized feature matrix of each corpus into its own DataFrame.
pt_norm = pd.read_csv(
    '/kaggle/input/new-data/normalized_feature_matrix_pishu_tebe (1).csv'
)
pr_norm = pd.read_csv(
    '/kaggle/input/new-data/normalized_feature_matrix_prozhito (1).csv'
)

In [None]:
# Display the frame loaded from the "pishu_tebe" feature matrix for a visual check.
pt_norm

In [None]:
# Display the frame loaded from the "prozhito" feature matrix for a visual check.
pr_norm

In [3]:
# The columns listed here are excluded from the analysis; every other column
# of pr_norm is used as a feature.
feature_columns = [
    name
    for name in pr_norm.columns
    if name not in ('postcard_text', 'decade', 'Unnamed: 0', 'year', 'text', 'other_coordination_abs')
]
len(feature_columns)

102

In [4]:
# Replace missing values with 0 and store every feature column as float64.
for col in feature_columns:
    pr_norm[col] = pr_norm[col].fillna(0).astype('float64')

# Sanity check over the whole frame: list the columns that still contain NaN.
nan_cols = pr_norm.columns[pr_norm.isna().any(axis=0)]
print("Столбцы, содержащие NaN:", nan_cols)

Столбцы, содержащие NaN: Index([], dtype='object')


In [5]:
# Replace missing values with 0 and store every feature column as float64.
for col in feature_columns:
    pt_norm[col] = pt_norm[col].fillna(0).astype('float64')

# Sanity check over the whole frame: list the columns that still contain NaN.
nan_cols = pt_norm.columns[pt_norm.isna().any(axis=0)]
print("Столбцы, содержащие NaN:", nan_cols)

Столбцы, содержащие NaN: Index([], dtype='object')


In [6]:
# 'other_coordination_abs' is excluded from the feature list, so remove it from
# both frames. errors='ignore' keeps the cell re-runnable: on a second run the
# column is already gone and the call is a no-op instead of raising KeyError.
pt_norm = pt_norm.drop(columns='other_coordination_abs', errors='ignore')
pr_norm = pr_norm.drop(columns='other_coordination_abs', errors='ignore')

In [9]:
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests
import pingouin as pg

def _chunked_feature_means(corpus_df, features, chunk_size):
    """Return ``{feature: mean}`` for one corpus, accumulating chunk by chunk.

    Missing values are left out of both the running sum and the running count,
    so each mean is taken over the same observations the Welch t-test sees
    (it is called with ``nan_policy='omit'``). A feature without any valid
    observation (for example in an empty corpus) gets a mean of 0.0.
    """
    sums = {feature: 0.0 for feature in features}
    counts = {feature: 0 for feature in features}
    for start in range(0, len(corpus_df), chunk_size):
        chunk = corpus_df.iloc[start:start + chunk_size]
        for feature in features:
            column = chunk[feature]
            sums[feature] += column.sum()      # pandas skips NaN when summing
            counts[feature] += column.count()  # number of non-NaN values
    return {
        feature: (sums[feature] / counts[feature] if counts[feature] else 0.0)
        for feature in features
    }


def analyze_large_corpora(corpus1_df, corpus2_df, features, chunk_size=10000):
    """
    Compare two preloaded corpora feature by feature.

    For every feature the function computes the mean in each corpus, a Welch
    t-test p-value, a Benjamini-Hochberg (FDR) corrected p-value and Cohen's d.

    Parameters:
    corpus1_df, corpus2_df - preloaded DataFrames, one row per text; both must
        contain every column named in ``features``
    features - list of feature (column) names to analyze
    chunk_size - number of rows summed at a time when computing the means
        (default 10,000); must be a positive integer

    Returns:
    DataFrame with one row per feature and the columns ``feature``,
    ``mean_corpus1``, ``mean_corpus2``, ``p_value``, ``p_corrected`` and
    ``cohens_d``. Cohen's d is positive when the feature is larger in corpus 1.

    Raises:
    ValueError - if ``chunk_size`` is smaller than 1
    """
    import warnings  # local import so this cell does not depend on another cell

    if chunk_size < 1:
        # A non-positive step would either crash range() or skip every row and
        # silently report all means as 0.
        raise ValueError("chunk_size must be a positive integer")

    features = list(features)
    if not features:
        # Nothing to test: return an empty table with the usual columns.
        return pd.DataFrame(columns=['feature', 'mean_corpus1', 'mean_corpus2',
                                     'p_value', 'p_corrected', 'cohens_d'])

    # Per-corpus means
    means1 = _chunked_feature_means(corpus1_df, features, chunk_size)
    means2 = _chunked_feature_means(corpus2_df, features, chunk_size)

    # Per-feature statistics
    p_values = []
    cohens_d_values = []
    for feature in features:
        data1 = corpus1_df[feature].values
        data2 = corpus2_df[feature].values

        # Welch's t-test (no equal-variance assumption)
        try:
            _, p = ttest_ind(data1, data2, equal_var=False, nan_policy='omit')
        except Exception as exc:
            # Best effort: keep going, but make the failure visible.
            warnings.warn(
                f"t-test failed for feature {feature!r} ({exc!r}); using p = 1.0",
                RuntimeWarning,
            )
            p = 1.0

        # Cohen's d for two independent samples, delegated to pingouin
        try:
            d = pg.compute_effsize(data1, data2, eftype='cohen')
        except Exception as exc:
            warnings.warn(
                f"Cohen's d failed for feature {feature!r} ({exc!r}); using d = 0",
                RuntimeWarning,
            )
            d = 0

        p_values.append(p)
        cohens_d_values.append(d)

    # FDR correction. An undefined (NaN) p-value is treated as non-significant
    # so that the correction can run over the complete list.
    p_values = [1.0 if np.isnan(p) else p for p in p_values]
    _, p_corrected, _, _ = multipletests(p_values, alpha=0.05, method='fdr_bh')

    # Assemble the result table
    return pd.DataFrame({
        'feature': features,
        'mean_corpus1': [means1[f] for f in features],
        'mean_corpus2': [means2[f] for f in features],
        'p_value': p_values,
        'p_corrected': p_corrected,
        'cohens_d': cohens_d_values
    })

In [10]:
# Run the comparison: corpus 1 is pt_norm, corpus 2 is pr_norm.
results = analyze_large_corpora(
    corpus1_df=pt_norm,
    corpus2_df=pr_norm,
    features=feature_columns,
)


In [11]:
# Display the per-feature comparison table returned by analyze_large_corpora.
results

Unnamed: 0,feature,mean_corpus1,mean_corpus2,p_value,p_corrected,cohens_d
0,first_person_pronouns_sing_abs,0.983569,1.754239,0.000000e+00,0.000000e+00,-0.382302
1,first_person_pronouns_plur_abs,0.427901,0.644574,8.840085e-93,1.218498e-92,-0.182795
2,second_person_pronouns_sing_abs,1.621871,0.079957,0.000000e+00,0.000000e+00,0.644734
3,second_person_pronouns_plur_abs,2.659149,0.071163,0.000000e+00,0.000000e+00,0.964767
4,third_person_pronouns_masc_abs,0.165786,0.951367,0.000000e+00,0.000000e+00,-0.749120
...,...,...,...,...,...,...
97,adv_degree_comp_abs,0.147542,0.247972,7.912044e-69,1.021555e-68,-0.149769
98,quant_num_abs,2.248116,3.049591,2.525784e-79,3.435066e-79,-0.175594
99,anum_num_abs,0.285902,0.364506,5.565229e-16,6.103799e-16,-0.072526
100,dim_nouns_abs,3.783056,6.891538,0.000000e+00,0.000000e+00,-0.757918


In [12]:
# Keep the rows whose corrected p-value is below 0.05 and whose Cohen's d
# has an absolute value of at least 0.5.
significant = results.loc[
    results['p_corrected'].lt(0.05) & results['cohens_d'].abs().ge(0.5)
]

In [13]:
# Display the rows of `results` that passed the significance / effect-size filter.
significant

Unnamed: 0,feature,mean_corpus1,mean_corpus2,p_value,p_corrected,cohens_d
2,second_person_pronouns_sing_abs,1.621871,0.079957,0.0,0.0,0.644734
3,second_person_pronouns_plur_abs,2.659149,0.071163,0.0,0.0,0.964767
4,third_person_pronouns_masc_abs,0.165786,0.951367,0.0,0.0,-0.74912
7,third_person_pronouns_plur_abs,0.067739,0.372015,0.0,0.0,-0.533334
9,prepositions_abs,8.555716,11.04136,0.0,0.0,-0.57744
11,indefinite_pronouns_abs,0.032873,0.247481,0.0,0.0,-0.507974
17,adj_abstr_index_abs,2.377263,0.60873,0.0,0.0,0.560582
19,propr_name_abs,9.64815,5.727165,0.0,0.0,0.65564
25,mean_sentence_length_abs,26.322132,16.97214,0.0,0.0,0.511523
33,max_tree_depth_abs,15.418883,9.892609,0.0,0.0,0.638776


In [12]:
# Display the filtered table (`significant`).
significant

Unnamed: 0,feature,mean_corpus1,mean_corpus2,p_value,p_corrected,cohens_d
2,second_person_pronouns_sing_abs,1.621871,0.079957,0.0,0.0,0.644734
3,second_person_pronouns_plur_abs,2.659149,0.071163,0.0,0.0,0.964767
4,third_person_pronouns_masc_abs,0.165786,0.951367,0.0,0.0,-0.74912
7,third_person_pronouns_plur_abs,0.067739,0.372015,0.0,0.0,-0.533334
9,prepositions_abs,8.555716,11.04136,0.0,0.0,-0.57744
11,indefinite_pronouns_abs,0.032873,0.247481,0.0,0.0,-0.507974
17,adj_abstr_index_abs,2.377263,0.60873,0.0,0.0,0.560582
19,propr_name_abs,9.64815,5.727165,0.0,0.0,0.65564
25,mean_sentence_length_abs,26.322132,16.97214,0.0,0.0,0.511523
33,max_tree_depth_abs,15.418883,9.892609,0.0,0.0,0.638776


In [6]:
# Download the spaCy pipeline "ru_core_news_md" so that spacy.load can find it by name.
!python -m spacy download ru_core_news_md

Collecting ru-core-news-md==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_md-3.7.0/ru_core_news_md-3.7.0-py3-none-any.whl (41.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.9/41.9 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Collecting pymorphy3>=1.0.0 (from ru-core-news-md==3.7.0)
  Downloading pymorphy3-2.0.3-py3-none-any.whl.metadata (1.9 kB)
Collecting dawg2-python>=0.8.0 (from pymorphy3>=1.0.0->ru-core-news-md==3.7.0)
  Downloading dawg2_python-0.9.0-py3-none-any.whl.metadata (7.5 kB)
Collecting pymorphy3-dicts-ru (from pymorphy3>=1.0.0->ru-core-news-md==3.7.0)
  Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl.metadata (2.0 kB)
Downloading pymorphy3-2.0.3-py3-none-any.whl (53 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dawg2_python-0.9.0-py3-none-any.whl (9.3 kB)
Download

In [7]:
# Print the version of the installed CUDA compiler.
!nvcc --version
# Install the cupy-cuda12x package (imported below as `cupy`).
!pip install cupy-cuda12x

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [9]:
import cupy
import spacy
# When cupy.is_available() is true, print a notice ("GPU available")
# and call spacy.prefer_gpu().
if cupy.is_available():
    print("GPU доступен")
    spacy.prefer_gpu()

GPU доступен


In [13]:
import pandas as pd
import random

def get_random_perfect_verb_contexts(df, text_column='text', n_contexts=50, window_size=10, language_model="ru_core_news_md", seed=None):
    """
    Extract random token contexts around perfective-aspect verbs from a DataFrame.

    Every string in ``df[text_column]`` is parsed with the given spaCy pipeline.
    For each token tagged as a VERB whose morphology contains ``Aspect=Perf``,
    the tokens from ``window_size`` positions before it to ``window_size``
    positions after it (clipped to the document) are collected as one context.

    Parameters:
    df - DataFrame holding the texts
    text_column - name of the column with the raw text; non-string cells are skipped
    n_contexts - number of contexts to sample
    window_size - number of tokens kept on each side of the verb
    language_model - name of the spaCy pipeline passed to spacy.load
    seed - optional seed for a reproducible sample; with the default None the
        module-level ``random`` state is used, exactly as before

    Returns:
    A list of contexts, each a list of token strings. If fewer than
    ``n_contexts`` contexts were found, a notice is printed and all of them are
    returned in document order; otherwise ``n_contexts`` are sampled at random.
    """
    nlp = spacy.load(language_model)

    # Skip cells that are not strings (missing values and the like).
    texts = (text for text in df[text_column] if isinstance(text, str))

    all_contexts = []
    # nlp.pipe processes the texts as a batched stream, which avoids the
    # per-call overhead of running nlp() on one text at a time.
    for doc in nlp.pipe(texts):
        for token in doc:
            if token.pos_ == 'VERB' and 'Aspect=Perf' in token.morph:
                start = max(0, token.i - window_size)
                end = min(len(doc), token.i + window_size + 1)
                all_contexts.append([t.text for t in doc[start:end]])

    if len(all_contexts) < n_contexts:
        print(f"В тексте меньше, чем {n_contexts} контекстов с совершенными глаголами.")
        return all_contexts  # return every context that was found

    # A private generator is used only when a seed is given, so callers that
    # rely on random.seed(...) keep getting the same draws as before.
    sampler = random if seed is None else random.Random(seed)
    return sampler.sample(all_contexts, n_contexts)

In [10]:

# Sample 50 perfective-verb contexts from pt_norm and print them.
random_contexts = get_random_perfect_verb_contexts(df=pt_norm, n_contexts=50)
print(random_contexts)

[['!', 'Все', 'еще', 'по', 'Майски', '[', '^по', '-', 'майски', ']', 'зелено', '!', 'Было', 'только', 'очень', 'холодно', 'ночью', ',', ' ', '[', 'нрзб'], ['не', 'помещаются', 'в', 'посылку', '.', 'Мой', 'подарок', '-', 'это', 'я', 'выписала', 'тебе', 'Детскую', 'Советскую', 'Энциклопедию', 'в', '10', 'томах', '.', 'Целую', 'тебя'], ['Ленинград', '?', 'Как', 'живешь', '?', 'Целую', 'тебя', 'и', 'дочек', '.', 'Передай', 'от', 'нас', 'сердечный', 'привет', 'всем', 'своим', '.', '<', 'подпись', '>'], ['Жить', 'буду', 'у', 'Егорова', '.', 'Посылаю', 'тебе', 'это', 'письмо', 'Не', 'сердись', '!', 'Лучше', 'ска[нрзб', ']', 'все', '!', 'Целую', '.'], ['чадам', 'большим', 'и', 'маленьким', 'доброго', 'здоровья', ',', 'благополучия', '.', 'Сегодня', 'получила', 'твою', 'поздравительную', 'открытку', 'и', 'очень', 'была', 'обрадована', 'так', ',', 'как'], ['примите', ',', 'вексель', 'ему', 'выдайте', ',', 'а', 'деньги', 'вместо', 'тех', 'запишите', 'на', 'мой', 'тек', '.', 'счет', '.', 'Посл', '

In [14]:
# Sample 50 perfective-verb contexts from pr_norm and print them.
random_contexts = get_random_perfect_verb_contexts(df=pr_norm, n_contexts=50)
print(random_contexts)

[['«', 'почтальон', '»', ')', 'не', 'хочет', 'пройти', 'вниз', ',', 'чтобы', 'закрыть', 'батометры', '.', '    ', 'В', 'шесть', 'утра', 'Рудольф', 'сообщил', ',', 'что'], ['Пиковой', '»', 'же', 'публика', 'была', 'холодна', ',', 'и', 'он', 'был', 'расстроен', ',', 'хотя', 'отлично', 'знал', ',', 'что', 'кроме', 'первого', 'действия', 'все'], ['.', 'Вчера', 'были', 'оба', 'доктора', ':', 'Тихонов', 'и', 'Альтшуллер', '.', 'Прописали', 'два', 'раза', 'в', 'неделю', 'экстракт', 'крушины', '(', 'растение', ')', 'в'], ['себя', ',', 'подчинитв', 'и', 'унизив', 'других', 'людей', '.', 'Дай', 'бог', 'встретиться', 'после', 'войны', 'где', '-', 'нибудь', 'в', 'Москве', '—', 'и', 'руки'], ['—', '«', 'Были', 'там', 'всякие', '.', '»', '—', 'Он', 'снова', 'усмехнулся', '.', '—', '«', 'Им', 'другой', 'почет', ',', 'командирам', '.', 'Они'], ['на', 'полторы', '-', 'две', 'страницы', 'свой', 'отзыв', '.', 'Возможно', ',', 'добавил', 'Борис', 'Васильевич', ',', 'это', 'вызовет', 'даже', 'на', 'разгово