In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
import numpy as np
import math

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
!pip install lexical_diversity

Collecting lexical_diversity
  Downloading lexical_diversity-0.1.1-py3-none-any.whl.metadata (4.1 kB)
Downloading lexical_diversity-0.1.1-py3-none-any.whl (117 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/117.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m112.6/117.8 kB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.8/117.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lexical_diversity
Successfully installed lexical_diversity-0.1.1


In [None]:
from lexical_diversity import lex_div as ld

In [None]:

def calculate_metrics(text):
    """Compute lexical-diversity metrics for one Russian-language text.

    The text is lower-cased, stripped of punctuation and tokenized with
    NLTK. Tokens are then split into stop words (NLTK's Russian list) and
    content words (tokens that are neither stop words nor purely numeric).

    Args:
        text: Raw text as a string.

    Returns:
        A dict with the following keys (spelling kept as-is because other
        code looks these keys up):
            'MLTD': MTLD score from ``lexical_diversity`` computed on the
                content words, or 0 when there are none.
            'ratio_content_stop': number of content-word tokens divided by
                the number of stop-word tokens found in the text, or 0 when
                the text contains no stop words.
            'entropy': Shannon entropy (base 2) of the content-word
                frequency distribution.
            'hershey_diversity': unique content words divided by total
                content words (0 for an empty text).
    """
    # string.punctuation is ASCII-only; Russian prose also uses guillemets,
    # typographic quotes, dashes and the ellipsis character, which would
    # otherwise survive as standalone "words" and distort every metric.
    punctuation = string.punctuation + '«»„“”‘’—–…'
    text_no_punct = text.lower().translate(str.maketrans('', '', punctuation))

    # Tokenize the cleaned text
    words = word_tokenize(text_no_punct)

    stop_words = set(stopwords.words('russian'))

    # Stop-word tokens that actually occur in this text. The ratio below
    # must use this count, not the size of the stop-word vocabulary.
    stop_tokens = [word for word in words if word in stop_words]

    # Content words: everything that is neither a stop word nor a number
    content_words = [
        word for word in words
        if word not in stop_words and not word.isdigit()
    ]

    def entropy(tokens):
        """Shannon entropy (bits) of the token frequency distribution."""
        total = len(tokens)
        if total == 0:
            return 0
        counts = Counter(tokens)
        return -sum(
            (count / total) * math.log2(count / total)
            for count in counts.values()
        )

    def hershey_diversity(tokens):
        """Share of distinct tokens among all tokens (type-token ratio)."""
        total = len(tokens)
        return len(set(tokens)) / total if total > 0 else 0

    metrics = {}

    # 1. MTLD (stored under the historical key 'MLTD')
    metrics['MLTD'] = ld.mtld(content_words) if content_words else 0

    # 2. Ratio of content words to stop words, both counted in this text
    metrics['ratio_content_stop'] = (
        len(content_words) / len(stop_tokens) if stop_tokens else 0
    )

    # 3. Text diversity measures
    metrics['entropy'] = entropy(content_words)
    metrics['hershey_diversity'] = hershey_diversity(content_words)

    return metrics

def print_report(index, metrics):
    """Print the metrics of a single text as a small labelled report.

    Args:
        index: Number shown in the report heading.
        metrics: Mapping of metric name to value; entries are printed in
            the mapping's iteration order, one ``name: value`` per line.

    The report is followed by one blank line.
    """
    report_lines = [f'Текст {index}.', 'Метрики текста:']
    report_lines.extend(f'{metric}: {value}' for metric, value in metrics.items())
    # The trailing empty entry yields the blank separator line after the report.
    report_lines.append('')
    print('\n'.join(report_lines))

def calculate_group_metrics(metrics_list, group_indices):
    """Average each metric over the texts that belong to one group.

    Args:
        metrics_list: List of per-text metric dicts.
        group_indices: 0-based positions in ``metrics_list`` that make up
            the group. Positions at or beyond the end of the list are
            skipped.

    Returns:
        A dict with the keys 'MLTD', 'ratio_content_stop', 'entropy' and
        'hershey_diversity', each mapped to the mean of the values found
        for that key, or 0 when no value was found.
    """
    metric_names = ('MLTD', 'ratio_content_stop', 'entropy', 'hershey_diversity')

    # Collect the metric dicts of the group members that actually exist.
    members = [
        metrics_list[position]
        for position in group_indices
        if position < len(metrics_list)
    ]

    average_metrics = {}
    for name in metric_names:
        observed = [record[name] for record in members if name in record]
        if observed:
            average_metrics[name] = np.mean(observed)
        else:
            average_metrics[name] = 0
    return average_metrics

def main(path='/content/Texts.txt', encoding='windows-1251'):
    """Read the corpus file, report metrics per text and per group.

    The file is split into texts at every line that starts with "Текст".
    Metrics are computed and printed for each text, then averaged over
    three interleaved groups (texts 1, 4, 7, ...; 2, 5, 8, ...; 3, 6, 9, ...)
    covering the first 45 texts.

    Args:
        path: Location of the corpus file. Defaults to the original
            hard-coded path.
        encoding: Encoding used to decode the file.
    """
    with open(path, 'r', encoding=encoding) as f:
        text_data = f.read()

    texts = []
    current_text = ""
    for line in text_data.splitlines():
        if line.startswith("Текст"):
            # Only flush real content: blank lines before the first header
            # would otherwise become a bogus first text and shift every
            # group index by one.
            if current_text.strip():
                texts.append(current_text)
            # NOTE(review): the header line itself is kept as part of the
            # text, so its words are included in the metrics — confirm
            # this is intended.
            current_text = line
        else:
            current_text += " " + line
    # Add the last text, unless the file was empty or ended in blank lines only
    if current_text.strip():
        texts.append(current_text)

    metrics_list = []
    for number, text in enumerate(texts, start=1):
        metrics = calculate_metrics(text)
        print_report(number, metrics)
        metrics_list.append(metrics)

    # Three interleaved groups over the first 45 texts, as 0-based indices:
    # group g (0-based) holds texts g, g + 3, g + 6, ...
    group_count = 3
    text_count = 45
    for group in range(group_count):
        group_indices = list(range(group, text_count, group_count))
        average_metrics = calculate_group_metrics(metrics_list, group_indices)

        # Every heading except the first is preceded by a blank line.
        separator = '' if group == 0 else '\n'
        print(f'{separator}Средние метрики для групп {group + 1}:')
        for metric, value in average_metrics.items():
            print(f'{metric}: {value}')

# Entry point: run the full analysis when this cell (or the file as a
# script) is executed directly rather than imported.
if __name__ == "__main__":
    main()

Текст 1.
Метрики текста:
MLTD: 283.50000000000034
ratio_content_stop: 0.2980132450331126
entropy: 5.402964207440784
hershey_diversity: 0.9555555555555556

Текст 2.
Метрики текста:
MLTD: 65.83111111111113
ratio_content_stop: 0.304635761589404
entropy: 5.132257608230924
hershey_diversity: 0.8043478260869565

Текст 3.
Метрики текста:
MLTD: 69.63542903227022
ratio_content_stop: 0.7615894039735099
entropy: 5.939933829314663
hershey_diversity: 0.6347826086956522

Текст 4.
Метрики текста:
MLTD: 58.57144715447156
ratio_content_stop: 0.3443708609271523
entropy: 5.257756064128519
hershey_diversity: 0.8076923076923077

Текст 5.
Метрики текста:
MLTD: 134.45600000000002
ratio_content_stop: 0.32450331125827814
entropy: 5.410628211462147
hershey_diversity: 0.8979591836734694

Текст 6.
Метрики текста:
MLTD: 100.82978723404256
ratio_content_stop: 0.7417218543046358
entropy: 6.065074315720389
hershey_diversity: 0.7053571428571429

Текст 7.
Метрики текста:
MLTD: 196.63000000000005
ratio_content_stop: 0.3