# In [1]:
# --- Run configuration ---

# CSV file the documents are read from
input_file = "political-regimes-data-text-full.csv"

# Name of the column in the input file that holds the document text
text_column = "cleaned_text"

# CSV files written at the end of the run:
#   - per-entity totals
#   - document/entity pairs
output_file_words = "words.csv"
output_file_pairs = "words-and-documents.csv"

# spaCy entity labels that are kept; entities with any other label are ignored
allowed_types = ['EVENT', 'GPE', 'LOC', 'NORP', 'PERSON', 'ORG']

# Domain keyword lists, grouped by the custom entity type they are reported under
custom_entities = {
    'POL REGIMES': [
        "democracy", "totalitarianism", "totalitarian", "post-totalitarianism",
        "authoritarian", "autocracy", "authoritarianism", "closed autocracy",
        "electoral autocracy", "hybrid regime", "presidentialism", "polyarchy",
        "anocracy", "illiberal", "sultanism", "electoral democracy",
        "liberal democracy", "personalist", "façade", "democratic facade",
        "full democracy", "flawed democracy", "oligarchy", "military",
        "party-based", "competitive authoritarianism", "Junta", "democratization",
        "patron-client", "cooptation", "populist", "corporatism",
        "racial democracy", "ethnic democracy", "apartheid", "repression",
        "bossism", "mass mobilization", "corruption", "substantive democracy",
        "Separation of powers", "independent judiciary", "charismatic", "ideology",
        "Representation", "Equality", "Participation", "Liberty",
        "Transition", "delegative democracy", "Bureaucratic Authoritarianism",
        "Modernization", "elite", "Cleavage", "pluralism", "tyranny",
        "Legitimation", "autocratization",
    ],
    'POL INDEXES': [
        "Polity", "Freedom House", "The Economist", "V-Dem", "Bertelsmann",
        "Global State of Democracy", "Regimes of the World", "RoW",
    ],
    'POL SCHOOLS': [
        "Behavioralism", "Behaviorism", "Post-behavioralism", "Positivism",
        "Institutionalism", "New Institutionalism", "Critical theory",
        "Constructivism", "public choice", "Rational choice", "Realism",
        "Rationalism", "Political Economy", "Marxism", "Transitology",
        "sociology", "Geopolitics", "Liberalism",
    ],
    'POL METHODS': [
        "Quantitative", "Qualitative", "Case study", "Game Theory", "Historical",
        "mixed-methods", "Cross-National", "regression", "comparative",
        "network", "Statistical",
    ],
}

# Lower-case every keyword so it can be compared against lower-cased text
custom_entities = {
    label: list(map(str.lower, terms))
    for label, terms in custom_entities.items()
}

# Install (if needed)
#!pip install pandas
#!pip install spacy

# Import
import csv
import re

import pandas as pd
import spacy
from spacy.lang.en import English

# Load the small English spaCy pipeline used for named-entity recognition
NER = spacy.load("en_core_web_sm")

# Set spaCy's per-document character limit to 1.5 million characters
NER.max_length = 1_500_000

# Per-document entity index, filled in by the processing loop:
# row index -> {entity signature: entity record}
doc_index = {}

# Load the input CSV; every column is read as dtype=object
doc_df = pd.read_csv(
    input_file,
    encoding='utf8',
    quotechar='"',
    doublequote=True,
    quoting=csv.QUOTE_NONNUMERIC,
    dtype=object,
    on_bad_lines='skip',  # malformed lines are dropped instead of raising
)

# Text processing: build the per-document entity index.
#
# For every row of doc_df this fills doc_index[index] with a dict that maps an
# entity signature ("<lower-cased text>-<type>") to a record with the keys
# 'NE-text', 'NE-type' and 'NE-count' (occurrences within that document).

# Compile one whole-word pattern per custom keyword once, outside the document
# loop. The look-arounds stop a keyword from matching inside a longer word:
# plain substring search counted "row" in "growth" and "military" in
# "paramilitary". Inflected forms (e.g. "elites") are therefore NOT matched;
# add them to custom_entities explicitly if they should be counted.
keyword_patterns = {
    ctype: [
        (keyword, re.compile(r"(?<!\w)" + re.escape(keyword) + r"(?!\w)"))
        for keyword in keywords
    ]
    for ctype, keywords in custom_entities.items()
}

for index, row in doc_df.iterrows():
    raw_text = row[text_column]
    if not isinstance(raw_text, str):
        # Missing text cell (e.g. NaN): record an empty entry so that later
        # passes can still look this row up in doc_index.
        doc_index[index] = {}
        continue

    text = raw_text.lower()  # lower case, so keyword matching is case-insensitive
    entities = {}

    # Named entities found by spaCy, restricted to the allowed labels.
    # NOTE(review): NER runs on the lower-cased text; spaCy's statistical
    # models are trained on cased text, so running NER on raw_text instead may
    # give better results — confirm before changing.
    nertxt = NER(text)
    for ne in nertxt.ents:
        if ne.label_ in allowed_types:
            entsign = ne.text.lower() + '-' + ne.label_
            if entsign not in entities:
                entities[entsign] = {'NE-text': ne.text, 'NE-type': ne.label_, 'NE-count': 0}
            entities[entsign]['NE-count'] += 1

    # Custom entities: count every whole-word occurrence of each keyword, so
    # that 'NE-count' means the same thing as it does for spaCy entities.
    for ctype, patterns in keyword_patterns.items():
        for keyword, pattern in patterns:
            occurrences = len(pattern.findall(text))
            if occurrences:
                entsign = keyword + '-' + ctype
                entities[entsign] = {'NE-text': keyword, 'NE-type': ctype, 'NE-count': occurrences}

    doc_index[index] = entities

# Aggregate the per-document entities and save the results

# Document metadata without the text column; these columns are copied into
# every document/entity pair row.
doc_notxt_df = doc_df.drop(columns=[text_column])

# Per-entity totals across all documents: occurrences and number of documents
word_index = {}
for index, row in doc_notxt_df.iterrows():
    for entsign, ne in doc_index[index].items():
        totals = word_index.setdefault(
            entsign,
            {'text': ne['NE-text'], 'type': ne['NE-type'], 'count-occurences-total': 0, 'count-documents': 0},
        )
        totals['count-occurences-total'] += ne['NE-count']
        totals['count-documents'] += 1

word_df = pd.DataFrame(word_index.values())

# One row per (document, entity) pair, keeping only entities that occur in
# more than one document; the entity record is merged into the metadata.
pair_list = [
    {**row, **doc_index[index][entsign]}
    for index, row in doc_notxt_df.iterrows()
    for entsign in doc_index[index]
    if word_index[entsign]['count-documents'] > 1
]
pair_df = pd.DataFrame(pair_list)

# Write both tables as UTF-8 CSV without the DataFrame index
word_df.to_csv(output_file_words, index=False, encoding='utf-8')
pair_df.to_csv(output_file_pairs, index=False, encoding='utf-8')
