In [7]:
import requests, csv, time, json
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import warnings

# Silence noisy warnings for the rest of the notebook.
# NOTE(review): blanket-ignoring FutureWarning also hides deprecation notices
# from every other library; consider narrowing it with `module=` once the
# offending call sites are known.
warnings.filterwarnings('ignore', category=FutureWarning)

# NOTE(review): newer pandas releases (3.x, copy-on-write) appear to no longer
# ship SettingWithCopyWarning - confirm. Looking the class up defensively keeps
# this import cell runnable whether or not the installed pandas defines it.
SWC_WARNING = getattr(pd.errors, 'SettingWithCopyWarning', None)
if SWC_WARNING is not None:
    warnings.filterwarnings('ignore', category=SWC_WARNING)

try:
    pd.set_option('mode.chained_assignment', None)
except KeyError:
    # pandas raises OptionError (a KeyError subclass) for unknown options;
    # if the option is gone there is nothing left to switch off.
    pass

# Парсинг

In [8]:
# OpenAlex "works" endpoint queried by the download loop.
URL = 'https://api.openalex.org/works'

# Individual filter clauses; they are sent as one comma-separated string.
# NOTE(review): concept C41008148 is presumably "Computer science" (the ARFF
# relation written later in this notebook is named that way) - confirm.
_FILTER_CLAUSES = (
    'concepts.id:https://openalex.org/C41008148',
    'from_publication_date:1980-01-01',
)

# Query parameters for the first page: 200 works per page, cursor '*'.
PARAMS = dict(
    filter=','.join(_FILTER_CLAUSES),
    per_page=200,
    cursor='*',
)

# Column names of data.tsv, in the order the download loop writes each row.
FIELDS = [
    # publication year and title size
    'Year_of_Publication', 'Title_Length_in_Characters', 'Title_Word_Count',
    # authorship
    'Number_of_Authors', 'Number_of_Institutions_of_First_Author',
    # citation graph
    'Number_of_Citations', 'Number_of_References',
    # descriptive attributes
    'Type_of_Publication', 'Language_of_Publication', 'Primary_Concept_of_Publication',
    # open access
    'Is_Open_Access', 'Open_Access_Type',
    # derived counts
    'Abstract_Word_Count', 'Concepts_Count', 'Related_Works_Count',
    'Locations_Count', 'OA_Locations_Count',
    'Has_DOI', 'Institutions_All_Authors_Count', 'Citations_Per_Year',
]

MAX_ROWS = 500            # stop once this many works have been written
REQUEST_PAUSE_SEC = 1.5   # delay between consecutive page requests


def _work_to_row(work, current_year):
    """Flatten one work record from the API response into a TSV row.

    Args:
        work: dict taken from the ``results`` list of a response page.
        current_year: calendar year used for the citations-per-year estimate.

    Returns:
        A list of 20 values in the same order as the FIELDS column names.
    """
    authorships = work.get('authorships') or []
    first_author_insts = (authorships[0].get('institutions') or []) if authorships else []
    open_access = work.get('open_access') or {}
    title = work.get('title') or ''
    concepts = work.get('concepts') or []

    # Presumably the abstract is an inverted index {word: [positions]}; the
    # word count is taken as the total number of stored positions.
    abstract_index = work.get('abstract_inverted_index') or {}
    if isinstance(abstract_index, dict):
        abstract_words = sum(len(positions) for positions in abstract_index.values())
    else:
        abstract_words = 0

    cited_by = work.get('cited_by_count')
    pub_year = work.get('publication_year')
    # Count the publication year itself, and never divide by less than 1.
    years_since = max(1, current_year - (pub_year or current_year) + 1)

    return [
        pub_year,
        len(title),
        len(title.split()),
        len(authorships),
        len(first_author_insts),
        cited_by,
        work.get('referenced_works_count'),
        work.get('type'),
        work.get('language'),
        concepts[0].get('display_name') if concepts else None,
        open_access.get('is_oa'),
        open_access.get('oa_status'),
        abstract_words,
        len(concepts),
        len(work.get('related_works') or []),
        work.get('locations_count') or 0,
        sum(1 for loc in (work.get('locations') or []) if loc and loc.get('is_oa')),
        1 if work.get('doi') else 0,
        sum(len(a.get('institutions') or []) for a in authorships),
        (cited_by or 0) / years_since,
    ]


# Page through the API on a private copy of the query so the module-level
# PARAMS keeps describing the first page.
params = dict(PARAMS, cursor='*')

with open('data.tsv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f, delimiter='\t')
    writer.writerow(FIELDS)
    total_rows = 0
    limit_reached = False
    current_year = time.gmtime().tm_year

    while total_rows < MAX_ROWS:
        r = requests.get(URL, params=params, timeout=60)
        # Fail with the HTTP error itself rather than a KeyError on a
        # payload that has no 'results' key.
        r.raise_for_status()
        data = r.json()
        results = data.get('results') or []

        for w in results:
            writer.writerow(_work_to_row(w, current_year))
            total_rows += 1
            if total_rows >= MAX_ROWS:
                limit_reached = True
                break

        print(f'Собрано {total_rows} статей')

        # A missing cursor or an empty page is treated as the end of the
        # result set; the latter also prevents looping forever on empty pages.
        cursor = (data.get('meta') or {}).get('next_cursor')
        if limit_reached or not results or not cursor:
            break
        params['cursor'] = cursor

        # Pause only when another request is actually going to be made.
        time.sleep(REQUEST_PAUSE_SEC)

print(f'Выгрузка завершена. Всего статей: {total_rows}')

Собрано 200 статей
Собрано 400 статей
Собрано 400 статей
Собрано 500 статей
Собрано 500 статей
Выгрузка завершена. Всего статей: 500
Выгрузка завершена. Всего статей: 500


# Типизация

In [9]:
# Files read and written by the typing step.
INPUT_FILE, OUTPUT_JSON, OUTPUT_ARFF = "data.tsv", "data.json", "data.arff"

# Shorthand for the four logical column types used in TYPE_MAP.
_INT, _NUM, _CAT, _TEXT = "integer", "numeric", "category", "text"

# Logical type of every TSV column, keyed by column name.
TYPE_MAP = dict([
    ("Year_of_Publication", _INT),
    ("Title_Length_in_Characters", _INT),
    ("Title_Word_Count", _INT),
    ("Number_of_Authors", _INT),
    ("Number_of_Institutions_of_First_Author", _INT),
    ("Number_of_Citations", _INT),
    ("Number_of_References", _INT),
    ("Type_of_Publication", _CAT),
    ("Language_of_Publication", _CAT),
    ("Primary_Concept_of_Publication", _TEXT),
    ("Is_Open_Access", _CAT),
    ("Open_Access_Type", _CAT),
    ("Abstract_Word_Count", _INT),
    ("Concepts_Count", _INT),
    ("Related_Works_Count", _INT),
    ("Locations_Count", _INT),
    ("OA_Locations_Count", _INT),
    ("Has_DOI", _INT),
    ("Institutions_All_Authors_Count", _INT),
    ("Citations_Per_Year", _NUM),
])

def _coerce_value(raw_value, typ):
    """Convert one raw TSV cell into the value stored in the typed dataset.

    Args:
        raw_value: the cell as returned by csv.DictReader (a string, or None
            when the row is shorter than the header).
        typ: logical column type: "integer", "numeric", "category" or "text".

    Returns:
        None for empty/missing cells and for numeric cells that cannot be
        parsed; an int or float for numeric types; the stripped string
        otherwise.
    """
    value = raw_value.strip() if isinstance(raw_value, str) else raw_value
    if value == "":
        value = None
    if value is None or typ not in ("integer", "numeric"):
        return value
    try:
        number = float(value)
        # Integers go through float first so that cells like "3.0" still parse.
        return int(number) if typ == "integer" else number
    except (TypeError, ValueError, OverflowError):
        # Unparseable, NaN or infinite cells are recorded as missing.
        return None


data_list = []
# newline="" lets the csv module handle line endings inside quoted fields.
with open(INPUT_FILE, encoding="utf-8", newline="") as f:
    reader = csv.DictReader(f, delimiter="\t")

    # Only header columns that have a declared type are carried forward.
    header_fields = reader.fieldnames or []
    fields = [(name, TYPE_MAP[name]) for name in header_fields if name in TYPE_MAP]
    unique_categories = {name: set() for name, t in fields if t == "category"}

    for row in reader:
        item = {name: _coerce_value(row.get(name, ""), typ) for name, typ in fields}
        # Collect the distinct non-missing values of every category column.
        for name, seen in unique_categories.items():
            if item[name] is not None:
                seen.add(str(item[name]))
        data_list.append(item)

# One descriptor per column; category columns also list their sorted values.
header = [
    {
        "feature_name": name,
        "type": typ,
        "values": sorted(unique_categories.get(name, set())),
    }
    if typ == "category"
    else {"feature_name": name, "type": typ}
    for name, typ in fields
]

# Persist the descriptors together with the typed records.
output_json = {"header": header, "data": data_list}
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
    json.dump(output_json, f, ensure_ascii=False, indent=2)
print(f"JSON сохранён: {len(data_list)} записей")

# Characters that force a nominal value to be quoted in the ARFF output.
_ARFF_SPECIAL_CHARS = frozenset(" \t\r\n,{}%'\"\\")


def _arff_quote(value):
    """Return ``value`` as a double-quoted ARFF string with escapes applied.

    Backslashes, double quotes and line-breaking/tab characters are
    backslash-escaped so the value cannot terminate the quoted string or
    break the one-record-per-line layout.
    """
    text = str(value)
    for raw, escaped in (
        ("\\", "\\\\"),  # must be first so later escapes are not doubled
        ('"', '\\"'),
        ("\n", "\\n"),
        ("\r", "\\r"),
        ("\t", "\\t"),
    ):
        text = text.replace(raw, escaped)
    return f'"{text}"'


def _arff_nominal(value):
    """Return a nominal value, quoted only when it could be misparsed.

    Plain tokens are written unchanged; values containing separators or
    quote characters, and a literal "?" (the missing-value marker), are
    quoted.
    """
    text = str(value)
    if text == "?" or any(ch in _ARFF_SPECIAL_CHARS for ch in text):
        return _arff_quote(text)
    return text


with open(OUTPUT_ARFF, "w", encoding="utf-8") as f:
    f.write("@RELATION OpenAlex_Computer_Science\n\n")

    # Attribute declarations, in column order.
    for name, typ in fields:
        if typ in ("integer", "numeric"):
            f.write(f"@ATTRIBUTE {name} NUMERIC\n")
        elif typ == "text":
            f.write(f"@ATTRIBUTE {name} STRING\n")
        elif typ == "category":
            vals = ",".join(
                _arff_nominal(v) for v in sorted(unique_categories.get(name, set()))
            )
            f.write(f"@ATTRIBUTE {name} {{{vals}}}\n")

    # Data section: one comma-separated record per line, "?" for missing.
    f.write("\n@DATA\n")
    for row in data_list:
        row_str = []
        for name, typ in fields:
            val = row.get(name)
            if val is None or val == "":
                cell = "?"
            elif typ == "text":
                cell = _arff_quote(val)
            elif typ == "category":
                # Same quoting as in the declaration so values match.
                cell = _arff_nominal(val)
            else:
                cell = str(val)
            row_str.append(cell)
        f.write(",".join(row_str) + "\n")
print(f"ARFF сохранён: {len(data_list)} записей")


JSON сохранён: 500 записей
ARFF сохранён: 500 записей


# Финальный CSV

In [10]:
# Input/output files and the label column for the preprocessing step.
INPUT_JSON, OUTPUT_CSV = 'data.json', 'data.csv'
TARGET_COLUMN = 'Primary_Concept_of_Publication'

with open(INPUT_JSON, encoding='utf-8') as source:
    dataset = json.load(source)

df = pd.DataFrame(dataset['data'])

# Column groups are taken from the "header" section of the JSON dataset.
num_cols, cat_cols = [], []
for spec in dataset['header']:
    if spec['type'] in ('integer', 'numeric'):
        num_cols.append(spec['feature_name'])
    elif spec['type'] == 'category':
        cat_cols.append(spec['feature_name'])
date_cols = ['Full_Publication_Date_YYYY_MM_DD']

# The target is never treated as a feature to be encoded.
if TARGET_COLUMN in cat_cols:
    cat_cols.remove(TARGET_COLUMN)

# Trim string cells and turn empty strings into missing values.
for name in [TARGET_COLUMN, *cat_cols]:
    if name not in df.columns:
        continue
    trimmed = df[name].apply(lambda cell: cell.strip() if isinstance(cell, str) else cell)
    df[name] = trimmed.replace('', pd.NA)

# Numeric columns: coerce to numbers, then fill gaps with the column median.
for name in num_cols:
    as_number = pd.to_numeric(df[name], errors='coerce')
    df[name] = as_number.fillna(as_number.median())

# Category columns: fill gaps with the most frequent value, or with a
# placeholder when the whole column is missing.
for name in tuple(cat_cols):
    column = df[name]
    if column.isnull().all():
        df[name] = column.fillna('__missing__')
    else:
        df[name] = column.fillna(column.mode(dropna=True)[0])

# The open-access flag becomes a 0/1 integer instead of being one-hot encoded.
if 'Is_Open_Access' in df.columns:
    oa_flag = df['Is_Open_Access'].map({True: 1, False: 0, 'True': 1, 'False': 0, 1: 1, 0: 0})
    if oa_flag.isnull().all():
        df['Is_Open_Access'] = 0
    else:
        df['Is_Open_Access'] = oa_flag.fillna(oa_flag.mode(dropna=True)[0])
    df['Is_Open_Access'] = df['Is_Open_Access'].astype(int)
    if 'Is_Open_Access' in cat_cols:
        cat_cols.remove('Is_Open_Access')

# One-hot encode the remaining category columns.
df = pd.get_dummies(df, columns=cat_cols, drop_first=False, dtype=int)

# Drop columns whose name ends with "_" and the (optional) date column.
cols_with_trailing_underscore = [label for label in df.columns if label.endswith('_')]
df = df.drop(columns=cols_with_trailing_underscore + date_cols, errors='ignore')

# Rescale every numeric column to the [0, 1] range.
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

df.to_csv(OUTPUT_CSV, index=False, encoding='utf-8')
print(f'Предобработка завершена. Данные сохранены в {OUTPUT_CSV}')

Предобработка завершена. Данные сохранены в data.csv
