In [1]:
import numpy as np
import pandas as pd

# Данные первой таблицы
class_data = pd.DataFrame({
    'id': [1, 2, 3, 4, 5, 6],
    'class': [0, 1, 1, 0, 0, None]  # None обозначает неизвестные классы
})

# Данные второй таблицы
similarity_data = pd.DataFrame({
    'id1': [1, 2, 1, 5, 4, 3],
    'id2': [2, 3, 6, 6, 5, 4],
    'sim': [0.8, 0.9, -0.7, -0.5, 0.3, -0.6]
})

# Функция для распространения меток
def propagate_labels_df(class_data, similarity_data, max_iter=100, tol=1e-5):
    # Копируем данные, чтобы не изменять оригинальные DataFrame
    class_data = class_data.copy()
    similarity_data = similarity_data.copy()
    
    # Заполняем неизвестные классы значением 0.5 (начальная неопределённость)
    class_data['class'] = class_data['class'].fillna(0.5)
    
    # Итеративное обновление
    for iteration in range(max_iter):
        prev_classes = class_data['class'].copy()
        max_change = 0
        
        for node in class_data['id']:
            # Пропускаем узлы с фиксированными метками
            if not pd.isna(class_data.loc[class_data['id'] == node, 'class'].iloc[0]) and \
               class_data.loc[class_data['id'] == node, 'class'].iloc[0] in [0, 1]:
                continue
            
            # Найти соседей узла и их веса
            neighbors = similarity_data[
                (similarity_data['id1'] == node) | (similarity_data['id2'] == node)
            ].copy()
            neighbors['neighbor'] = np.where(
                neighbors['id1'] == node, neighbors['id2'], neighbors['id1']
            )
            
            # Считаем взвешенное среднее классов соседей
            weighted_sum = 0
            total_weight = 0
            for _, row in neighbors.iterrows():
                neighbor = row['neighbor']
                weight = row['sim']
                neighbor_class = class_data.loc[class_data['id'] == neighbor, 'class'].iloc[0]
                weighted_sum += weight * neighbor_class
                total_weight += abs(weight)
            
            # Обновляем класс узла
            if total_weight > 0:
                new_class = weighted_sum / total_weight
            else:
                new_class = class_data.loc[class_data['id'] == node, 'class'].iloc[0]
            
            # Обновляем класс в датафрейме
            class_data.loc[class_data['id'] == node, 'class'] = new_class
            
            # Отслеживаем максимальное изменение
            max_change = max(max_change, abs(new_class - prev_classes.loc[class_data['id'] == node].iloc[0]))
        
        # Проверяем сходимость
        if max_change < tol:
            break
    
    return class_data


# Применяем функцию
resulting_classes = propagate_labels_df(class_data, similarity_data)
resulting_classes


Unnamed: 0,id,class
0,1,0.0
1,2,1.0
2,3,1.0
3,4,0.0
4,5,0.0
5,6,0.0


In [2]:
train = pd.read_csv("train.csv")
sim =  pd.read_csv("sim.csv")

In [3]:
sim.isna().sum()

id1    0
id2    0
sim    0
dtype: int64

In [4]:
train.isna().sum()

id       0
class    0
dtype: int64

In [5]:
pred_classes = {'id': np.arange(10000, 15000),
                'class': [np.nan] * 5000}
pred_classes = pd.DataFrame(pred_classes)
full = pd.concat([train, pred_classes])

In [6]:
full

Unnamed: 0,id,class
0,0,D
1,1,K
2,2,D
3,3,K
4,4,D
...,...,...
4995,14995,
4996,14996,
4997,14997,
4998,14998,


In [7]:
full.replace({'D': 0, 'K': 1})

  full.replace({'D': 0, 'K': 1})


Unnamed: 0,id,class
0,0,0.0
1,1,1.0
2,2,0.0
3,3,1.0
4,4,0.0
...,...,...
4995,14995,
4996,14996,
4997,14997,
4998,14998,


In [8]:
predict = propagate_labels_df(full.replace({'D': 0, 'K': 1}), sim, max_iter=1000)
predict

  predict = propagate_labels_df(full.replace({'D': 0, 'K': 1}), sim, max_iter=1000)


Unnamed: 0,id,class
0,0,0.000000
1,1,1.000000
2,2,0.000000
3,3,1.000000
4,4,0.000000
...,...,...
4995,14995,-0.119728
4996,14996,0.182306
4997,14997,0.047020
4998,14998,0.051158


In [64]:
pred = predict[predict['id'] >= 10000].copy()
pred['class'] = pred['class'].apply(lambda x: x > 0)
pred.replace({False: 'D', True: 'K'}).to_csv('subm3.csv', index=False)

# 2
