## 라이브러리 설치

In [None]:
# torch, torch-geometric 설치
!pip install torch
!pip install torch-geometric

In [None]:
# Mount Google Drive into the Colab filesystem so the input CSV files under
# /content/drive can be read by the cells below.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


## 데이터 생성 및 정제

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer, LabelEncoder
import torch
import numpy as np
import random
from sklearn.decomposition import PCA
import torch.nn as nn

# Directory (under the Google Drive mount point) that holds the input CSV files.
# Keep the trailing "/": file names are appended with plain string concatenation.
data_path = "/content/drive/MyDrive/졸업과제/"

<torch._C.Generator at 0x78c59af8b3b0>

In [None]:
# Build a 32-dimensional feature vector for every disease.
#
# Fix every RNG seed first so that the steps below (in particular the randomly
# initialised linear projection) give the same result on every run.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Load the disease attribute table.
disease = pd.read_csv(data_path + "disease_associations.csv")

# Drop the name column and handle missing values.
# A missing diseaseClass becomes [""] after the split, i.e. it gets its own
# "" label in the multi-hot encoding below.
del disease["diseaseName"]
disease["diseaseClass"] = disease["diseaseClass"].fillna("").str.split(";")
# Any other missing value is replaced by the literal string "None".
disease = disease.fillna("None")

# Multi-hot encode the ";"-separated diseaseClass labels.
mlb = MultiLabelBinarizer()
one_hot_encoded = mlb.fit_transform(disease["diseaseClass"])

# Reduce the multi-hot matrix to two principal components.
pca = PCA(n_components=2)
disease_class_feature = pd.DataFrame(pca.fit_transform(one_hot_encoded))

# Label-encode the remaining categorical features.
features = ["diseaseType", "diseaseSemanticType"]
for feature in features:
    le = LabelEncoder()
    disease[feature] = le.fit_transform(disease[feature])

# Keep the IDs aside (they are re-attached later) and drop the columns that
# must not enter the numeric feature matrix.
disease_id = disease["diseaseId"]
del disease["diseaseId"]
del disease["diseaseClass"]

# Append the two PCA components; rows are matched on the row index.
disease = pd.concat([disease, disease_class_feature], axis=1)

# Standardise every feature column.
scaler = StandardScaler()
disease_features = scaler.fit_transform(disease.values)

# Convert to a float32 tensor and project to 32 dimensions.
data = torch.tensor(disease_features, dtype=torch.float32)
linear_layer = nn.Linear(data.shape[1], 32)

# The projection is a fixed, untrained layer whose output is only exported,
# so run it without autograd tracking instead of building an unused graph.
with torch.no_grad():
    transformed_disease = linear_layer(data)
print(transformed_disease.shape)
transformed_disease

torch.Size([30170, 32])


tensor([[ 1.7799,  0.4415,  0.4455,  ..., -0.8268,  0.5288, -0.0216],
        [ 1.7306,  0.2946,  0.3218,  ..., -0.5721,  0.2983,  0.0983],
        [ 0.7537,  0.0370,  0.4281,  ..., -0.0061, -0.1724,  0.2426],
        ...,
        [ 0.4854,  1.2313,  0.2207,  ...,  0.5290,  0.3875, -0.3897],
        [ 0.4340,  0.4871, -0.4240,  ...,  0.9361, -0.4820,  0.0177],
        [-0.1359,  0.1914, -0.3831,  ...,  0.7107, -0.5230, -0.0368]],
       grad_fn=<AddmmBackward0>)

In [None]:
# Build a 32-dimensional feature vector for every gene.
#
# Re-seed every RNG so the randomly initialised linear projection below is
# reproducible regardless of which cells ran before this one.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Load the gene attribute table.
gene = pd.read_csv(data_path + "gene_associations.csv")

# Drop the columns that are not used as features.
gene = gene.drop(columns=["geneSymbol", "protein_class_name", "protein_class"])

# Store geneId as a string.
gene["geneId"] = gene["geneId"].apply(str)

# Discard rows with a missing "DSI", "DPI" or "PLI" value; the index is reset
# so the rows are numbered 0..n-1 again.
gene = gene.dropna(subset=["DSI", "DPI", "PLI"]).reset_index(drop=True)

# Keep the IDs aside (they are re-attached later) and remove them from the
# numeric feature matrix.
gene_id = gene["geneId"]
del gene["geneId"]

# Standardise every feature column.
scaler = StandardScaler()
gene_features = scaler.fit_transform(gene.values)

# Convert to a float32 tensor and project to 32 dimensions.
data = torch.tensor(gene_features, dtype=torch.float32)
linear_layer = nn.Linear(data.shape[1], 32)

# The projection is a fixed, untrained layer whose output is only exported,
# so run it without autograd tracking instead of building an unused graph.
with torch.no_grad():
    transformed_gene = linear_layer(data)
print(transformed_gene.shape)
transformed_gene

torch.Size([15576, 32])


tensor([[ 0.4896, -0.2769, -0.2458,  ..., -0.1716, -0.1194, -0.0749],
        [ 0.8209, -0.2826, -0.4765,  ...,  0.2271, -0.6244,  0.5758],
        [ 0.8902, -0.4218, -0.4196,  ...,  0.2182, -0.5978,  0.5917],
        ...,
        [ 0.1807,  0.2722,  0.2255,  ..., -0.4707,  0.4644, -1.0204],
        [ 0.7946, -0.4395, -0.3942,  ...,  0.1369, -0.5027,  0.4575],
        [ 0.2656,  0.1882,  0.1700,  ..., -0.5075,  0.4421, -1.0164]],
       grad_fn=<AddmmBackward0>)

In [None]:
# Gene-disease associations (gda): load the four CSV shards and stack them.
df_list = [pd.read_csv(f"{data_path}gda_{i}.csv") for i in range(4)]
gda_0, gda_1, gda_2, gda_3 = df_list  # keep the per-shard names available
gda = pd.concat(df_list, ignore_index=True)

# gene_id holds strings, so compare the association gene IDs as strings too.
gda["geneNcbiID"] = gda["geneNcbiID"].apply(str)

# Keep only associations whose gene and disease both have a feature row.
has_gene = gda["geneNcbiID"].isin(gene_id)
has_disease = gda["diseaseUMLSCUI"].isin(disease_id)
gda = gda[has_gene & has_disease]

# One row per (gene, disease) pair.
gda = gda.drop_duplicates(["geneNcbiID", "diseaseUMLSCUI"])

# Rename by column name rather than by position so the labels cannot be
# swapped if the column order in the CSV files ever differs.
gda = gda[["geneNcbiID", "diseaseUMLSCUI"]].rename(
    columns={"geneNcbiID": "geneId", "diseaseUMLSCUI": "diseaseId"}
)

gda.head()

Unnamed: 0,geneId,diseaseId
0,1,C0019209
1,1,C0036341
2,2,C0002395
3,2,C0011570
5,2,C0027726


In [None]:
# Re-attach the ID column to each projected feature matrix.
# pd.concat(axis=1) matches rows on the row index.
disease_embedding = pd.DataFrame(transformed_disease.detach().numpy())
gene_embedding = pd.DataFrame(transformed_gene.detach().numpy())

disease = pd.concat([disease_id, disease_embedding], axis=1)
gene = pd.concat([gene_id, gene_embedding], axis=1)

In [None]:
# Write each processed DataFrame to a CSV file in the current working
# directory, without the row index.
outputs = {
    "gene_processed.csv": gene,
    "disease_processed.csv": disease,
    "gda_processed.csv": gda,
}
for filename, frame in outputs.items():
    frame.to_csv(filename, index=False)