In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder

# Load the dataset.
# The CSV location can be overridden with the SMASEMSC_CSV environment variable so the
# notebook is not tied to a single machine; when the variable is unset, the original
# local path is used, so existing runs behave exactly as before.
file_path = os.environ.get(
    'SMASEMSC_CSV',
    '/Users/jangjaehoon/Desktop/hufs/2024-1/SNA/ND/preprocessing/newdata/smasemsc.csv',
)
data = pd.read_csv(file_path)


In [2]:
# Columns used downstream: ligand SMILES string, protein sequence, assay outcome and assay score.
selected_columns = ['PUBCHEM_EXT_DATASOURCE_SMILES', 'Sequence', 'PUBCHEM_ACTIVITY_OUTCOME', 'PUBCHEM_ACTIVITY_SCORE']

# Integer encoding of the target variable (assay outcome).
label_mapping = {'Inactive': 0, 'Active': 1, 'Inconclusive': 2}

# Take an explicit copy: the derived columns below are then written to an independent
# frame instead of a selection of `data`, which avoids SettingWithCopyWarning.
# (The former "rename columns" step assigned the same four names again and was a no-op.)
data_selected = data[selected_columns].copy()

# Encode the target; outcomes missing from label_mapping would become NaN.
data_selected['ActivityLabel'] = data_selected['PUBCHEM_ACTIVITY_OUTCOME'].map(label_mapping)

# Combine SMILES and protein sequence into one space-separated text feature.
data_selected['features'] = data_selected['PUBCHEM_EXT_DATASOURCE_SMILES'] + ' ' + data_selected['Sequence']




In [3]:
# Show the prepared frame (selected columns plus the derived 'ActivityLabel' and 'features').
data_selected

Unnamed: 0,PUBCHEM_EXT_DATASOURCE_SMILES,Sequence,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,ActivityLabel,features
0,C1=CC=C(C=C1)CCC(=O)NC(=S)NC2=CC=CC=C2[N+](=O)...,MSQLAHNLTLSIFDPVANYRAARIICTIGPSTQSVEALKGLIQSGM...,Inactive,0.0,0,C1=CC=C(C=C1)CCC(=O)NC(=S)NC2=CC=CC=C2[N+](=O)...
1,CC1=C(C=C(C=C1)NS(=O)(=O)C2=CC3=C(C=C2)NC(=O)C...,MSQLAHNLTLSIFDPVANYRAARIICTIGPSTQSVEALKGLIQSGM...,Inactive,0.0,0,CC1=C(C=C(C=C1)NS(=O)(=O)C2=CC3=C(C=C2)NC(=O)C...
2,CC1=C(C=CC(=C1)F)S(=O)(=O)NCC(C2=CN=CC=C2)N3CC...,MSQLAHNLTLSIFDPVANYRAARIICTIGPSTQSVEALKGLIQSGM...,Inactive,0.0,0,CC1=C(C=CC(=C1)F)S(=O)(=O)NCC(C2=CN=CC=C2)N3CC...
3,C1CCC(=NNC(=O)CN2C3C(NC(=O)N3)NC2=O)C1,MSQLAHNLTLSIFDPVANYRAARIICTIGPSTQSVEALKGLIQSGM...,Inactive,0.0,0,C1CCC(=NNC(=O)CN2C3C(NC(=O)N3)NC2=O)C1 MSQLAHN...
4,C1=CC(=CC(=C1)C(F)(F)F)CNC(=O)CSCC(=O)O,MSQLAHNLTLSIFDPVANYRAARIICTIGPSTQSVEALKGLIQSGM...,Inactive,0.0,0,C1=CC(=CC(=C1)C(F)(F)F)CNC(=O)CSCC(=O)O MSQLAH...
...,...,...,...,...,...,...
624299,CC(=O)N=C(N)SC1=NC2=CC=CC=C2N1,MAVESRSRVTSKLVKAHRAMLNSVTQEDLKVDRLPGADYPNPSKKY...,Inactive,0.0,0,CC(=O)N=C(N)SC1=NC2=CC=CC=C2N1 MAVESRSRVTSKLVK...
624300,COC1=C(C2=C(C=C1)C=NN(C2=O)C3=CC=C(C=C3)OC(F)(...,MAVESRSRVTSKLVKAHRAMLNSVTQEDLKVDRLPGADYPNPSKKY...,Inactive,0.0,0,COC1=C(C2=C(C=C1)C=NN(C2=O)C3=CC=C(C=C3)OC(F)(...
624301,CC1=CC=C(C=C1)C(=O)N/C(=C\C2=CC(=CC=C2)[N+](=O...,MAVESRSRVTSKLVKAHRAMLNSVTQEDLKVDRLPGADYPNPSKKY...,Inactive,0.0,0,CC1=CC=C(C=C1)C(=O)N/C(=C\C2=CC(=CC=C2)[N+](=O...
624302,COC1=CC=C(C=C1)/C=C/C(=O)NCCCN2C=CN=C2,MAVESRSRVTSKLVKAHRAMLNSVTQEDLKVDRLPGADYPNPSKKY...,Inactive,0.0,0,COC1=CC=C(C=C1)/C=C/C(=O)NCCCN2C=CN=C2 MAVESRS...


In [4]:
# Class balance check: number of rows per encoded outcome (0=Inactive, 1=Active, 2=Inconclusive).
data_selected['ActivityLabel'].value_counts()

ActivityLabel
0    616117
2      6538
1      1649
Name: count, dtype: int64

In [5]:
from sklearn.utils import resample

# The two protein sequences present in the dataset, defined once instead of being
# pasted into every filter expression.
PROTEIN1_SEQUENCE = 'MSQLAHNLTLSIFDPVANYRAARIICTIGPSTQSVEALKGLIQSGMSVARMNFSHGSHEYHQTTINNVRQAAAELGVNIAIALDTKGPEIRTGQFVGGDAVMERGATCYVTTDPAFADKGTKDKFYIDYQNLSKVVRPGNYIYIDDGILILQVQSHEDEQTLECTVTNSHTISDRRGVNLPGCDVDLPAVSAKDRVDLQFGVEQGVDMIFASFIRSAEQVGDVRKALGPKGRDIMIICKIENHQGVQNIDSIIEESDGIMVARGDLGVEIPAEKVVVAQKILISKCNVAGKPVICATQMLESMTYNPRPTRAEVSDVANAVFNGADCVMLSGETAKGKYPNEVVQYMARICLEAQSALNEYVFFNSIKKLQHIPMSADEAVCSSAVNSVYETKAKAMVVLSNTGRSARLVAKYRPNCPIVCVTTRLQTCRQLNITQGVESVFFDADKLGHDEGKEHRVAAGVEFAKSKGYVQTGDYCVVIHADHKVKGYANQTRILLVE'
PROTEIN2_SEQUENCE = 'MAVESRSRVTSKLVKAHRAMLNSVTQEDLKVDRLPGADYPNPSKKYSSRTEFRDKTDYIMYNPRPRDEPSSENPVSVSPLLCELAAARSRIHFNPTETTIGIVTCGGICPGLNDVIRSITLTGINVYNVKRVIGFRFGYWGLSKKGSQTAIELHRGRVTNIHHYGGTILGSSRGPQDPKEMVDTLERLGVNILFTVGGDGTQRGALVISQEAKRRGVDISVFGVPKTIDNDLSFSHRTFGFQTAVEKAVQAIRAAYAEAVSANYGVGVVKLMGRDSGFIAAQAAVASAQANICLVPENPISEQEVMSLLERRFCHSRSCVIIVAEGFGQDWGRGSGGYDASGNKKLIDIGVILTEKVKAFLKANKSRYPDSTVKYIDPSYMIRACPPSANDALFCATLATLAVHEAMAGATGCIIAMRHNNYILVPIKVATSVRRVLDLRGQLWRQVREITVDLGSDVRLARKLEIRRELEAINRNRDRLHEELAKL'


def _rows_with_sequence(frame, sequence):
    """Return the rows of `frame` whose 'Sequence' value contains `sequence`.

    The match is a literal substring test (`regex=False`), so the sequence is never
    interpreted as a regular expression. Rows with a missing 'Sequence' are treated
    as non-matching (`na=False`) instead of producing a NaN mask that would make
    boolean indexing raise.
    """
    return frame[frame['Sequence'].str.contains(sequence, regex=False, na=False)]


# Split the prepared data by encoded outcome label.
inactive = data_selected[data_selected['ActivityLabel'] == 0]
active = data_selected[data_selected['ActivityLabel'] == 1]
inconclusive = data_selected[data_selected['ActivityLabel'] == 2]

# Within each class, split by protein (suffixes: in=inactive, a=active, c=inconclusive).
protein2in = _rows_with_sequence(inactive, PROTEIN2_SEQUENCE)
protein1in = _rows_with_sequence(inactive, PROTEIN1_SEQUENCE)

protein2a = _rows_with_sequence(active, PROTEIN2_SEQUENCE)
protein1a = _rows_with_sequence(active, PROTEIN1_SEQUENCE)

protein2c = _rows_with_sequence(inconclusive, PROTEIN2_SEQUENCE)
protein1c = _rows_with_sequence(inconclusive, PROTEIN1_SEQUENCE)



In [6]:
# Undersampling: draw the same number of inactive rows from each protein, without
# replacement. The per-protein quota is half of the combined active + inconclusive count.
n_inactive_per_protein = (len(active) + len(inconclusive)) // 2
underprotein1 = resample(
    protein1in, replace=False, n_samples=n_inactive_per_protein, random_state=42
)
underprotein2 = resample(
    protein2in, replace=False, n_samples=n_inactive_per_protein, random_state=42
)

# Disabled alternative: larger undersampling quota.
# underprotein1 = resample(protein1in, replace=False, n_samples= (len(active) + len(inconclusive))//2 + 221 , random_state=42)
# underprotein2 = resample(protein2in, replace=False, n_samples= (len(active) + len(inconclusive))//2 + 221 , random_state=42)

# Disabled alternative: oversampling of the minority classes.
# overprotein1a = resample(protein1a, replace=True, n_samples=3004 + 221, random_state=42)
# overprotein2a = resample(protein2a, replace=True, n_samples=3533 + 221, random_state=42)
# overprotein1c = resample(protein1c, replace=True, n_samples=2090, random_state=42)
# overprotein2c = resample(protein2c, replace=True, n_samples=len(underprotein2), random_state=42)

# Keep every row of the other two classes, selected by the raw outcome string.
outcome = data_selected['PUBCHEM_ACTIVITY_OUTCOME']
active_samples = data_selected[outcome == 'Active']
inconclusive_samples = data_selected[outcome == 'Inconclusive']

In [7]:
# Assemble the rebalanced dataset: the undersampled inactive rows of each protein,
# followed by all active and all inconclusive rows. Original row indices are kept.
balanced_parts = [
    underprotein1,
    underprotein2,
    active_samples,
    inconclusive_samples,
]
balanced_data = pd.concat(balanced_parts)

In [8]:
# Show the rebalanced frame built from the undersampled inactive rows and the full minority classes.
balanced_data

Unnamed: 0,PUBCHEM_EXT_DATASOURCE_SMILES,Sequence,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,ActivityLabel,features
152188,CN1C(=CC(=NS1(=O)=O)C2=CC=CS2)C(=O)NC3=CC=C(C=...,MSQLAHNLTLSIFDPVANYRAARIICTIGPSTQSVEALKGLIQSGM...,Inactive,0.0,0,CN1C(=CC(=NS1(=O)=O)C2=CC=CS2)C(=O)NC3=CC=C(C=...
70469,CC(=O)NC(CCS(=O)(=O)C)C(=O)NC1=CC=CC=C1C(=O)OC,MSQLAHNLTLSIFDPVANYRAARIICTIGPSTQSVEALKGLIQSGM...,Inactive,0.0,0,CC(=O)NC(CCS(=O)(=O)C)C(=O)NC1=CC=CC=C1C(=O)OC...
137845,CN1C(=C(C(=N1)C(F)(F)F)C2=CC=CC=C2)NC(=O)CSCC(...,MSQLAHNLTLSIFDPVANYRAARIICTIGPSTQSVEALKGLIQSGM...,Inactive,0.0,0,CN1C(=C(C(=N1)C(F)(F)F)C2=CC=CC=C2)NC(=O)CSCC(...
56640,CC1=CC=CC=C1N(C(=O)C)/N=C\2/C(=O)C3=CC=CC=C3S2,MSQLAHNLTLSIFDPVANYRAARIICTIGPSTQSVEALKGLIQSGM...,Inactive,0.0,0,CC1=CC=CC=C1N(C(=O)C)/N=C\2/C(=O)C3=CC=CC=C3S2...
54759,C1=CC=C2C(=C1)C(=CC(=N2)C3=CC=CS3)C(=O)N/N=C/C...,MSQLAHNLTLSIFDPVANYRAARIICTIGPSTQSVEALKGLIQSGM...,Inactive,0.0,0,C1=CC=C2C(=C1)C(=CC(=N2)C3=CC=CS3)C(=O)N/N=C/C...
...,...,...,...,...,...,...
623961,CC1CCN(CC1)C(=O)COC2=CC3=C(C=C2)C4=C(C=C(C=C4)...,MAVESRSRVTSKLVKAHRAMLNSVTQEDLKVDRLPGADYPNPSKKY...,Inconclusive,20.0,2,CC1CCN(CC1)C(=O)COC2=CC3=C(C=C2)C4=C(C=C(C=C4)...
624036,COC1=C(C=C(C=C1)C2=NN(C=C2C(=O)NC3=CC=CC(=C3)C...,MAVESRSRVTSKLVKAHRAMLNSVTQEDLKVDRLPGADYPNPSKKY...,Inconclusive,20.0,2,COC1=C(C=C(C=C1)C2=NN(C=C2C(=O)NC3=CC=CC(=C3)C...
624131,C1CCCN(CCC1)CC(=O)N/N=C/C2=CC=C(O2)[N+](=O)[O-],MAVESRSRVTSKLVKAHRAMLNSVTQEDLKVDRLPGADYPNPSKKY...,Inconclusive,20.0,2,C1CCCN(CCC1)CC(=O)N/N=C/C2=CC=C(O2)[N+](=O)[O-...
624172,C1=CC(=CC=C1OCC(=O)NC2=CC3=C(C=C2)NN=C3)Cl,MAVESRSRVTSKLVKAHRAMLNSVTQEDLKVDRLPGADYPNPSKKY...,Inconclusive,20.0,2,C1=CC(=CC=C1OCC(=O)NC2=CC3=C(C=C2)NN=C3)Cl MAV...


In [9]:
# Work on an independent copy: a plain assignment would only alias `balanced_data`,
# so columns added to `df` later would silently appear on `balanced_data` as well.
df = balanced_data.copy()

In [10]:
from sklearn.preprocessing import MinMaxScaler

# Derive a per-row weight from the assay score.
scores = df['PUBCHEM_ACTIVITY_SCORE'].to_numpy()

# Min-max scale the scores into [0, 1]; the scaler expects a 2-D column,
# so the score vector is lifted to shape (n, 1) and flattened back afterwards.
scaler = MinMaxScaler()
weights = scaler.fit_transform(scores[:, None]).ravel()

# Store the scaled weights alongside the rows they belong to.
df['Weight'] = weights

In [12]:
import numpy as np
# Replace zero weights with a small positive floor so that rows with the minimum
# score keep a non-zero weight.
# Both the zero test and the retained values are read from df['Weight'], so the result
# depends only on the frame and not on whatever the `weights` variable currently holds.
ZERO_WEIGHT_FLOOR = 1e-2
current_weights = df['Weight'].values
weights = np.where(current_weights == 0, ZERO_WEIGHT_FLOOR, current_weights)
df['Weight'] = weights

In [13]:

# Split the data 70% / 20% / 10% into train / validation / test.
X = df[['features', 'Weight']]
y = df['ActivityLabel']

# Step 1: hold out 30% of the rows (stratified by label) as a temporary pool.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Step 2: split the pool 2:1 into validation and test, again stratified.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=1 / 3, stratify=y_temp, random_state=42
)

# Bag-of-words counts over the combined "SMILES + sequence" text.
# NOTE(review): with default settings CountVectorizer lowercases the text and keeps
# only word-character tokens of length >= 2 — confirm this is the intended
# tokenization for SMILES strings and protein sequences.
vectorizer = CountVectorizer()

# Learn the vocabulary on the training split only, then reuse it for the other splits.
X_train_vectorized = vectorizer.fit_transform(X_train['features'])
X_val_vectorized, X_test_vectorized = (
    vectorizer.transform(part['features']) for part in (X_val, X_test)
)


In [14]:
from sklearn.metrics import accuracy_score, log_loss, f1_score, roc_auc_score, recall_score, precision_score, classification_report


In [15]:
# Create and fit the Naive Bayes model on the vectorized training split.
# NOTE(review): the 'Weight' column carried in X_train is not passed as sample_weight,
# so training is unweighted — confirm that this is intended.
model = MultinomialNB().fit(X_train_vectorized, y_train)

# Predict on the validation split: class probabilities and hard labels.
y_val_prob = model.predict_proba(X_val_vectorized)
y_val_pred = model.predict(X_val_vectorized)

# Validation metrics.
val_loss = log_loss(y_val, y_val_prob)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)

print(
    f'Validation Accuracy: {val_accuracy}',
    f'Validation Loss: {val_loss}',
    'Validation Classification Report:',
    val_report,
    sep='\n',
)


Validation Accuracy: 0.5778863775198534
Validation Loss: 0.894511788608153
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.71      0.66      1637
           1       0.36      0.09      0.14       330
           2       0.53      0.54      0.54      1307

    accuracy                           0.58      3274
   macro avg       0.50      0.45      0.45      3274
weighted avg       0.56      0.58      0.56      3274



In [16]:

# Predict on the held-out test split and evaluate.
y_test_pred = model.predict(X_test_vectorized)

# Compute the class-probability matrix once and reuse it; it was previously computed
# twice on the same input (once for the slice below and once for the AUC).
y_test_proba = model.predict_proba(X_test_vectorized)
# Second probability column, kept under its original name (columns follow model.classes_).
y_test_prob = y_test_proba[:, 1]

test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred, average='weighted')
# Multi-class AUC needs the full probability matrix (one-vs-rest).
test_auc = roc_auc_score(y_test, y_test_proba, multi_class='ovr')
test_recall = recall_score(y_test, y_test_pred, average='weighted')
test_precision = precision_score(y_test, y_test_pred, average='weighted')
test_report = classification_report(y_test, y_test_pred)

print(f'Test Accuracy: {test_accuracy}')
print(f'Test F1 Score: {test_f1}')
print(f'Test AUC: {test_auc}')
print(f'Test Recall: {test_recall}')
print(f'Test Precision: {test_precision}')
print('Test Classification Report:')
print(test_report)

Test Accuracy: 0.5805860805860806
Test F1 Score: 0.5572703366391291
Test AUC: 0.6968575605387467
Test Recall: 0.5805860805860806
Test Precision: 0.5537705684822962
Test Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.72      0.67       819
           1       0.29      0.05      0.09       165
           2       0.54      0.54      0.54       654

    accuracy                           0.58      1638
   macro avg       0.48      0.44      0.43      1638
weighted avg       0.55      0.58      0.56      1638

