In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2-cp38-cp38-manylinux2014_x86_64.whl (98.6 MB)
[K     |████████████████████████████████| 98.6 MB 63 kB/s  eta 0:00:01
Collecting graphviz
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
[K     |████████████████████████████████| 47 kB 3.9 MB/s eta 0:00:01
Collecting plotly
  Downloading plotly-5.15.0-py2.py3-none-any.whl (15.5 MB)
[K     |████████████████████████████████| 15.5 MB 5.3 MB/s eta 0:00:01
Collecting tenacity>=6.2.0
  Downloading tenacity-8.2.2-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly, graphviz, catboost
Successfully installed catboost-1.2 graphviz-0.20.1 plotly-5.15.0 tenacity-8.2.2


In [5]:
! pip install Bio

Collecting Bio
  Downloading bio-1.5.9-py3-none-any.whl (276 kB)
[K     |████████████████████████████████| 276 kB 6.6 MB/s eta 0:00:01
Collecting biopython>=1.80
  Downloading biopython-1.81-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 5.9 MB/s eta 0:00:01
Collecting gprofiler-official
  Downloading gprofiler_official-1.0.0-py3-none-any.whl (9.3 kB)
Collecting mygene
  Downloading mygene-3.2.2-py2.py3-none-any.whl (5.4 kB)
Collecting biothings-client>=0.2.6
  Downloading biothings_client-0.3.0-py2.py3-none-any.whl (29 kB)
Installing collected packages: biothings-client, mygene, gprofiler-official, biopython, Bio
Successfully installed Bio-1.5.9 biopython-1.81 biothings-client-0.3.0 gprofiler-official-1.0.0 mygene-3.2.2


In [13]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.2.0-py3-none-any.whl (390 kB)
[K     |████████████████████████████████| 390 kB 6.0 MB/s eta 0:00:01
Collecting alembic>=1.5.0
  Downloading alembic-1.11.1-py3-none-any.whl (224 kB)
[K     |████████████████████████████████| 224 kB 6.6 MB/s eta 0:00:01
Collecting cmaes>=0.9.1
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting typing-extensions>=4
  Using cached typing_extensions-4.7.1-py3-none-any.whl (33 kB)
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting importlib-resources
  Downloading importlib_resources-5.12.0-py3-none-any.whl (36 kB)
Installing collected packages: typing-extensions, importlib-resources, colorlog, cmaes, alembic, optuna
  Attempting uninstall: typing-extensions
    Found existing installation: typing-extensions 3.7.4.3
    Uninstalling typing-extensions-3.7.4.3:
      Successfully uninstalled typing-extensions-3.7.4.3
  Attempting uninstall: alembic
    Found existing ins

In [6]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [7]:
import warnings
warnings.filterwarnings('ignore')

In [18]:
def add_length(data):
    """Add a ``length`` column with the character count of each ``epitope_seq`` value.

    The column is written onto ``data`` itself, and that same frame is returned.
    """
    seq_lengths = data['epitope_seq'].str.len()
    data['length'] = seq_lengths
    return data

In [9]:
def add_letters_count(data):
    """Add one column per uppercase letter A-Z counting its occurrences in ``epitope_seq``.

    The 26 columns are written onto ``data`` itself (in alphabetical order), and
    that same frame is returned.
    """
    sequences = data['epitope_seq']
    for letter in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ':
        data[letter] = sequences.str.count(letter)
    return data

In [10]:
def remove_zero(data):
    """Drop the B, J, O, U, X and Z columns whose values sum to zero.

    Each of the six columns is checked; those with a zero total are removed.
    Returns a new frame when at least one column is dropped, otherwise the
    input frame itself.
    """
    unused = [letter for letter in 'BJOUXZ' if data[letter].sum() == 0]
    if not unused:
        return data
    return data.drop(unused, axis=1)

In [15]:
# Read the training and test tables from the working directory.
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

# Stack train on top of test so feature engineering is applied once to both;
# later cells split the rows back apart by position using len(train).
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it,
# both inputs keep their own row labels starting at 0, so the labels repeat and
# any label-aligned operation on `total` would match the wrong rows.
total = pd.concat([train, test], ignore_index=True)

In [19]:
# Build the sequence-derived feature table from the epitope sequence and the label.
# .copy() makes `totals` an independent frame, so the column assignments below
# (here and inside add_length / add_letters_count) do not write to a derived
# subset of `total` and do not trigger SettingWithCopyWarning.
totals = total[['epitope_seq', 'label']].copy()

# Upper-case the sequences so the per-letter counts (A-Z) see every residue.
totals['epitope_seq'] = totals['epitope_seq'].str.upper()

totals = add_length(totals)         # adds the 'length' column
totals = add_letters_count(totals)  # adds one count column per letter A-Z
totals = remove_zero(totals)        # drops all-zero columns among B, J, O, U, X, Z

# The raw sequence string is no longer needed once the features are derived.
totals = totals.drop(['epitope_seq'], axis=1)

In [20]:
# Cast the categorical columns to str so each holds a single, uniform type.
for col in (
    'disease_state',
    'antigen_code',
    'disease_type',
    'assay_method_technique',
    'assay_group',
):
    total[col] = total[col].astype('str')

In [26]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode each categorical column of `total` and store the codes in `totals`.
# One encoder object is re-fitted per column, so after the loop `le` holds only
# the mapping of the last column ('assay_group').
le = LabelEncoder()
for col in (
    'disease_state',
    'antigen_code',
    'disease_type',
    'assay_method_technique',
    'assay_group',
):
    totals[col] = le.fit_transform(total[col])

In [27]:
# Undo the train/test stacking by position: the first len(train) rows of
# `totals` are the training rows, the remainder are the test rows.
n_train = len(train)
train_rows = totals.iloc[:n_train]

# The label is the prediction target; it is removed from both feature tables.
target = train_rows['label']
train_feature = train_rows.drop(['label'], axis=1)
test_feature = totals.iloc[n_train:].drop(['label'], axis=1)

In [28]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the training rows, stratified on the label so both parts keep
# the same class ratio. random_state is fixed so the split, and therefore the
# fitted model and the submission, is the same on every Restart & Run All.
X_train, X_val, Y_train, Y_val = train_test_split(
    train_feature,
    target,
    test_size=0.2,
    stratify=target,
    random_state=42,
)

In [29]:
# Fit a CatBoost classifier with default settings (training log silenced) on the
# training portion of the split.
model = CatBoostClassifier(verbose=False)
model.fit(X_train, Y_train)

# Class probabilities for every row of the test feature table.
result = model.predict_proba(test_feature)

In [30]:
# Display the probability array returned by model.predict_proba(test_feature).
result

array([[7.75621984e-01, 2.24378016e-01],
       [9.98048659e-01, 1.95134090e-03],
       [9.50948848e-01, 4.90511521e-02],
       ...,
       [6.71882658e-01, 3.28117342e-01],
       [9.99174334e-01, 8.25665848e-04],
       [9.82215433e-01, 1.77845672e-02]])

In [31]:
# Load the submission template; its 'label' column is overwritten with the
# model's predictions in the next cell.
sub = pd.read_csv('./sample_submission.csv')

In [32]:
# Label a row 1 when the probability in column 1 of `result` exceeds 0.5, else 0.
positive_proba = result[:, 1]
sub['label'] = (positive_proba > 0.5).astype(int)
sub

Unnamed: 0,id,label
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
...,...,...
120939,120940,0
120940,120941,0
120941,120942,0
120942,120943,0


In [34]:
# Write the submission file without the row index. `index` is documented as a
# bool, so pass False explicitly rather than relying on None being falsy.
sub.to_csv('sub_protien.csv', index=False)