<a href="https://colab.research.google.com/github/siji3328/OligoDesign/blob/main/seq_scoring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [54]:
!pip install viennarna
!pip install pandas openpyxl
import RNA
from google.colab import files
import pandas as pd
from Bio.SeqUtils.CodonUsage import CodonAdaptationIndex

# Upload the Excel workbook through the Colab file picker.
uploaded = files.upload()
if not uploaded:
    # NOTE(review): presumably the mapping is empty when the upload dialog is
    # cancelled; fail with a readable message instead of a bare IndexError.
    raise ValueError("No file was uploaded; please select an Excel file.")
# Only the first uploaded file is processed; any additional files are ignored.
file_name = next(iter(uploaded))
df = pd.read_excel(file_name)

# E. coli codon usage table: maps each DNA codon to a weight in (0, 1].
# It is handed to CodonAdaptationIndex.set_cai_index() in calculate_cai().
# The table lists 61 codons; the stop codons TAA, TAG and TGA are not present.
# NOTE(review): some synonymous families (e.g. the Leu codons TTA/TTG/CTT/CTC/
# CTA/CTG) contain no entry equal to 1.00 — confirm the values against the
# reference table they were taken from.
ecoli_codon_usage = {
    "TTT": 0.58, "TTC": 1.00, "TTA": 0.21, "TTG": 0.20,
    "CTT": 0.29, "CTC": 0.30, "CTA": 0.08, "CTG": 0.49,
    "ATT": 0.93, "ATC": 1.00, "ATA": 0.15, "ATG": 1.00,
    "GTT": 0.35, "GTC": 0.44, "GTA": 0.14, "GTG": 1.00,
    "TCT": 0.43, "TCC": 0.47, "TCA": 0.13, "TCG": 0.18,
    "CCT": 0.61, "CCC": 0.68, "CCA": 0.14, "CCG": 0.39,
    "ACT": 0.54, "ACC": 1.00, "ACA": 0.30, "ACG": 0.40,
    "GCT": 0.53, "GCC": 1.00, "GCA": 0.28, "GCG": 0.35,
    "TAT": 0.56, "TAC": 1.00, "CAT": 0.58, "CAC": 1.00,
    "CAA": 0.29, "CAG": 1.00, "AAT": 0.42, "AAC": 1.00,
    "AAA": 0.73, "AAG": 1.00, "GAT": 0.70, "GAC": 1.00,
    "GAA": 0.76, "GAG": 1.00, "TGT": 0.51, "TGC": 1.00,
    "TGG": 1.00, "CGT": 0.36, "CGC": 0.67, "CGA": 0.11,
    "CGG": 0.32, "AGT": 0.46, "AGC": 1.00, "AGA": 0.15,
    "AGG": 0.13, "GGT": 0.44, "GGC": 1.00, "GGA": 0.30,
    "GGG": 0.40
}

# Metric calculation functions
def calculate_gc_content(seq):
    """Return the GC content of ``seq`` as a percentage rounded to 2 decimals.

    Args:
        seq: Nucleotide sequence as a string. Bases are counted
            case-insensitively, so ``"atgc"`` and ``"ATGC"`` give the same
            result.

    Returns:
        Percentage of G and C characters in ``seq`` (0.0-100.0). An empty
        sequence (or ``None``) yields ``0.0`` rather than raising
        ``ZeroDivisionError``.
    """
    if not seq:
        return 0.0
    # Normalise case so lowercase input is not silently counted as non-GC.
    bases = seq.upper()
    gc_count = bases.count('G') + bases.count('C')
    return round(gc_count / len(seq) * 100, 2)

def calculate_cai(seq):
    """Return the Codon Adaptation Index of ``seq`` rounded to 3 decimals.

    The index is computed by Biopython's ``CodonAdaptationIndex`` using the
    module-level ``ecoli_codon_usage`` table. Trailing bases that do not
    complete a codon are dropped before the calculation.

    NOTE(review): ``Bio.SeqUtils.CodonUsage`` is reported as deprecated in
    recent Biopython releases — confirm the installed version still ships it.
    """
    # Largest prefix length that is a whole number of codons.
    usable_length = len(seq) - len(seq) % 3
    calculator = CodonAdaptationIndex()
    calculator.set_cai_index(ecoli_codon_usage)
    return round(calculator.cai_for_gene(seq[:usable_length]), 3)

def calculate_accessibility(seq):
    """Return an MFE-based accessibility value for the start of ``seq``.

    ``T`` is replaced by ``U`` and only the first 50 bases are folded. The
    result is ``1 - |MFE| / 100`` rounded to 3 decimals, where MFE is the
    second element returned by ViennaRNA's ``fold_compound(...).mfe()``.
    Windows shorter than 3 bases are not folded and get the default ``0.5``.
    """
    rna_window = seq.replace('T', 'U')[:50]
    if len(rna_window) < 3:
        return 0.5  # default for sequences too short to fold

    _structure, energy = RNA.fold_compound(rna_window).mfe()
    # float() coercion is kept from the original implementation.
    return round(1 - abs(float(energy)) / 100, 3)

# Scoring function
def score(value, thresholds, scores):
    """Map ``value`` to the points of the first threshold it reaches.

    ``thresholds`` and ``scores`` are walked in parallel; the points paired
    with the first threshold satisfying ``value >= threshold`` are returned.
    If no threshold is reached, the last entry of ``scores`` is returned.
    """
    for cutoff, points in zip(thresholds, scores):
        if not value >= cutoff:
            continue
        return points
    # Below every threshold: fall back to the final score.
    return scores[-1]

# Data processing
# Descending cut-offs: the i-th threshold is paired with the i-th entry of
# `scores` when passed to score().
thresholds_cai = [pct / 100 for pct in range(95, 45, -5)]  # 0.95 down to 0.5
thresholds_gc = list(range(50, 0, -5))  # 50 down to 5
thresholds_accessibility = [tenth / 10 for tenth in range(9, -1, -1)]  # 0.9 down to 0.0
scores = list(range(10, 0, -1))  # 10 down to 1



# One list of raw values and one list of scores per metric, in row order.
gc_values, gc_scores = [], []
cai_values, cai_scores = [], []
accessibility_values, accessibility_scores = [], []

for seq in df['seq']:
    # Compute all three raw metrics for this sequence first.
    gc_content = calculate_gc_content(seq)
    cai = calculate_cai(seq)
    accessibility = calculate_accessibility(seq)

    # Then record each raw value next to its threshold-based score.
    gc_values.append(gc_content)
    gc_scores.append(score(gc_content, thresholds_gc, scores))

    cai_values.append(cai)
    cai_scores.append(score(cai, thresholds_cai, scores))

    accessibility_values.append(accessibility)
    accessibility_scores.append(
        score(accessibility, thresholds_accessibility, scores)
    )

# Attach the raw values and their scores to the DataFrame. The mapping order
# determines the order in which the columns are added.
new_columns = {
    'GC_Value': gc_values,
    'CAI_Value': cai_values,
    'Accessibility_Value': accessibility_values,
    'GC_Score': gc_scores,
    'CAI_Score': cai_scores,
    'Accessibility_Score': accessibility_scores,
}
for column_name, column_data in new_columns.items():
    df[column_name] = column_data

# Final score = weighted sum of the three per-metric scores.
CAI_WEIGHT, ACCESSIBILITY_WEIGHT, GC_WEIGHT = 2, 2, 1
weighted_total = (
    df['CAI_Score'] * CAI_WEIGHT
    + df['Accessibility_Score'] * ACCESSIBILITY_WEIGHT
    + df['GC_Score'] * GC_WEIGHT
)

# Store the total rounded to two decimal places.
df['Final_Score'] = weighted_total.apply(lambda total: round(total, 2))

# Save the results: write the augmented DataFrame to an Excel file (without
# the index column) and hand that file to google.colab's files.download().
output_file = "final_scores_with_values.xlsx"
df.to_excel(output_file, index=False)
files.download(output_file)




Saving phenogeno_CNU.xlsx to phenogeno_CNU (41).xlsx


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>