##

In [1]:
# !pip install cptac
import numpy as np
import pandas as pd
import cptac as cptac
import biograder
# Fetch the "bio462_hw4" assignment through biograder
# (presumably a network download — confirm whether it is cached locally).
biograder.download("bio462_hw4")
# Assignment handle; a later cell calls hw.getData(name='gene_locations') on it.
hw = biograder.bio462_hw4(student_id = 'CG_Final_Project')

                                               

### Part 1: Data Loading and Processing

##

In [2]:
# download BRCA data
# NOTE(review): presumably this hits the network on a fresh environment — confirm.
cptac.download('brca')
# Dataset object for BRCA; only its CNV table is used in this notebook section.
brca = cptac.Brca()
# CNV table; it is transposed and joined with the gene locations in the next cell.
brca_CNV = brca.get_CNV()

                                         

In [35]:
# Transpose the CNV table so it can be joined with the gene location table.
brca_CNV_t = brca_CNV.T

# Gene location table from the homework data, indexed by Database_ID.
locations = (
    hw.getData(name='gene_locations')
    .copy()
    .set_index("Database_ID")
)

# Join locations with the CNV values on Database_ID, then keep only the
# first occurrence of every repeated index entry.
merged_df = locations.merge(brca_CNV_t, on='Database_ID')
is_first_occurrence = ~merged_df.index.duplicated(keep='first')
merged_nd = merged_df[is_first_occurrence]

# Select chromosome 17 and order the rows by starting position.
on_chr17 = merged_nd['chromosome'] == '17'
chr17_sorted = merged_nd.loc[on_chr17].sort_values(by=['start_bp'])

# Remove the three leading columns by position, then transpose back.
# NOTE(review): presumably these are the location columns contributed by
# `locations` — confirm, because the drop is positional rather than by name.
brca_cnv_chr = chr17_sorted.drop(columns=chr17_sorted.columns[[0, 1, 2]])
brca_cnv_chr_t = brca_cnv_chr.transpose()

In [36]:
# Dimensions of the transposed chromosome-17 CNV table, as (rows, columns)
brca_cnv_chr_t.shape

(122, 1333)

In [10]:
# # grab mini version of data frame
# brca_mini_df = brca_cnv_chr.head(100)
# # get first patient CNV data across all genes
# brca_p1 = brca_mini_df.iloc[:,1]
# brca_p1

In [37]:
# Pull a single row of the transposed table: one CNV value per gene column.
# NOTE(review): position 1 is the SECOND row of the table, not the first —
# confirm this is the intended patient (position 0 would be the first row).
brca_p1 = brca_cnv_chr_t.iloc[1]
brca_p1

Database_ID
ENSG00000262359.1    -0.222
ENSG00000181031.11   -0.222
ENSG00000187624.7    -0.222
ENSG00000183688.4    -0.222
ENSG00000141252.15   -0.222
                      ...  
ENSG00000167363.9     0.222
ENSG00000141556.16    0.222
ENSG00000141579.6     0.222
ENSG00000175711.4     0.222
ENSG00000176845.8     0.222
Name: CPT001846, Length: 1333, dtype: float64

In [40]:
# get the number of rows and columns of the transposed table in one step
num_rows, num_cols = brca_cnv_chr_t.shape

In [42]:
# Copy the table into a dense float64 array, one array row per DataFrame row.
# A single vectorized conversion replaces the row-by-row Python loop and takes
# its dimensions from the DataFrame itself, so it cannot drift out of sync with
# num_rows / num_cols computed in a different cell. copy=True keeps the array
# independent of the DataFrame, as the preallocated version was.
brca_by_patients = brca_cnv_chr_t.to_numpy(dtype=float, copy=True)

In [43]:
# Sanity check: show the first row (position 0) of the NumPy array
brca_by_patients[0]

array([-0.445, -0.445, -0.445, ...,  0.004,  0.004,  0.004])

### Part 2: Hidden Markov Models (HMMs)

Initializations
* initial state probabilities: [0.25, 0.25, 0.25, 0.25]
* transition probabilities: [4x4]
* emission probabilities: [4 x distribution of numbers]

In [46]:
# initialize initial state probabilities
# Uniform start: each of the num_subtypes states gets probability 1/num_subtypes.
num_subtypes = 4
# Build the vector in one vectorized call instead of dividing element by element.
initial_probs = np.full(num_subtypes, 1.0 / num_subtypes)
# A probability vector must sum to one.
assert np.isclose(initial_probs.sum(), 1.0)
print(initial_probs)

[0.25 0.25 0.25 0.25]


In [49]:
# initialize transition probabilities matrix
# Uniform start: every entry is 1/num_subtypes, so each row sums to one.
# Built in one vectorized call instead of a nested Python loop.
trans_mtx = np.full((num_subtypes, num_subtypes), 1.0 / num_subtypes)
# Each row of the matrix must sum to one.
assert np.allclose(trans_mtx.sum(axis=1), 1.0)
print(trans_mtx)

[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]


In [None]:
# initialize emission probabilities matrix
# TODO(Thomas): not implemented yet — ALL YOURS, LORD THOMAS!!!!!! :)