# An exploration of internal ribosome entry sites (IRES) in single-stranded RNA (ssRNA) viruses

### Sarah Johnson and Nicholas Forino - BIOL 419

Our investigations will focus on understanding how IRESs are distributed across virus families and on identifying which virus genomes are the most IRES-rich and which contain the most "potent" IRESs.

In [None]:
# preliminaries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

% matplotlib inline

In [None]:
# loading the data

# This is the table that contains IRES activity for the ssRNA coding sequences
# (CDS) by oligo index.
# header=0 keeps the first spreadsheet row as the column names, skiprows drops
# rows 1-377 that follow it, and skipfooter drops the last 277 rows of the
# sheet (presumably everything outside the ssRNA CDS section - confirm against
# the spreadsheet).
# NOTE: `skipfooter` replaces the older `skip_footer` keyword, which pandas
# deprecated and later removed from read_excel.
data = pd.read_excel(
    'aad4939_Table_S8.xlsx',
    header=0,
    skiprows=np.arange(1, 378),
    skipfooter=277,
)

print('Our data has shape:', data.shape)
data

In [None]:
############ start of sarah's work ##############

# single-virus sanity check: eGFP expression against oligo start position
# keep only the rows whose accession string contains NC_018668
# (note the trailing space in the 'Accession ' column name)
accession_matches = data['Accession '].str.contains('NC_018668')
swine = data[accession_matches]

plt.plot(
    swine['Oligo_start_position'],
    swine['eGFP_expression (a.u)'],
)


In [None]:
# single-virus sanity check, relative scale: divide the value in column 6 of
# every row by the value in column 6 of the last row, then plot eGFP
# expression against that ratio
swine_array = swine.values

# column 6 is read by position - presumably 'Oligo_start_position'; confirm
# against the spreadsheet layout
swine_position = np.array(
    [row[6] / swine_array[-1, 6] for row in swine_array],
    dtype=float,
)

plt.plot(swine_position, swine['eGFP_expression (a.u)'])


In [None]:
# plot expression levels across the total length of the genome

# Unique values of the second column (position 1); they are used below as the
# accession identifiers to filter on (presumably this is the 'Accession '
# column - confirm against the spreadsheet layout).
# `.iloc` replaces the `.ix` indexer, which pandas deprecated and later
# removed. With string column labels, as in this table, `.ix[:, 1]` resolved
# the integer positionally, so `.iloc[:, 1]` selects the same column.
viruses = np.unique(data.iloc[:, 1])

for i in viruses:
    # rows whose accession string contains this identifier
    # (str.contains treats the identifier as a regular-expression pattern)
    expression = data[data['Accession '].str.contains(i)]

    # one line per virus, all drawn on the same axes
    plt.plot(expression.loc[:, 'Oligo_start_position'], expression.loc[:, 'eGFP_expression (a.u)'])

# the title and axis labels are the same for every virus, so set them once
plt.title('GFP expression over absolute genome')
plt.ylabel('GFP expression')
plt.xlabel('gene length in base pairs')

In [None]:
# plot relative expression levels across the length of the genome

for accession in viruses:
    # subset of the table whose accession string contains this identifier
    expression = data[data['Accession '].str.contains(accession)]
    expression_array = expression.values

    # value in column 6 of each row divided by the value in column 6 of the
    # last row (column 6 is presumably 'Oligo_start_position' - confirm)
    position = np.array(
        [row[6] / expression_array[-1, 6] for row in expression_array],
        dtype=float,
    )

    plt.plot(position, expression['eGFP_expression (a.u)'])
    plt.title('GFP expression over relative genome length')
    plt.ylabel('GFP expression')
    plt.xlabel('relative gene length (sequence position/total gene length)')


In [None]:
# create an array of the virus class for each accession number
########### end of sarah's work ##############