In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib

Using matplotlib backend: MacOSX


In [4]:
"""Script to combine the non varaint and gene data to display only intersection."""
import timeit
import math

start_time = timeit.default_timer()


def conv_to_string(gene_list, index):
    """Join one field of every record into a comma-separated string.

        For each record in gene_list the element at position 'index' is
        converted with str(); the pieces are joined with ',' in the
        order the records appear. An empty gene_list yields ''.

        gene_list: list of indexable records
        index: int, position of the field to extract from each record
    """
    fields = []
    for record in gene_list:
        fields.append(str(record[index]))
    return ",".join(fields)


def conv_to_string_NVR(gene_list, index, start_val, stop_val):
    """Express one field of every record as a percentage of the region length.

        For each record in gene_list the value at position 'index' is
        divided by the length of the region [start_val, stop_val] and
        multiplied by 100. NaN results are replaced by 0. The values are
        converted with str() and joined with ','.

        The region length is counted inclusively (stop_val - start_val + 1),
        matching the inclusive 'stop - start + 1' lengths that
        add_to_region stores in the records. With the exclusive length a
        gene covering the whole region would report more than 100 percent
        and a single-base region would divide by zero.

        gene_list: list of indexable records
        index: int, position of the overlap length inside each record
        start_val: int, first coordinate of the region (inclusive)
        stop_val: int, last coordinate of the region (inclusive)
    """
    # Inclusive length of the region; float() forces true division on Python 2.
    region_length = float(stop_val - start_val + 1)
    percentages = [item[index] / region_length * 100 for item in gene_list]
    # A NaN overlap (e.g. missing coordinate upstream) is reported as 0.
    cleaned = [0 if math.isnan(value) else value for value in percentages]
    return ','.join(str(value) for value in cleaned)

def conv_to_string_gene(gene_list, index, index2):
    """Express one field of every record as a percentage of another field.

        For each record in gene_list the value at position 'index' is
        divided by the value at position 'index2' (converted to float)
        and multiplied by 100. NaN results are replaced by 0. The values
        are converted with str() and joined with ','.

        gene_list: list of indexable records
        index: int, position of the numerator inside each record
        index2: int, position of the denominator inside each record
    """
    ratios = []
    for record in gene_list:
        ratios.append(record[index] / float(record[index2]) * 100)

    cleaned = []
    for ratio in ratios:
        if math.isnan(ratio):
            cleaned.append(0)
        else:
            cleaned.append(ratio)

    return ','.join([str(value) for value in cleaned])



def add_to_region(list_of_genes, data_frame, start, stop, type_overlap):
    """Append one overlap record per row of data_frame to list_of_genes.

    Columns are read by position: column 1 is the gene start, column 2 the
    gene stop and column 3 the gene name. Each appended record has the form

        [gene_start, gene_stop, gene_name, type_overlap, diff_region, diff_gene]

    where diff_gene is the inclusive gene length and diff_region is the
    inclusive length of the part of the gene inside [start, stop]:

        type_overlap 1, 4        -> gene_stop - start + 1
        type_overlap 2, 3, 5, 6  -> stop - start + 1
        type_overlap 8, 9        -> stop - gene_start + 1
        anything else (e.g. 7)   -> the whole gene length

    list_of_genes: list, extended in place and also returned
    data_frame: pandas DataFrame of the genes that matched this overlap type
    start: int, first coordinate of the region
    stop: int, last coordinate of the region
    type_overlap: int, code of the overlap case the rows belong to
    """
    # range() rather than xrange() so the function runs on Python 2 and 3.
    for i in range(len(data_frame)):
        # Look each cell up once instead of repeating the .iloc access.
        gene_start = data_frame.iloc[i, 1]
        gene_stop = data_frame.iloc[i, 2]
        gene_name = data_frame.iloc[i, 3]

        diff_gene = gene_stop - gene_start + 1
        if type_overlap in (1, 4):
            # Gene ends inside the region: overlap runs from region start to gene stop.
            diff_region = gene_stop - start + 1
        elif type_overlap in (2, 3, 5, 6):
            # Gene spans the entire region.
            diff_region = stop - start + 1
        elif type_overlap in (8, 9):
            # Gene begins inside the region: overlap runs from gene start to region stop.
            diff_region = stop - gene_start + 1
        else:
            # Gene lies completely inside the region.
            diff_region = gene_stop - gene_start + 1

        list_of_genes.append([gene_start, gene_stop, gene_name,
                              type_overlap, diff_region, diff_gene])
    return list_of_genes

# Read the two inputs: the gene table and the region table (either the NVR or
# the VR file). Both are read with header=None, so column names are supplied.
# 'chr' is forced to str so that numeric-looking chromosome labels ('1'..'22')
# always compare equal to the string labels in all_chr, whatever dtype pandas
# would otherwise infer for the column.
df = pd.read_csv('gene_start_stop_no_dups.csv', names=['chr', 'start', 'stop', 'name'],
                 header=None, dtype={'chr': str})
df1 = pd.read_csv('../variable_regions_update.txt', sep='\t', names=['chr', 'start', 'stop'],
                  header=None, dtype={'chr': str})

# Chromosomes to process; rows with any other 'chr' value are ignored.
all_chr = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13',
           '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y']

# Iterate through each region and write the genes that overlap it.
# The file is opened in write mode: the header is emitted unconditionally, so
# appending would duplicate the header and every row each time the cell is re-run.
with open('variable_regions_and_genes_update.txt', 'w') as myfile:

    # add the headers to the file
    string = '\t'.join(['chr', 'VR_start', 'VR_stop', 'Overlapping_Gene_Start',
                        'Overlapping_Gene_Stop', 'Overlapping_GeneID', 'Overlap_Type',
                        'Number_of_Overlapping_Genes', 'Percentage_of_Gene_Overlapping_VR',
                        'Percentage_of_VR_Overlapping_Gene']) + '\n'
    myfile.write(string)

    for chrm in all_chr:

        # restrict both tables to the current chromosome
        to_keep = [chrm]
        df_gene = df[df['chr'].isin(to_keep)]

        df_nvr = df1[df1['chr'].isin(to_keep)]
        df_nvr = df_nvr.sort_values(by='start')

        # the gene coordinate columns do not change while scanning the regions
        gene_start = df_gene['start']
        gene_stop = df_gene['stop']

        # records [gene_start, gene_stop, name, overlap type, overlap length,
        # gene length] collected for the region currently being processed
        gene_non_variable_regions = []

        for i in range(len(df_nvr)):

            # start and stop of the current region (columns 1 and 2)
            nvr_start = df_nvr.iloc[i, 1]
            nvr_stop = df_nvr.iloc[i, 2]

            # The nine overlap cases, keyed by how the gene's start and stop
            # compare with the region's start and stop. Order matters: records
            # are appended in this order and written out in it.
            # NOTE(review): a gene whose stop equals nvr_start (and starts
            # before it), or whose start equals nvr_stop (and ends after it),
            # matches none of these cases when nvr_start < nvr_stop, although
            # the lengths elsewhere are counted inclusively - confirm intended.
            overlap_cases = [
                (1, (gene_start < nvr_start) & (gene_stop < nvr_stop) & (gene_stop > nvr_start)),
                (2, (gene_start < nvr_start) & (gene_stop == nvr_stop)),
                (3, (gene_start < nvr_start) & (gene_stop > nvr_stop)),
                (4, (gene_start == nvr_start) & (gene_stop < nvr_stop)),
                (5, (gene_start == nvr_start) & (gene_stop == nvr_stop)),
                (6, (gene_start == nvr_start) & (gene_stop > nvr_stop)),
                (7, (gene_start > nvr_start) & (gene_stop < nvr_stop)),
                (8, (gene_start > nvr_start) & (gene_stop == nvr_stop)),
                (9, (gene_start > nvr_start) & (gene_stop > nvr_stop) & (gene_start < nvr_stop)),
            ]
            for overlap_type, mask in overlap_cases:
                df_result = df_gene[mask]
                gene_non_variable_regions = add_to_region(
                    gene_non_variable_regions, df_result, nvr_start, nvr_stop, overlap_type)

            # If there is an overlap, turn each field of the collected records
            # into a comma-separated string; otherwise put '-' in the gene
            # columns and 0 in the count and percentage columns.
            if len(gene_non_variable_regions) > 0:
                fields = [chrm, str(nvr_start), str(nvr_stop),
                          conv_to_string(gene_non_variable_regions, 0),
                          conv_to_string(gene_non_variable_regions, 1),
                          conv_to_string(gene_non_variable_regions, 2),
                          conv_to_string(gene_non_variable_regions, 3),
                          str(len(gene_non_variable_regions)),
                          conv_to_string_gene(gene_non_variable_regions, 4, 5),
                          conv_to_string_NVR(gene_non_variable_regions, 4, nvr_start, nvr_stop)]
            else:
                fields = [chrm, str(nvr_start), str(nvr_stop),
                          '-', '-', '-', '-', str(0), str(0), str(0)]
            string = '\t'.join(fields) + '\n'
            myfile.write(string)

            # start the next region with an empty record list
            gene_non_variable_regions = []

        # single-argument print(...) behaves the same on Python 2 and 3
        print("finished " + str(chrm))

stop_time = timeit.default_timer()
print("TIME  finished")
print(stop_time - start_time)

finished 1
finished 2
finished 3
finished 4
finished 5
finished 6
finished 7
finished 8
finished 9
finished 10
finished 11
finished 12
finished 13
finished 14
finished 15
finished 16
finished 17
finished 18
finished 19
finished 20
finished 21
finished 22
finished X
finished Y
TIME  finished
344.199320078
