# Further Analysis of Neurotransmitter Genes and Patient Survival

Now we can focus on determining the effect that the expression of our neurotransmitter genes of interest has on patient survival rates.

We can start by importing the libraries and data tables we need.

## Part 0: Data Preparation

In [1]:
# importing data analysis libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Silence pandas' SettingWithCopyWarning for the whole notebook.
# This only hides the warning; it does not change how assignments behave.
pd.set_option('mode.chained_assignment', None)

In [20]:
def filter_survival_table(file_path):
    """Load a survival CSV and keep only the columns used for survival analysis.

    Parameters
    ----------
    file_path : str or path-like
        Location of the comma-separated survival table to read.

    Returns
    -------
    pandas.DataFrame
        All rows of the file, restricted to whichever of 'submitter_id',
        'Disease Free Status', 'Overall Survival (Months)' and
        'Overall Survival Status' are present. Requested columns that are
        missing from the file are skipped rather than raising.
    """
    wanted_columns = ['submitter_id', 'Disease Free Status',
                      'Overall Survival (Months)', 'Overall Survival Status']

    # Read from the path supplied by the caller rather than a fixed file,
    # so the function can be reused for other survival tables.
    full_table = pd.read_csv(file_path, delimiter=',')
    survival_filtered = full_table[full_table.columns.intersection(wanted_columns)]

    return survival_filtered


In [3]:
# Load the complete TCGA PAAD survival table (all columns, unfiltered)
survival = pd.read_csv('datasets/TCGA_PAAD_survival_data.csv', sep=',')

In [4]:
survival

Unnamed: 0,Cancer_type,submitter_id,PNI,LVI,Filename,Diagnosis Age,Sex,Race Category,Disease Free (Months),Disease Free Status,Overall Survival (Months),Overall Survival Status
0,PAAD,TCGA-HZ-7923,0,0,42bec5f7-7623-42e6-bbdf-514fe3805940.htseq.cou...,65,Male,WHITE,10.32,DiseaseFree,10.32,LIVING
1,PAAD,TCGA-US-A776,0,0,19a3b9bb-d4cb-4925-a87a-57f724141a67.htseq.cou...,61,Male,WHITE,39.95,DiseaseFree,39.95,LIVING
2,PAAD,TCGA-HZ-A77Q,0,0,03630a0c-aa97-4e28-bac9-0206fff669cd.htseq.cou...,55,Female,WHITE,1.08,DiseaseFree,1.08,LIVING
3,PAAD,TCGA-2J-AABA,0,0,c4cadcbb-ae87-43a5-811c-6bedda1d1d8d.htseq.cou...,55,Male,WHITE,15.51,Recurred/Progressed,19.94,DECEASED
4,PAAD,TCGA-F2-6879,0,0,16c63027-f745-41c4-a5e8-f6d9f1fbf1c8.htseq.cou...,57,Male,WHITE,6.01,Recurred/Progressed,10.97,DECEASED
...,...,...,...,...,...,...,...,...,...,...,...,...
149,PAAD,TCGA-2J-AABO,1,-,c642e018-f0cb-4be8-9b19-c944f1daf9cf.htseq.cou...,43,Male,WHITE,14.45,Recurred/Progressed,14.45,LIVING
150,PAAD,TCGA-H6-A45N,1,-,1f997074-0020-47e6-9928-5bf7209c552d.htseq.cou...,88,Female,WHITE,11.93,Recurred/Progressed,13.83,DECEASED
151,PAAD,TCGA-2L-AAQI,1,-,d5612378-33e0-4fe5-ad2f-a1887fb7b5cd.htseq.cou...,66,Male,WHITE,,,3.38,DECEASED
152,PAAD,TCGA-2L-AAQJ,1,-,d7d3fe8e-3885-44e8-934c-50f2a3bbfb2f.htseq.cou...,49,Female,WHITE,,,12.94,DECEASED


In [21]:
# Reduced survival table holding only the ID, status and overall-survival columns
survival_filtered = filter_survival_table(file_path='datasets/TCGA_PAAD_survival_data.csv')

survival_filtered

Unnamed: 0,submitter_id,Disease Free Status,Overall Survival (Months),Overall Survival Status
0,TCGA-HZ-7923,DiseaseFree,10.32,LIVING
1,TCGA-US-A776,DiseaseFree,39.95,LIVING
2,TCGA-HZ-A77Q,DiseaseFree,1.08,LIVING
3,TCGA-2J-AABA,Recurred/Progressed,19.94,DECEASED
4,TCGA-F2-6879,Recurred/Progressed,10.97,DECEASED
...,...,...,...,...
149,TCGA-2J-AABO,Recurred/Progressed,14.45,LIVING
150,TCGA-H6-A45N,Recurred/Progressed,13.83,DECEASED
151,TCGA-2L-AAQI,,3.38,DECEASED
152,TCGA-2L-AAQJ,,12.94,DECEASED


Attempt 1: filter the RNA-seq table using the `Filename` column of the survival table.

In [5]:
# Load the RNA-seq count table and discard its 'Unnamed: 0' column in one step,
# so all_rnaseq is never mutated in place after it is created.
all_rnaseq = (
    pd.read_csv('datasets/Sophia_PAAD_RNAseq.csv', delimiter=',')
    .drop(columns='Unnamed: 0')
)

In [6]:
def create_counts_list(patient_table):
    """Return the 'Filename' entries of ``patient_table`` with '-' replaced by '.'.

    Parameters
    ----------
    patient_table : pandas.DataFrame
        Table with a 'Filename' column of strings.

    Returns
    -------
    list of str
        One converted file name per row, in row order.
    """
    # The Filename column uses '-' where the RNASeq table's column names use '.',
    # so convert every name before it is matched against those columns.
    return [name.replace('-', '.') for name in patient_table['Filename'].tolist()]

In [11]:
def filter_patients(rnaseq_table, patient_table=None):
    """Keep only the RNA-seq count columns whose names match patient file names.

    Parameters
    ----------
    rnaseq_table : pandas.DataFrame
        Count table with an 'hgnc_symbol' column plus one column per count file.
    patient_table : pandas.DataFrame, optional
        Table with a 'Filename' column. When omitted, the notebook-level
        ``survival`` table is used, which keeps existing one-argument calls working.

    Returns
    -------
    pandas.DataFrame
        'hgnc_symbol' as the first column, followed by every column of
        ``rnaseq_table`` whose name appears in the converted file-name list.
    """
    if patient_table is None:
        # Backward-compatible default: fall back to the global survival table.
        patient_table = survival

    count_files = create_counts_list(patient_table)

    # Select from the table that was passed in (not the global all_rnaseq), and
    # copy so the insert below never touches the caller's frame.
    rnaseq_survival = rnaseq_table[rnaseq_table.columns.intersection(count_files)].copy()
    rnaseq_survival.insert(0, 'hgnc_symbol', rnaseq_table['hgnc_symbol'])

    return rnaseq_survival

In [12]:
# Route 1: RNA-seq columns selected through the survival table's Filename column
rnaseq_survival = filter_patients(rnaseq_table=all_rnaseq)

In [13]:
rnaseq_survival

Unnamed: 0,hgnc_symbol,a0f5f7d4.88e0.4f3b.853b.e1e4f6bca748.htseq.counts.gz,a19219b1.db59.4493.83ef.e938e2ffdefd.htseq.counts.gz,a2a33be8.232b.44bf.a003.349017a5bc5a.htseq.counts.gz,a6ad90fe.ccfe.47ce.9e5a.95f5e7acf761.htseq.counts.gz,a995d6ba.19c7.498f.b2d8.3f9b4b4826a1.htseq.counts.gz,a9fbe593.42f7.4597.a61c.240408bbe203.htseq.counts.gz,aab761be.87c9.41a2.99b8.8bed9032333c.htseq.counts.gz,aec2e0c7.4792.41af.873c.3f3a53ec6d38.htseq.counts.gz,aeeb2bc3.c26d.4988.b4d6.425c7b2db8ae.htseq.counts.gz,...,f144de50.6126.4912.9c94.824d1eb0fac5.htseq.counts.gz,f2389819.b8fc.460e.821c.01dba313cce1.htseq.counts.gz,f6bd7191.a820.4d86.927a.b4b5f88ebd67.htseq.counts.gz,f748bf78.4dc1.47ad.8611.8186479d3e4b.htseq.counts.gz,f8551a29.d4bd.4954.bf9c.8e10265063de.htseq.counts.gz,f9f63982.b0ee.4cb8.8de5.f885d82137f0.htseq.counts.gz,fb65f821.92cb.402a.ad2f.d4044ca7de4d.htseq.counts.gz,fcd43085.7338.43fe.bc25.9d87b04e227f.htseq.counts.gz,feb22766.4282.47c8.bfe2.7d020b4a15d4.htseq.counts.gz,fef65b57.c58d.4050.8de4.f09f5cd616ce.htseq.counts.gz
0,A1BG,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,A1BG-AS1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,A1CF,3250,2529,4963,4286,3840,678,4350,6513,3015,...,2424,2757,4434,4202,2321,2500,4916,6074,3980,2784
3,A2M,72,65,71,76,39,51,38,37,36,...,42,70,50,49,75,95,74,108,17,100
4,A2M-AS1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37333,ZYG11AP1,8,7,19,18,2,2,40,1,25,...,6,6,10,32,28,82,0,74,8,22
37334,ZYG11B,6618,4388,3195,3185,1231,1312,1759,3044,1821,...,2067,2942,2992,3023,4155,1693,2145,2870,3623,2814
37335,ZYX,771,551,1058,1032,960,353,1258,4284,1458,...,2489,741,2192,1739,3457,1376,2757,2559,1388,1098
37336,ZYXP1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


The tests came out a little odd: the count files are not exact matches.

### Let's try to use the submitter_id to filter instead, using the PAAD patients file for filenames/count_files

In [23]:
def filter_by_submitter_id(rnaseq_table, patient_table,
                           patients_path='datasets/Sophia_PAAD_patients.csv'):
    """Keep the RNA-seq count columns that belong to the given patients.

    File names are looked up through 'submitter_id' in the PAAD patients file
    rather than being taken from ``patient_table`` directly.

    Parameters
    ----------
    rnaseq_table : pandas.DataFrame
        Count table with an 'hgnc_symbol' column plus one column per count file.
    patient_table : pandas.DataFrame
        Table with a 'submitter_id' column naming the patients to keep.
    patients_path : str or path-like, optional
        CSV with 'submitter_id' and 'Filename' columns used for the lookup.
        Defaults to the PAAD patients file used by this notebook.

    Returns
    -------
    pandas.DataFrame
        'hgnc_symbol' as the first column, followed by every column of
        ``rnaseq_table`` whose name matches a looked-up file name.
    """
    submitter_id_list = patient_table['submitter_id'].tolist()
    all_paad_patients = pd.read_csv(patients_path, delimiter=',')

    # Author's note for the current data: nothing was filtered here, all
    # submitter ids were present in both tables.
    paad_patients = all_paad_patients[all_paad_patients['submitter_id'].isin(submitter_id_list)]
    counts = create_counts_list(paad_patients)

    # Select from the table that was passed in (not the global all_rnaseq), and
    # copy so the insert below never touches the caller's frame.
    rnaseq_survival = rnaseq_table[rnaseq_table.columns.intersection(counts)].copy()
    rnaseq_survival.insert(0, 'hgnc_symbol', rnaseq_table['hgnc_symbol'])
    return rnaseq_survival

# Route 2: RNA-seq columns selected by looking file names up through submitter_id
rnaseq_survival_byid = filter_by_submitter_id(rnaseq_table=all_rnaseq, patient_table=survival)

In [25]:
rnaseq_survival_byid

Unnamed: 0,hgnc_symbol,a0f5f7d4.88e0.4f3b.853b.e1e4f6bca748.htseq.counts.gz,a19219b1.db59.4493.83ef.e938e2ffdefd.htseq.counts.gz,a2a33be8.232b.44bf.a003.349017a5bc5a.htseq.counts.gz,a6ad90fe.ccfe.47ce.9e5a.95f5e7acf761.htseq.counts.gz,a995d6ba.19c7.498f.b2d8.3f9b4b4826a1.htseq.counts.gz,a9fbe593.42f7.4597.a61c.240408bbe203.htseq.counts.gz,aab761be.87c9.41a2.99b8.8bed9032333c.htseq.counts.gz,aec2e0c7.4792.41af.873c.3f3a53ec6d38.htseq.counts.gz,aeeb2bc3.c26d.4988.b4d6.425c7b2db8ae.htseq.counts.gz,...,f144de50.6126.4912.9c94.824d1eb0fac5.htseq.counts.gz,f2389819.b8fc.460e.821c.01dba313cce1.htseq.counts.gz,f6bd7191.a820.4d86.927a.b4b5f88ebd67.htseq.counts.gz,f748bf78.4dc1.47ad.8611.8186479d3e4b.htseq.counts.gz,f8551a29.d4bd.4954.bf9c.8e10265063de.htseq.counts.gz,f9f63982.b0ee.4cb8.8de5.f885d82137f0.htseq.counts.gz,fb65f821.92cb.402a.ad2f.d4044ca7de4d.htseq.counts.gz,fcd43085.7338.43fe.bc25.9d87b04e227f.htseq.counts.gz,feb22766.4282.47c8.bfe2.7d020b4a15d4.htseq.counts.gz,fef65b57.c58d.4050.8de4.f09f5cd616ce.htseq.counts.gz
0,A1BG,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,A1BG-AS1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,A1CF,3250,2529,4963,4286,3840,678,4350,6513,3015,...,2424,2757,4434,4202,2321,2500,4916,6074,3980,2784
3,A2M,72,65,71,76,39,51,38,37,36,...,42,70,50,49,75,95,74,108,17,100
4,A2M-AS1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37333,ZYG11AP1,8,7,19,18,2,2,40,1,25,...,6,6,10,32,28,82,0,74,8,22
37334,ZYG11B,6618,4388,3195,3185,1231,1312,1759,3044,1821,...,2067,2942,2992,3023,4155,1693,2145,2870,3623,2814
37335,ZYX,771,551,1058,1032,960,353,1258,4284,1458,...,2489,741,2192,1739,3457,1376,2757,2559,1388,1098
37336,ZYXP1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


To summarize the sections above: the first route pulled the `Filename` values directly out of the survival table and used them to select the matching columns of our RNA-seq table.

The second route pulled `submitter_id` out of the survival table, looked up the corresponding `Filename` values in the PAAD patients file, and then used those to filter the columns of the RNA-seq table.

In [24]:
def difference_between_two_lists(list1, list2):
    """Return the distinct items of ``list1`` that do not occur in ``list2``.

    The comparison is one-directional (items only in ``list2`` are not reported)
    and the order of the returned list is not defined, because it is built from a set.
    """
    first_items = set(list1)
    second_items = set(list2)
    return list(first_items.difference(second_items))

# Columns picked up by the submitter_id route that the Filename route did not select
difference_between_two_lists(list1=rnaseq_survival_byid.columns.values,
                             list2=rnaseq_survival.columns.values)

['cd1bba0f.b2e8.45e3.9b37.dcec5472cb7b.htseq.counts.gz']