In [None]:
# Pancreatic Cancer Detection Notebook

In [5]:
import pandas as pd
import numpy as np

# Data Loading and Preprocessing 

Ideas we should probably do for both datasets:
    - load data. remove any Null/NaN data, see if there are negative values and remove those if applicable.
    - if blank data cells, impute the data. scale the appropriate data columns


    - split the dataset into X and y subsets for passing into models
        - which really means also split into 80/20; 80% training(validation included) and 20% test

    - Note: we should do some data visualizations --> heatmaps? histograms of label distributions for proof
        - of good splits. mutual information??? (need to better understand if its useful)


In [6]:
# Loading urinary biomarker dataset
urinary_df = pd.read_csv('urinaryBiomarkerData.csv')
urinary_df = urinary_df.drop(columns=['sample_id','patient_cohort','sample_origin','age','sex','stage','benign_sample_diagnosis','plasma_CA19_9','REG1A'])
print(urinary_df)

     diagnosis  creatinine     LYVE1       REG1B         TFF1
0            1     1.83222  0.893219   52.948840   654.282174
1            1     0.97266  2.037585   94.467030   209.488250
2            1     0.78039  0.145589  102.366000   461.141000
3            1     0.70122  0.002805   60.579000   142.950000
4            1     0.21489  0.000860   65.540000    41.088000
..         ...         ...       ...         ...          ...
585          3     0.52026  7.058209  156.241000   525.178000
586          3     0.85956  8.341207   16.915000   245.947000
587          3     1.36851  7.674707  289.701000   537.286000
588          3     1.33458  8.206777  205.930000   722.523000
589          3     1.50423  8.200958  411.938275  2021.321078

[590 rows x 5 columns]


In [14]:
# Preprocessing urinary biomarker dataset
# Note: There are no negative or Null/NaN values in the dataset
# So, no need to remove samples or impute the data
# Scale the appropriate data columns

diagnosis     0
creatinine    0
LYVE1         0
REG1B         0
TFF1          0
dtype: int64

# Model Creation

- we want to initialize all 4 models, probably default hyperparameters for most values.
    - it's 4 models per dataset, so we'll have 8 in total

# Training and Validation

- Train the models using 10-fold cross validation 
    - we can probably do this in 2 separate cross-validation loops. 
        - first one is for urinary dataset and we train/validate all 4 models (get their scores and loss? too)
        - repeat for second dataset

# Testing Models

- Test each of the models on the appropriate dataset and store values


# Significance Analysis of Results

- Do the McNemar test here for each of the models (8?) to compare

# Comparison Across Datasets

- take urinary dataset models and try to predict on other dataset's test samples and vice versa
    - draw conclusions on the accuracy percentage and based on this, we can determine if the urinary biomarkers
        - are generalizeable to use on other data for predicting pancreatic cancer

# ------------- NON URINARY DATA WORK --------

### Normal pancreas RNA-seq data

In [58]:
# Normal pancreas dataset 1
markers = ['LYVE1','TFF1','REG1B','GPX1']

rawPancNormData1 = pd.read_csv("pancreaticNormalSeqData/GSE205163_znf808_ko_raw_counts_S0-S4.tsv",  sep='\t')
filtRawPancNormData1 = rawPancNormData1[rawPancNormData1['Gene'].str.endswith(('LYVE1','REG1B','TFF1','GPX1'))]
filtRawPancNormData1 = filtRawPancNormData1.set_index('Gene')
filtRawPancNormData1 = filtRawPancNormData1.transpose()
filtRawPancNormData1 = filtRawPancNormData1.reset_index(drop=True)
filtRawPancNormData1.columns = markers
print(filtRawPancNormData1)

    LYVE1  TFF1  REG1B  GPX1
0       1     0      0  5925
1       0     0      0  6520
2       0    14      0  4899
3       0     7      1  5882
4       0     4      0  4279
5       0    12      0  3314
6       0     1      0  4659
7       0     0      0  4370
8       0    11      0  4033
9       0     5      0  2866
10      1     0      0  7057
11      1     0      0  5737
12      0     6      0  5096
13      0     8      2  4481
14      2     4      0  3867
15      0    11      0  4731
16      0     4      0  4539
17      0     2      0  4218
18      1     6      0  3802
19      0    10      0  3586
20      3     0      0  8534
21      0     0      0  7313
22      0    12      0  7994
23      0     8      1  5321
24      0     3      0  4551
25      3     5      0  3980
26      0     0      0  3512
27      0     3      0  3196
28      0     7      0  2899
29      2     9      0  2725


In [49]:
# Normal pancreas dataset 2
rawPancNormData2 = pd.read_csv("pancreaticNormalSeqData/GSE216854_normalized_counts.txt", sep='\t')
filtRawPancNormData2 = rawPancNormData2[rawPancNormData2['gene'].isin(markers)]
filtRawPancNormData2 = filtRawPancNormData2.set_index('gene')
filtRawPancNormData2 = filtRawPancNormData2.transpose()
filtRawPancNormData2 = filtRawPancNormData2.reset_index(drop=True)
print(filtRawPancNormData2)

gene     REG1B         GPX1       TFF1     LYVE1
0     0.000000  1338.152011   0.743831  1.487662
1     0.000000  1317.741727  11.420428  0.000000
2     0.000000  1233.423715   7.913809  0.000000
3     0.000000   987.178392   7.446389  2.127540
4     0.000000   914.030573  10.528101  0.000000
5     0.000000   791.104182  18.776207  1.251747
6     0.000000   911.572944   5.683123  2.273249
7     0.000000   833.529266   6.872444  0.000000
8     0.000000   781.476317   1.154322  1.154322
9     0.000000  1594.628915   0.000000  0.000000
10    0.000000  1626.013137   0.832999  0.000000
11    0.000000  1660.088216   0.996452  0.000000
12    0.000000  2062.778781   0.000000  0.000000
13    0.000000  1804.216658   0.000000  0.000000
14    0.000000  1785.490649   0.000000  0.000000
15    1.186684  3163.700430   2.373369  2.373369
16    0.000000  2551.999220   0.000000  0.000000
17    0.000000  2735.777773   0.857342  0.000000
18    0.000000   967.223814   1.952673  0.000000
19    0.000000  1044

In [57]:
# Normal pancreas dataset 3
rawPancNormData3 = pd.read_csv("pancreaticNormalSeqData/GSE228662_RNA_raw_read_counts.tsv", sep='\t')
filtRawPancNormData3 = rawPancNormData3[rawPancNormData3['symbol'].isin(markers)]
filtRawPancNormData3 = filtRawPancNormData3.set_index('symbol')
filtRawPancNormData3 = filtRawPancNormData3.drop(columns=['chrom','start','end','gene'])
filtRawPancNormData3 = filtRawPancNormData3.transpose()
filtRawPancNormData3 = filtRawPancNormData3.reset_index(drop=True)
print(filtRawPancNormData3)

symbol  LYVE1  TFF1  REG1B  GPX1
0           0   117      0  2187
1           0   189      0  2722
2           1   127      0  1834
3           1    87      0  1795
4           0   122      0  1796
..        ...   ...    ...   ...
62          0    40      0  1087
63          2    25      0  1412
64          0    14      0  1539
65          1    26      0  1208
66          0    22      0  1436

[67 rows x 4 columns]


In [None]:
# Merging all 3 normal pancreas datasets
filtRawPancNormData3
#1: LYVE1  TFF1  REG1B  GPX1
#2: REG1B  GPX1  TFF1   LYVE1
#3: LYVE1  TFF1  REG1B  GPX1

# swap cols in dataset 2
# merge all 3 datasets

In [59]:
# Processing normal pancreas data

# impute data
# drop NA?

### Pancreatic cancer RNA-seq data

In [43]:
rawPancCancData1 = pd.read_csv("pancreaticCancerSeqData/GSE232860_allsamples.deseq.normalized.counts.csv")
#print(rawPancCancData1.shape)


#Slow loading, 48,553 rows
#rawPancCancData2 = pd.read_excel("pancreaticCancerSeqData/GSE245306_FKPM.xlsx")
#print(rawPancCancData2.shape)

#59050 rows
rawPancCancData3 = pd.read_csv("pancreaticCancerSeqData/tumor.counts.sub.tsv", sep='\t')
#print(rawPancCancData3)

In [65]:
#Only has Reg1, not Reg1B or Reg1A, so we take Reg1 and apply it to both rows
rowsToKeep = ["Gpx1", "Lyve1", "Reg1", "Tff1", "Reg1"]
rawPancCancData1.rename(columns={'Unnamed: 0': 'GeneNames'}, inplace=True)
print(rawPancCancData1.shape)

filtRawPancCancData1 = rawPancCancData1[rawPancCancData1['GeneNames'].isin(rowsToKeep)]
filtRawPancCancData1 = filtRawPancCancData1.transpose()
filtRawPancCancData1.reset_index(inplace=True, drop=True)
filtRawPancCancData1.columns = filtRawPancCancData1.iloc[0]
filtRawPancCancData1 = filtRawPancCancData1[1:]

filtRawPancCancData1["Reg1B"] = filtRawPancCancData1["Reg1"].copy()
filtRawPancCancData1.rename(columns={'Reg1': 'Reg1A'}, inplace=True)

# Impute missing values in each column if needed
filtRawPancCancData1 = filtRawPancCancData1.fillna(filtRawPancCancData1.median())
print(filtRawPancCancData1)





(17966, 23)
0        Tff1          Reg1A       Lyve1          Gpx1          Reg1B
1   86.519650    1152.208802  111.477242   7915.716067    1152.208802
2   99.259037    2032.652444   81.133473   6992.151782    2032.652444
3   95.638935    6535.034757   80.722771   8339.013202    6535.034757
4   42.385798   10158.844770   61.860354   7007.403411   10158.844770
5    6.260937   39103.310850  152.766873  10529.644580   39103.310850
6   33.793363    6810.356633  189.839188   7140.338886    6810.356633
7    4.244787    3628.231888  217.545346   8391.944362    3628.231888
8    2.204342   60100.287620  598.478904   6179.873328   60100.287620
9    4.750487    3313.939555  153.915771   7932.362771    3313.939555
10  21.847610   16311.226750  294.942730   6057.746312   16311.226750
11  42.165889    2393.872495  151.413873   8880.711134    2393.872495
12  35.099226    1474.167487  128.697162   6995.958204    1474.167487
13   0.000000   82462.293950   26.049017   8902.410494   82462.293950
14   7.9