In [None]:
# Pancreatic Cancer Detection Notebook

In [71]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


# Data Loading and Preprocessing 

Ideas we should probably do for both datasets:
    - load data. remove any Null/NaN data, see if there are negative values and remove those if applicable.
    - if blank data cells, impute the data. scale the appropriate data columns


    - split the dataset into X and y subsets for passing into models
        - which really means also split into 80/20; 80% training(validation included) and 20% test

    - Note: we should do some data visualizations --> heatmaps? histograms of label distributions for proof
        - of good splits. mutual information??? (need to better understand if its useful)


In [6]:
# Loading urinary biomarker dataset
urinary_df = pd.read_csv('urinaryBiomarkerData.csv')
urinary_df = urinary_df.drop(columns=['sample_id','patient_cohort','sample_origin','age','sex','stage','benign_sample_diagnosis','plasma_CA19_9','REG1A'])
print(urinary_df)

     diagnosis  creatinine     LYVE1       REG1B         TFF1
0            1     1.83222  0.893219   52.948840   654.282174
1            1     0.97266  2.037585   94.467030   209.488250
2            1     0.78039  0.145589  102.366000   461.141000
3            1     0.70122  0.002805   60.579000   142.950000
4            1     0.21489  0.000860   65.540000    41.088000
..         ...         ...       ...         ...          ...
585          3     0.52026  7.058209  156.241000   525.178000
586          3     0.85956  8.341207   16.915000   245.947000
587          3     1.36851  7.674707  289.701000   537.286000
588          3     1.33458  8.206777  205.930000   722.523000
589          3     1.50423  8.200958  411.938275  2021.321078

[590 rows x 5 columns]


In [15]:
# Preprocessing urinary biomarker dataset
# NOTE: There are no negative or Null/NaN values in the dataset
    # So, no need to remove samples or impute the data
    # Only need to scale the appropriate data columns
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
val_cols = urinary_df.loc[:, ~urinary_df.columns.isin(['diagnosis'])]
urinary_df[val_cols.columns] = scaler.fit_transform(val_cols)
print(urinary_df)

     diagnosis  creatinine     LYVE1     REG1B      TFF1
0            1    1.529927 -0.631661 -0.299975  0.055876
1            1    0.183680 -0.298597 -0.088256 -0.384680
2            1   -0.117454 -0.849256 -0.047976 -0.135425
3            1   -0.241451 -0.890812 -0.261065 -0.450584
4            1   -1.003143 -0.891378 -0.235767 -0.551475
..         ...         ...       ...       ...       ...
585          3   -0.524871  1.162636  0.226755 -0.071998
586          3    0.006542  1.536048 -0.483726 -0.348568
587          3    0.803662  1.342066  0.907324 -0.060005
588          3    0.750521  1.496923  0.480141  0.123466
589          3    1.016227  1.495229  1.530663  1.409888

[590 rows x 5 columns]


In [26]:
# Split data into training/validation and test (80/20)
urinary_df = urinary_df.sample(frac=1) # randomly shuffle data
eighty_percent = int(len(urinary_df) * 0.8)
train_urinary_data = urinary_df.loc[:eighty_percent-1]
test_urinary_data = urinary_df.loc[eighty_percent:]
print("train_urinary_data: \n", train_urinary_data)
print("test_urinary_data: \n", test_urinary_data)

train_urinary_data: 
      diagnosis  creatinine     LYVE1     REG1B      TFF1
242          2    2.025912 -0.859894 -0.527933 -0.424243
42           1    0.307676  1.233065  0.033406  0.819211
448          3   -1.215708 -0.408099 -0.505832 -0.211488
117          1   -0.223737 -0.890799 -0.569977 -0.553488
331          2    0.077397 -0.747354 -0.302675  0.266234
..         ...         ...       ...       ...       ...
543          3   -0.967715  2.200545  3.062721  0.185451
495          3   -0.542585 -0.243795 -0.446711 -0.239743
98           1    0.165966 -0.810302  0.136955 -0.422724
66           1   -1.073998 -0.891431 -0.565114 -0.592159
471          3   -1.180281  0.339548 -0.285027 -0.547370

[154 rows x 5 columns]
test_urinary_data: 
      diagnosis  creatinine     LYVE1     REG1B       TFF1
472          3   -0.737436  0.740629  0.132556   1.122018
268          2   -0.347733 -0.728078 -0.398214  -0.242570
19           1   -0.542585 -0.721946 -0.500534  -0.327524
180          1   

# Model Creation
5 classifiers: Random Forest Classifier. XGBoost, Support Vector Machine, Gaussian Naive Bayes, and K Neighbors Classifier
- we want to initialize all 5 models, probably default hyperparameters for most values.
    - it's 5 models per dataset, so we'll have 10 in total

In [24]:
# ! pip3 install xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-2.0.2-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (2.2 MB)
[K     |████████████████████████████████| 2.2 MB 3.6 MB/s eta 0:00:01
Installing collected packages: xgboost
Successfully installed xgboost-2.0.2
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [28]:
# NOTE: add/remove/adjust hyperparameters as you see fit
rf = RandomForestClassifier(n_estimators=100, random_state=42)
xgb = xgb.XGBClassifier(n_estimators=100, random_state=42, learning_rate=0.01)
svm = SVC(C=1, kernel="rbf", random_state=42)
gnb = GaussianNB()
knn = KNeighborsClassifier()

# Training and Validation

- Train the models using 10-fold cross validation 
    - we can probably do this in 2 separate cross-validation loops. 
        - first one is for urinary dataset and we train/validate all 5 models (get their scores and loss? too)
        - repeat for second dataset

In [30]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

rf_scores = []
xgb_scores = []
svm_scores = []
gnb_scores = []
knn_scores = []

X = train_urinary_data
kf = KFold(n_splits=10)

for train_index, val_index in kf.split(X): #10 folds
    # set up training data
    train_chunk = X.iloc[train_index]
    y_train = train_chunk['diagnosis']
    X_train = train_chunk.drop(columns=['diagnosis'])

    # set up validation data
    val_chunk = X.iloc[val_index]
    y_val = val_chunk['diagnosis']
    X_val = val_chunk.drop(columns=['diagnosis'])

    # Random Forest
    rf.fit(X_train, y_train)
    rf_pred = rf.predict(X_val)
    rf_score = accuracy_score(y_val, rf_pred)
    rf_scores.append(rf_score)

    # XGBoost
    le = LabelEncoder() # use to convert classes from [1,2,3] to [0,1,2]
    xgb_y_train = le.fit_transform(y_train)
    xgb.fit(X_train, xgb_y_train)
    xgb_pred = xgb.predict(X_val)
    xgb_pred = le.inverse_transform(xgb_pred)
    xgb_score = accuracy_score(y_val, xgb_pred)
    xgb_scores.append(xgb_score)

    # SVM
    svm.fit(X_train, y_train)
    svm_pred = svm.predict(X_val)
    svm_score = accuracy_score(y_val, svm_pred)
    svm_scores.append(svm_score)

    # Gaussian Naive Bayes
    gnb.fit(X_train, y_train)
    gnb_pred = gnb.predict(X_val)
    gnb_score = accuracy_score(y_val, gnb_pred)
    gnb_scores.append(gnb_score)

    # KNearestNeighbors
    knn.fit(X_train, y_train)
    knn_pred = knn.predict(X_val)
    knn_score = accuracy_score(y_val, knn_pred)
    knn_scores.append(knn_score)

print("RF training accuracies: ", rf_scores)
print("RF mean: ", np.mean(rf_scores))
print("RF standard dev: ", np.std(rf_scores), "\n")

print("XGB training accuracies: ", xgb_scores)
print("XGB mean: ", np.mean(xgb_scores))
print("XGB standard dev: ", np.std(xgb_scores), "\n")

print("SVM training accuracies: ", svm_scores)
print("SVM mean: ", np.mean(svm_scores))
print("SVM standard dev: ", np.std(svm_scores), "\n")

print("GNB training accuracies: ", gnb_scores)
print("GNB mean: ", np.mean(gnb_scores))
print("GNB standard dev: ", np.std(gnb_scores), "\n")

print("KNN training accuracies: ", knn_scores)
print("KNN mean: ", np.mean(knn_scores))
print("KNN standard dev: ", np.std(knn_scores), "\n")

RF training accuracies:  [0.5625, 0.5, 0.625, 0.5625, 0.6, 0.5333333333333333, 0.6666666666666666, 0.5333333333333333, 0.5333333333333333, 0.6666666666666666]
RF mean:  0.5783333333333334
RF standard dev:  0.055646453415988485 

XGB training accuracies:  [0.5, 0.5625, 0.4375, 0.625, 0.6, 0.5333333333333333, 0.6666666666666666, 0.6, 0.6, 0.4]
XGB mean:  0.5525
XGB standard dev:  0.08047601437005245 

SVM training accuracies:  [0.6875, 0.5625, 0.625, 0.6875, 0.7333333333333333, 0.4, 0.6, 0.6, 0.5333333333333333, 0.6]
SVM mean:  0.6029166666666665
SVM standard dev:  0.08915206266698363 

GNB training accuracies:  [0.625, 0.5625, 0.625, 0.6875, 0.5333333333333333, 0.2, 0.7333333333333333, 0.5333333333333333, 0.5333333333333333, 0.4666666666666667]
GNB mean:  0.55
GNB standard dev:  0.13935615841751983 

KNN training accuracies:  [0.4375, 0.5, 0.5625, 0.5625, 0.6, 0.5333333333333333, 0.6666666666666666, 0.5333333333333333, 0.3333333333333333, 0.6666666666666666]
KNN mean:  0.539583333333333

# Testing Models

- Test each of the models on the appropriate dataset and store values


In [42]:
# set up test data
X_test = test_urinary_data.drop(columns=['diagnosis'])
y_test = test_urinary_data['diagnosis']

# Random Forest
rf_pred = rf.predict(X_test)
rf_acc = accuracy_score(y_test, rf_pred)
print("RF test accuracy: ", rf_acc)

# XGBoost
le = LabelEncoder() # use to convert classes from [1,2,3] to [0,1,2]
xgb_y_test = le.fit_transform(y_test)
xgb_pred = xgb.predict(X_test)
xgb_pred = le.inverse_transform(xgb_pred)
xgb_acc = accuracy_score(y_test, xgb_pred)
print("XGB test accuracy: ", xgb_acc)

# SVM
svm_pred = svm.predict(X_test)
svm_acc = accuracy_score(y_test, svm_pred)
print("SVM test accuracy: ", svm_acc)

# Gaussian Naive Bayes
gnb_pred = gnb.predict(X_test)
gnb_acc = accuracy_score(y_test, gnb_pred)
print("GNB test accuracy: ", gnb_acc)

# KNearestNeighbors
knn_pred = knn.predict(X_test)
knn_acc = accuracy_score(y_test, knn_pred)
print("KNN test accuracy: ", knn_acc)

RF test accuracy:  0.577639751552795
XGB test accuracy:  0.5838509316770186
SVM test accuracy:  0.5714285714285714
GNB test accuracy:  0.515527950310559
KNN test accuracy:  0.5652173913043478


# Significance Analysis of Results

- Do the McNemar test here for each of the models (10?) to compare

# Comparison Across Datasets

- take urinary dataset models and try to predict on other dataset's test samples and vice versa
    - draw conclusions on the accuracy percentage and based on this, we can determine if the urinary biomarkers
        - are generalizeable to use on other data for predicting pancreatic cancer

# ------------- NON URINARY DATA WORK --------

### Normal pancreas RNA-seq data

In [131]:
# Normal pancreas dataset 1
markers = ['LYVE1','TFF1','REG1B','GPX1']

rawPancNormData1 = pd.read_csv("pancreaticNormalSeqData/GSE205163_znf808_ko_raw_counts_S0-S4.tsv",  sep='\t')
filtRawPancNormData1 = rawPancNormData1[rawPancNormData1['Gene'].str.endswith(('LYVE1','REG1B','TFF1','GPX1'))]
filtRawPancNormData1 = filtRawPancNormData1.set_index('Gene')
filtRawPancNormData1 = filtRawPancNormData1.transpose()
filtRawPancNormData1 = filtRawPancNormData1.reset_index(drop=True)
filtRawPancNormData1.columns = markers
filtRawPancNormData1.rename(columns={"GPX1": "Creatinine"}, inplace=True)
#print(filtRawPancNormData1)


filtRawPancNormData1Scaler = StandardScaler()
# transform data
scaledRawPancNormData1 = filtRawPancNormData1Scaler.fit_transform(filtRawPancNormData1)
scaledRawPancNormData1 = pd.DataFrame(scaledRawPancNormData1, columns=filtRawPancNormData1.columns)
scaledRawPancNormData1["Outcome"] = 0

print(scaledRawPancNormData1)


       LYVE1      TFF1     REG1B  Creatinine  Outcome
0   0.603023 -1.177057 -0.312348    0.761342        0
1  -0.527645 -1.177057 -0.312348    1.162652        0
2  -0.527645  2.075338 -0.312348    0.069336        0
3  -0.527645  0.449140  2.030259    0.732340        0
4  -0.527645 -0.247802 -0.312348   -0.348836        0
5  -0.527645  1.610710 -0.312348   -0.999700        0
6  -0.527645 -0.944743 -0.312348   -0.092537        0
7  -0.527645 -1.177057 -0.312348   -0.287459        0
8  -0.527645  1.378396 -0.312348   -0.514756        0
9  -0.527645 -0.015488 -0.312348   -1.301863        0
10  0.603023 -1.177057 -0.312348    1.524843        0
11  0.603023 -1.177057 -0.312348    0.634542        0
12 -0.527645  0.216826 -0.312348    0.202206        0
13 -0.527645  0.681454  4.372865   -0.212593        0
14  1.733690 -0.247802 -0.312348   -0.626718        0
15 -0.527645  1.378396 -0.312348   -0.043975        0
16 -0.527645 -0.247802 -0.312348   -0.173474        0
17 -0.527645 -0.712429 -0.31

In [134]:
# Normal pancreas dataset 2
rawPancNormData2 = pd.read_csv("pancreaticNormalSeqData/GSE216854_normalized_counts.txt", sep='\t')
filtRawPancNormData2 = rawPancNormData2[rawPancNormData2['gene'].isin(markers)]
filtRawPancNormData2 = filtRawPancNormData2.set_index('gene')
filtRawPancNormData2 = filtRawPancNormData2.transpose()
filtRawPancNormData2 = filtRawPancNormData2.reset_index(drop=True)
filtRawPancNormData2.rename(columns={"GPX1": "Creatinine"}, inplace=True)


columnOrder = ["LYVE1", "TFF1", "REG1B", "Creatinine"]

# Reorder the columns in the DataFrame
filtRawPancNormData2 = filtRawPancNormData2[columnOrder]
#print(filtRawPancNormData2)

filtRawPancNormData1Scaler2 = StandardScaler()
# transform data
scaledRawPancNormData2 = filtRawPancNormData1Scaler2.fit_transform(filtRawPancNormData2)
scaledRawPancNormData2 = pd.DataFrame(scaledRawPancNormData2, columns=filtRawPancNormData2.columns)
scaledRawPancNormData2["Outcome"] = 0
print(scaledRawPancNormData2)


gene     LYVE1      TFF1     REG1B  Creatinine  Outcome
0     1.212172 -0.414569 -0.222204   -0.728370        0
1    -0.601634  2.289087 -0.222204   -0.762900        0
2    -0.601634  1.401099 -0.222204   -0.905547        0
3     1.992331  1.282733 -0.222204   -1.322138        0
4    -0.601634  2.063121 -0.222204   -1.445888        0
5     0.924537  4.151805 -0.222204   -1.653851        0
6     2.169985  0.836218 -0.222204   -1.450046        0
7    -0.601634  1.137392 -0.222204   -1.582078        0
8     0.805753 -0.310619 -0.222204   -1.670140        0
9    -0.601634 -0.602931 -0.222204   -0.294469        0
10   -0.601634 -0.391989 -0.222204   -0.241374        0
11   -0.601634 -0.350597 -0.222204   -0.183727        0
12   -0.601634 -0.602931 -0.222204    0.497534        0
13   -0.601634 -0.602931 -0.222204    0.060106        0
14   -0.601634 -0.602931 -0.222204    0.028425        0
15    2.292054 -0.001918  3.931583    2.360045        0
16   -0.601634 -0.602931 -0.222204    1.325185  

In [136]:
# Normal pancreas dataset 3
rawPancNormData3 = pd.read_csv("pancreaticNormalSeqData/GSE228662_RNA_raw_read_counts.tsv", sep='\t')
filtRawPancNormData3 = rawPancNormData3[rawPancNormData3['symbol'].isin(markers)]
filtRawPancNormData3 = filtRawPancNormData3.set_index('symbol')
filtRawPancNormData3 = filtRawPancNormData3.drop(columns=['chrom','start','end','gene'])
filtRawPancNormData3 = filtRawPancNormData3.transpose()
filtRawPancNormData3 = filtRawPancNormData3.reset_index(drop=True)

filtRawPancNormData3.rename(columns={"GPX1": "Creatinine"}, inplace=True)
columnOrder = ["LYVE1", "TFF1", "REG1B", "Creatinine"]

# Reorder the columns in the DataFrame
filtRawPancNormData3 = filtRawPancNormData3[columnOrder]
#print(filtRawPancNormData3)

filtRawPancNormData1Scaler3 = StandardScaler()
# transform data
scaledRawPancNormData3 = filtRawPancNormData1Scaler3.fit_transform(filtRawPancNormData3)
scaledRawPancNormData3 = pd.DataFrame(scaledRawPancNormData3, columns=filtRawPancNormData3.columns)
scaledRawPancNormData3["Outcome"] = 0
print(scaledRawPancNormData3)


symbol     LYVE1      TFF1     REG1B  Creatinine  Outcome
0      -0.577293  0.366796 -0.160845    1.125271        0
1      -0.577293  1.197512 -0.160845    1.873122        0
2      -0.032524  0.482173 -0.160845    0.631829        0
3      -0.032524  0.020665 -0.160845    0.577313        0
4      -0.577293  0.424485 -0.160845    0.578711        0
..           ...       ...       ...         ...      ...
62     -0.577293 -0.521608 -0.160845   -0.412366        0
63      0.512246 -0.694674 -0.160845    0.041936        0
64     -0.577293 -0.821589 -0.160845    0.219463        0
65     -0.032524 -0.683136 -0.160845   -0.243226        0
66     -0.577293 -0.729287 -0.160845    0.075484        0

[67 rows x 5 columns]


In [137]:
# Concatenate all 3 normal pancreas datasets
#allPancNormData = pd.concat([filtRawPancNormData1, filtRawPancNormData2, filtRawPancNormData3], sort=False)
#print(allPancNormData)

allPancNormData = pd.concat([scaledRawPancNormData1, scaledRawPancNormData2, scaledRawPancNormData3], sort=False)
print(allPancNormData)

# NOTE: allPancNormData already has no zeros or NaN/Null, does not need imputing or more processing

       LYVE1      TFF1     REG1B  Creatinine  Outcome
0   0.603023 -1.177057 -0.312348    0.761342        0
1  -0.527645 -1.177057 -0.312348    1.162652        0
2  -0.527645  2.075338 -0.312348    0.069336        0
3  -0.527645  0.449140  2.030259    0.732340        0
4  -0.527645 -0.247802 -0.312348   -0.348836        0
..       ...       ...       ...         ...      ...
62 -0.577293 -0.521608 -0.160845   -0.412366        0
63  0.512246 -0.694674 -0.160845    0.041936        0
64 -0.577293 -0.821589 -0.160845    0.219463        0
65 -0.032524 -0.683136 -0.160845   -0.243226        0
66 -0.577293 -0.729287 -0.160845    0.075484        0

[139 rows x 5 columns]


### Pancreatic cancer RNA-seq data

In [138]:
rawPancCancData1 = pd.read_csv("pancreaticCancerSeqData/GSE232860_allsamples.deseq.normalized.counts.csv")
#print(rawPancCancData1.shape)


#Slow loading, 48,553 rows
rawPancCancData2 = pd.read_excel("pancreaticCancerSeqData/GSE245306_FKPM.xlsx")
#print(rawPancCancData2.shape)

#59050 rows
rawPancCancData3 = pd.read_csv("pancreaticCancerSeqData/tumor.counts.sub.tsv", sep='\t')
rawPancCancData3 = rawPancCancData3.reset_index()

#print(rawPancCancData3)

  warn(msg)


In [142]:
#Only has Reg1, not Reg1B or Reg1A, so we take Reg1 and apply it to both rows
rowsToKeep = ["Gpx1", "Lyve1", "Reg1", "Tff1", "Reg1"]
rawPancCancData1.rename(columns={'Unnamed: 0': 'GeneNames'}, inplace=True)
print(rawPancCancData1.shape)

filtRawPancCancData1 = rawPancCancData1[rawPancCancData1['GeneNames'].isin(rowsToKeep)]
filtRawPancCancData1 = filtRawPancCancData1.transpose()
filtRawPancCancData1.reset_index(inplace=True, drop=True)
filtRawPancCancData1.columns = filtRawPancCancData1.iloc[0]
filtRawPancCancData1 = filtRawPancCancData1[1:]

filtRawPancCancData1.rename(columns={'Reg1': 'REG1B', "Gpx1": "Creatinine", "Tff1": "TFF1", "Lyve1": "LYVE1"}, inplace=True)
columnOrder = ["LYVE1", "TFF1", "REG1B", "Creatinine"]

# Reorder the columns in the DataFrame
filtRawPancCancData1 = filtRawPancCancData1[columnOrder]

# Impute missing values in each column if needed
filtRawPancCancData1 = filtRawPancCancData1.fillna(filtRawPancCancData1.median())
#print(filtRawPancCancData1)


filtRawPancCancData1Scaler = StandardScaler()
# transform data
scaledRawPancCancData1 = filtRawPancCancData1Scaler.fit_transform(filtRawPancCancData1)
scaledRawPancCancData1 = pd.DataFrame(scaledRawPancCancData1, columns=filtRawPancCancData1.columns)
scaledRawPancCancData1["Outcome"] = 1
print(scaledRawPancCancData1)





(17966, 23)
0      LYVE1      TFF1     REG1B  Creatinine  Outcome
0  -0.315544  2.008313 -0.543365   -0.046518        1
1  -0.581683  2.415103 -0.534482   -0.740781        1
2  -0.585285  2.299507 -0.489057    0.271683        1
3  -0.750723  0.599046 -0.452496   -0.729316        1
4   0.046597 -0.554481 -0.160474    1.918426        1
5   0.371750  0.324675 -0.486279   -0.629386        1
6   0.614754 -0.618860 -0.518384    0.311472        1
7   3.955835 -0.684014  0.051366   -1.351388        1
8   0.056674 -0.602712 -0.521555   -0.034005        1
9   1.293589 -0.056772 -0.390424   -1.443193        1
10  0.034731  0.592024 -0.530837    0.678888        1
11 -0.164512  0.366374 -0.540116   -0.737920        1
12 -1.064816 -0.754403  0.276978    0.695200        1
13 -0.262819 -0.501485  0.182592   -0.799021        1
14 -0.160552  0.059531  0.173587    0.652034        1
15 -0.479314 -0.754403 -0.502960    0.270555        1
16 -0.292546 -0.754403  4.248813   -0.811870        1
17 -0.481052 -0.

In [146]:
#print(rawPancCancData2)
rowsToKeep = ["Gpx1", "Lyve1", "Reg1", "Tff1"]

filtRawPancCancData2 = rawPancCancData2[rawPancCancData2['Gene'].isin(rowsToKeep)]
#print(filtRawPancCancData2.shape)
#print(filtRawPancCancData2)

filtRawPancCancData2 = filtRawPancCancData2.transpose()
#print(filtRawPancCancData2)
filtRawPancCancData2.reset_index(inplace=True, drop=True)
#print(filtRawPancCancData2)
#print(filtRawPancCancData2.shape)

filtRawPancCancData2.columns = filtRawPancCancData2.iloc[0]
filtRawPancCancData2 = filtRawPancCancData2[1:]
filtRawPancCancData2 = filtRawPancCancData2.drop([1, 2])  # Remove the second and third rows

filtRawPancCancData2.rename(columns={'Reg1': 'REG1B', "Gpx1": "Creatinine", "Tff1": "TFF1", "Lyve1": "LYVE1"}, inplace=True)
columnOrder = ["LYVE1", "TFF1", "REG1B", "Creatinine"]

# Reorder the columns in the DataFrame
filtRawPancCancData2 = filtRawPancCancData2[columnOrder]
#print(filtRawPancCancData2)
#print(filtRawPancCancData2.shape)

# Impute missing values in each column if needed
filtRawPancCancData2 = filtRawPancCancData2.fillna(filtRawPancCancData2.median())
#print(filtRawPancCancData2)


filtRawPancCancData1Scaler2 = StandardScaler()
# transform data
scaledRawPancCancData2 = filtRawPancCancData1Scaler2.fit_transform(filtRawPancCancData2)
scaledRawPancCancData2 = pd.DataFrame(scaledRawPancCancData2, columns=filtRawPancCancData2.columns)
scaledRawPancCancData2["Outcome"] = 1
print(scaledRawPancCancData2)



0     LYVE1      TFF1     REG1B  Creatinine  Outcome
0  0.032969  1.550013  2.945148    1.319633        1
1  2.855572 -0.558612 -0.428273    2.151116        1
2 -0.611502 -0.558612 -0.440346   -0.747143        1
3 -0.611502 -0.558612 -0.440346   -1.003820        1
4 -0.611502 -0.558612 -0.440346   -0.983581        1
5 -0.608904 -0.558612 -0.440346   -0.753642        1
6  0.149489 -0.363988 -0.017810    0.207282        1
7  0.177499 -0.389758 -0.430109    0.413563        1
8 -0.393700 -0.372597 -0.413068   -0.560482        1
9 -0.378419  2.369388  0.105496   -0.042926        1


In [150]:
rowsToKeep = ["GPX1", "LYVE1", "REG1B", "TFF1"]
#print(rawPancCancData3)


filtRawPancCancData3 = rawPancCancData3[rawPancCancData3['index'].isin(rowsToKeep)]
#print(filtRawPancCancData3.shape)
#print(filtRawPancCancData3)

filtRawPancCancData3 = filtRawPancCancData3.transpose()
#print(filtRawPancCancData3)
filtRawPancCancData3.reset_index(inplace=True, drop=True)
#print(filtRawPancCancData3)
#print(filtRawPancCancData3.shape)


filtRawPancCancData3.columns = filtRawPancCancData3.iloc[0]
filtRawPancCancData3 = filtRawPancCancData3[1:]

filtRawPancCancData3.rename(columns={'Reg1': 'REG1B', "GPX1": "Creatinine", "Tff1": "TFF1", "Lyve1": "LYVE1"}, inplace=True)
columnOrder = ["LYVE1", "TFF1", "REG1B", "Creatinine"]
# Reorder the columns in the DataFrame
filtRawPancCancData3 = filtRawPancCancData3[columnOrder]#print(filtRawPancCancData3)
#print(filtRawPancCancData3.shape)

# Impute missing values in each column if needed
filtRawPancCancData3 = filtRawPancCancData3.fillna(filtRawPancCancData3.median())
#print(filtRawPancCancData3)


filtRawPancCancData1Scaler3 = StandardScaler()
# transform data
scaledRawPancCancData3 = filtRawPancCancData1Scaler3.fit_transform(filtRawPancCancData3)
scaledRawPancCancData3 = pd.DataFrame(scaledRawPancCancData3, columns=filtRawPancCancData3.columns)
scaledRawPancCancData3["Outcome"] = 1
print(scaledRawPancCancData3)



0      LYVE1      TFF1     REG1B  Creatinine  Outcome
0   0.414311 -0.426343 -0.236257   -0.017511        1
1  -0.650777 -0.410380  2.585011   -0.327887        1
2  -0.604525 -0.274486 -0.232637    0.429868        1
3  -0.677115 -0.231455 -0.035300   -0.620078        1
4  -0.711162  1.793484 -0.236383    0.825719        1
..       ...       ...       ...         ...      ...
74 -0.671334 -0.438558 -0.175120   -1.329337        1
75  0.529300  0.740066 -0.236383    0.123129        1
76 -0.502384 -0.402745  0.258267   -1.282659        1
77 -0.117591 -0.433005 -0.236358   -0.274541        1
78  2.821360 -0.438558 -0.236383   -0.073282        1

[79 rows x 5 columns]


In [157]:

allPancCancData = pd.concat([scaledRawPancCancData1, scaledRawPancCancData2, scaledRawPancCancData3], sort=False)
print(type(allPancCancData))


<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [158]:
ncbiDataset = pd.concat([allPancNormData, allPancCancData], sort=False)
print(ncbiDataset)

       LYVE1      TFF1     REG1B  Creatinine  Outcome
0   0.603023 -1.177057 -0.312348    0.761342        0
1  -0.527645 -1.177057 -0.312348    1.162652        0
2  -0.527645  2.075338 -0.312348    0.069336        0
3  -0.527645  0.449140  2.030259    0.732340        0
4  -0.527645 -0.247802 -0.312348   -0.348836        0
..       ...       ...       ...         ...      ...
74 -0.671334 -0.438558 -0.175120   -1.329337        1
75  0.529300  0.740066 -0.236383    0.123129        1
76 -0.502384 -0.402745  0.258267   -1.282659        1
77 -0.117591 -0.433005 -0.236358   -0.274541        1
78  2.821360 -0.438558 -0.236383   -0.073282        1

[250 rows x 5 columns]
