In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing 
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler

In [6]:
clinical_data=pd.read_csv("C:/Users/manid/Downloads/full_clinical_file.csv")

In [7]:
print(clinical_data)

     Case ID Patient affiliation  Age at Histological Diagnosis  \
0    AMC-001            Stanford                             34   
1    AMC-002            Stanford                             33   
2    AMC-003            Stanford                             69   
3    AMC-004            Stanford                             80   
4    AMC-005            Stanford                             76   
..       ...                 ...                            ...   
206  R01-159            Stanford                             75   
207  R01-160                  VA                             61   
208  R01-161            Stanford                             52   
209  R01-162            Stanford                             67   
210  R01-163                  VA                             68   

      Weight (lbs)  Gender                 Ethnicity Smoking status  \
0    Not Collected    Male  Not Recorded In Database      Nonsmoker   
1    Not Collected  Female  Not Recorded In Database 

In [8]:
#features in the dataset
clinical_data.columns

Index(['Case ID', 'Patient affiliation', 'Age at Histological Diagnosis',
       'Weight (lbs)', 'Gender', 'Ethnicity', 'Smoking status', 'Pack Years',
       'Quit Smoking Year', '%GG', 'Tumor Location (choice=RUL)',
       'Tumor Location (choice=RML)', 'Tumor Location (choice=RLL)',
       'Tumor Location (choice=LUL)', 'Tumor Location (choice=LLL)',
       'Tumor Location (choice=L Lingula)', 'Tumor Location (choice=Unknown)',
       'Histology ', 'Pathological T stage', 'Pathological N stage',
       'Pathological M stage', 'Histopathological Grade',
       'Lymphovascular invasion',
       'Pleural invasion (elastic, visceral, or parietal)',
       'EGFR mutation status', 'KRAS mutation status',
       'ALK translocation status', 'Adjuvant Treatment', 'Chemotherapy',
       'Radiation', 'Recurrence', 'Recurrence Location', 'Date of Recurrence',
       'Date of Last Known Alive', 'Survival Status', 'Date of Death',
       'Time to Death (days)', 'CT Date', 'Days between CT and sur

In [9]:
#no of rows in the dataset
clinical_data.index

RangeIndex(start=0, stop=211, step=1)

In [10]:
#information about each feature
clinical_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211 entries, 0 to 210
Data columns (total 40 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   Case ID                                            211 non-null    object 
 1   Patient affiliation                                211 non-null    object 
 2   Age at Histological Diagnosis                      211 non-null    int64  
 3   Weight (lbs)                                       211 non-null    object 
 4   Gender                                             211 non-null    object 
 5   Ethnicity                                          211 non-null    object 
 6   Smoking status                                     211 non-null    object 
 7   Pack Years                                         163 non-null    object 
 8   Quit Smoking Year                                  108 non-null    float64
 9   %GG       

In [11]:
clinical_data["Smoking status"].value_counts()

Former       130
Nonsmoker     48
Current       33
Name: Smoking status, dtype: int64

In [12]:
clinical_data["Gender"].value_counts()

Male      135
Female     76
Name: Gender, dtype: int64

In [13]:
clinical_data["Recurrence"].value_counts()
#replacing Not checked with No
clinical_data["Recurrence"]=clinical_data["Recurrence"].replace("Not collected","no")
clinical_data["Recurrence"].value_counts()

no     157
yes     54
Name: Recurrence, dtype: int64

In [14]:
clinical_data["Survival Status"].value_counts()

Alive    148
Dead      63
Name: Survival Status, dtype: int64

In [15]:
clinical_data["Radiation"].value_counts()
#replacing Not checked with No
clinical_data["Radiation"]=clinical_data["Radiation"].replace("Not Collected","No")
clinical_data["Radiation"].value_counts()

No     195
Yes     16
Name: Radiation, dtype: int64

In [16]:
clinical_data["Chemotherapy"].value_counts()
#replacing Not checked with No
clinical_data["Chemotherapy"]=clinical_data["Chemotherapy"].replace("Not Collected","No")
clinical_data["Chemotherapy"].value_counts()

No     162
Yes     49
Name: Chemotherapy, dtype: int64

In [17]:
#EGFR mutation status
clinical_data["EGFR mutation status"].value_counts()
#replacing Not checked with No
clinical_data["EGFR mutation status"]=clinical_data["EGFR mutation status"].replace("Not collected","Wildtype")
clinical_data["EGFR mutation status"]=clinical_data["EGFR mutation status"].replace("Unknown","Wildtype")
clinical_data["EGFR mutation status"].value_counts()

Wildtype    168
Mutant       43
Name: EGFR mutation status, dtype: int64

In [18]:
#KRAS mutation status
clinical_data["KRAS mutation status"].value_counts()
#replacing Not checked with No
clinical_data["KRAS mutation status"]=clinical_data["KRAS mutation status"].replace("Not collected","Wildtype")
clinical_data["KRAS mutation status"]=clinical_data["KRAS mutation status"].replace("Unknown","Wildtype")
clinical_data["KRAS mutation status"].value_counts()

Wildtype    173
Mutant       38
Name: KRAS mutation status, dtype: int64

In [19]:
#ALK translocation status
clinical_data["ALK translocation status"].value_counts()
#replacing Not checked with No
clinical_data["ALK translocation status"]=clinical_data["ALK translocation status"].replace("Not collected","Wildtype")
clinical_data["ALK translocation status"]=clinical_data["ALK translocation status"].replace("Unknown","Wildtype")
clinical_data["ALK translocation status"].value_counts()

Wildtype        209
Translocated      2
Name: ALK translocation status, dtype: int64

In [20]:
#Histology
clinical_data["Histology "].value_counts()
clinical_data["Histology "]=clinical_data["Histology "].replace("NSCLC NOS (not otherwise specified)","Adenocarcinoma")
clinical_data["Histology "].value_counts()

Adenocarcinoma             176
Squamous cell carcinoma     35
Name: Histology , dtype: int64

# Data Preprocessing

In [21]:
#Smoking Status
# Define the mapping
smoking_status_mapping = {'Nonsmoker': 0, 'Former': 1, 'Current': 2}

# Apply the mapping to the 'Smoking status' column
clinical_data['Smoking status'] = clinical_data['Smoking status'].map(smoking_status_mapping)

In [22]:
#Recurrence
# label_encoder object knows  

label_encoder = preprocessing.LabelEncoder()   
# Encode labels in column 'species'. 
clinical_data['Recurrence']= label_encoder.fit_transform(clinical_data['Recurrence']) 

In [23]:
#Raditaion
# label_encoder object knows  
label_encoder = preprocessing.LabelEncoder()   
# Encode labels in column 'species'. 
clinical_data['Radiation']= label_encoder.fit_transform(clinical_data['Radiation']) 

In [24]:
#Gender
# label_encoder object knows  
label_encoder = preprocessing.LabelEncoder()   
# Encode labels in column 'species'. 
clinical_data['Gender']= label_encoder.fit_transform(clinical_data['Gender']) 

In [25]:
#survival status
# label_encoder object knows  
label_encoder = preprocessing.LabelEncoder()   
# Encode labels in column 'species'. 
clinical_data['Survival Status']= label_encoder.fit_transform(clinical_data['Survival Status']) 

In [26]:
#chemotherapy
# label_encoder object knows  
label_encoder = preprocessing.LabelEncoder()   
# Encode labels in column 'species'. 
clinical_data['Chemotherapy']= label_encoder.fit_transform(clinical_data['Chemotherapy']) 

In [27]:
#Histology
# label_encoder object knows  
label_encoder = preprocessing.LabelEncoder()   
# Encode labels in column 'species'. 
clinical_data['Histology ']= label_encoder.fit_transform(clinical_data['Histology ']) 

In [28]:
clinical_data

Unnamed: 0,Case ID,Patient affiliation,Age at Histological Diagnosis,Weight (lbs),Gender,Ethnicity,Smoking status,Pack Years,Quit Smoking Year,%GG,...,Recurrence,Recurrence Location,Date of Recurrence,Date of Last Known Alive,Survival Status,Date of Death,Time to Death (days),CT Date,Days between CT and surgery,PET Date
0,AMC-001,Stanford,34,Not Collected,1,Not Recorded In Database,0,,,Not Assessed,...,1,distant,10/7/1994,1/7/1997,1,1/7/1997,872.0,8/10/1994,9,Not Collected
1,AMC-002,Stanford,33,Not Collected,0,Not Recorded In Database,0,,,Not Assessed,...,0,,,3/20/1992,0,,,2/19/1992,3,Not Collected
2,AMC-003,Stanford,69,Not Collected,0,Not Recorded In Database,0,,,Not Assessed,...,0,,,6/19/1996,0,,,2/23/1995,28,Not Collected
3,AMC-004,Stanford,80,Not Collected,0,Not Recorded In Database,0,,,Not Assessed,...,0,,,12/13/1996,0,,,12/26/1992,47,Not Collected
4,AMC-005,Stanford,76,Not Collected,1,Not Recorded In Database,1,30,1962.0,Not Assessed,...,1,distant,1/4/1996,1/7/1997,0,,,7/21/1994,2,Not Collected
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206,R01-159,Stanford,75,184,1,Caucasian,1,55,1994.0,Not Assessed,...,0,,,7/13/1995,0,,,11/24/1994,14,11/16/1994
207,R01-160,VA,61,231.5,1,Caucasian,1,12,1993.0,Not Assessed,...,0,,,7/3/1999,0,,,8/12/1993,72,9/22/1993
208,R01-161,Stanford,52,Not Collected,0,Caucasian,1,7,,Not Assessed,...,0,,,4/2/1999,0,,,12/13/1995,8,9/26/1995
209,R01-162,Stanford,67,158,1,Asian,1,15,1966.0,Not Assessed,...,0,,,10/8/1997,1,10/8/1997,671.0,10/3/1995,65,11/14/1995


In [29]:
one_hot_encoded_data = pd.get_dummies(clinical_data, columns = ['EGFR mutation status','KRAS mutation status','ALK translocation status']) 
print(one_hot_encoded_data.head(2))

   Case ID Patient affiliation  Age at Histological Diagnosis   Weight (lbs)  \
0  AMC-001            Stanford                             34  Not Collected   
1  AMC-002            Stanford                             33  Not Collected   

   Gender                 Ethnicity  Smoking status Pack Years  \
0       1  Not Recorded In Database               0        NaN   
1       0  Not Recorded In Database               0        NaN   

   Quit Smoking Year           %GG  ... Time to Death (days)    CT Date  \
0                NaN  Not Assessed  ...                872.0  8/10/1994   
1                NaN  Not Assessed  ...                  NaN  2/19/1992   

  Days between CT and surgery       PET Date EGFR mutation status_Mutant  \
0                           9  Not Collected                           0   
1                           3  Not Collected                           0   

  EGFR mutation status_Wildtype KRAS mutation status_Mutant  \
0                             1           

In [30]:
one_hot_encoded_data.columns

Index(['Case ID', 'Patient affiliation', 'Age at Histological Diagnosis',
       'Weight (lbs)', 'Gender', 'Ethnicity', 'Smoking status', 'Pack Years',
       'Quit Smoking Year', '%GG', 'Tumor Location (choice=RUL)',
       'Tumor Location (choice=RML)', 'Tumor Location (choice=RLL)',
       'Tumor Location (choice=LUL)', 'Tumor Location (choice=LLL)',
       'Tumor Location (choice=L Lingula)', 'Tumor Location (choice=Unknown)',
       'Histology ', 'Pathological T stage', 'Pathological N stage',
       'Pathological M stage', 'Histopathological Grade',
       'Lymphovascular invasion',
       'Pleural invasion (elastic, visceral, or parietal)',
       'Adjuvant Treatment', 'Chemotherapy', 'Radiation', 'Recurrence',
       'Recurrence Location', 'Date of Recurrence', 'Date of Last Known Alive',
       'Survival Status', 'Date of Death', 'Time to Death (days)', 'CT Date',
       'Days between CT and surgery', 'PET Date',
       'EGFR mutation status_Mutant', 'EGFR mutation status_Wild

In [31]:
X_data=one_hot_encoded_data.loc[:,['Age at Histological Diagnosis','Gender','Smoking status','Histology ','EGFR mutation status_Mutant', 'EGFR mutation status_Wildtype','KRAS mutation status_Mutant', 'KRAS mutation status_Wildtype',
       'ALK translocation status_Translocated',
       'ALK translocation status_Wildtype','Chemotherapy', 'Radiation', 'Survival Status']]
Y_label=one_hot_encoded_data.loc[:,["Recurrence"]]

In [32]:
#splitting dataset into train and test
xtrain,xtest,ytrain,ytest=train_test_split(X_data,Y_label,test_size=0.2)

In [33]:
xtrain.shape


(168, 13)

In [34]:
xtest.shape

(43, 13)

In [35]:
ytrain.shape

(168, 1)

In [36]:
ytest.shape

(43, 1)

In [37]:
#building model for Naive bayes--
model=MultinomialNB(alpha=0.5)
model.fit(xtrain,ytrain)

  return f(*args, **kwargs)


MultinomialNB(alpha=0.5)

In [38]:
#predicting the testdata using the above model
pred=model.predict(xtest)

In [39]:
#building confusion matrix 
cf_matrix=confusion_matrix(ytest,pred)

In [40]:
#accuracy of the model
model.score(xtest,ytest)

0.6976744186046512

In [41]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(xtrain, ytrain)

# Make predictions on the test set
ypred = rf_model.predict(xtest)

# Evaluate the model
rf_model.score(xtest,ytest)

  rf_model.fit(xtrain, ytrain)


0.6511627906976745

In [45]:
#DL
scaler = StandardScaler()
X_train = scaler.fit_transform(xtrain)
X_test = scaler.transform(xtest)

# Build the neural network model
model = keras.Sequential([
    keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(1, activation='sigmoid')  # Use 'sigmoid' for binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, ytrain, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, ytest)
print(f"Test Accuracy: {test_accuracy}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.6744186282157898
