In [None]:
import pandas as pd

# Read the GDSC dataset CSV from its Drive location into a DataFrame
file_path = '/content/drive/MyDrive/CDJ Health Dataset/GDSC-dataset.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the leading rows to get a feel for the available columns
df.head(n=5)



Unnamed: 0,COSMIC_ID,CELL_LINE_NAME,TCGA_DESC,DRUG_ID,DRUG_NAME,LN_IC50,AUC,Z_SCORE,GDSC Tissue descriptor 1,GDSC Tissue descriptor 2,Cancer Type (matching TCGA label),Microsatellite instability Status (MSI),Screen Medium,Growth Properties,CNA,Gene Expression,Methylation,TARGET,TARGET_PATHWAY
0,683667,PFSK-1,MB,1003,Camptothecin,-1.463887,0.93022,0.433123,nervous_system,medulloblastoma,MB,MSS/MSI-L,R,Adherent,Y,Y,Y,TOP1,DNA replication
1,684057,ES5,UNCLASSIFIED,1003,Camptothecin,-3.360586,0.791072,-0.599569,bone,ewings_sarcoma,,MSS/MSI-L,R,Adherent,Y,Y,Y,TOP1,DNA replication
2,684059,ES7,UNCLASSIFIED,1003,Camptothecin,-5.04494,0.59266,-1.516647,bone,ewings_sarcoma,,MSS/MSI-L,R,Adherent,Y,Y,Y,TOP1,DNA replication
3,684062,EW-11,UNCLASSIFIED,1003,Camptothecin,-3.741991,0.734047,-0.807232,bone,ewings_sarcoma,,MSS/MSI-L,R,Adherent,Y,Y,Y,TOP1,DNA replication
4,684072,SK-ES-1,UNCLASSIFIED,1003,Camptothecin,-5.142961,0.582439,-1.570016,bone,ewings_sarcoma,,MSS/MSI-L,R,Semi-Adherent,Y,Y,Y,TOP1,DNA replication


In [None]:
# Data cleaning: handle missing values and encode categorical variables

# Per-column count of missing values (stored for inspection; not displayed here)
missing_values = df.isnull().sum()

# Drop rows missing the target (LN_IC50) or either predictor, then cast both
# predictors to the categorical dtype. Chaining `astype` on the `dropna` result
# yields a brand-new frame, so no column is assigned onto a possible slice of
# `df` and pandas no longer emits SettingWithCopyWarning.
df_clean = (
    df
    .dropna(subset=['LN_IC50', 'DRUG_NAME', 'TCGA_DESC'])
    .astype({'DRUG_NAME': 'category', 'TCGA_DESC': 'category'})
)

# Predictor matrix: the integer category code of each predictor, built as a
# fresh DataFrame (same index as df_clean) rather than by overwriting columns
# of a slice. df_clean keeps its categorical columns so labels can be mapped
# back to codes later.
X = pd.DataFrame({
    'DRUG_NAME': df_clean['DRUG_NAME'].cat.codes,
    'TCGA_DESC': df_clean['TCGA_DESC'].cat.codes,
})

# Target variable: LN_IC50 column
y = df_clean['LN_IC50']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['DRUG_NAME'] = df_clean['DRUG_NAME'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['TCGA_DESC'] = df_clean['TCGA_DESC'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['DRUG_NAME'] = X['DRUG_NAME'].cat.codes
A value is trying to be set on a 

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 100-tree random forest regressor on the training split.
# `fit` returns the estimator itself, so construction and fitting are chained.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Echo the fitted estimator as the cell output
model


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the fitted model on the held-out split
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Show both metrics as the cell output
(mse, r2)

(1.7740698598519185, 0.7670300574088964)

In [None]:
import numpy as np

# To make predictions, use the model with encoded input
def predict_ic50(drug_name, cancer_type):
    """Print the model's predicted IC50 for one drug / cancer-type pair.

    The labels are translated to the integer category codes of the
    module-level ``df_clean`` frame, passed to the module-level ``model``,
    and the predicted LN_IC50 is exponentiated before printing.

    Args:
        drug_name: Label that must be one of the ``DRUG_NAME`` categories
            of ``df_clean``.
        cancer_type: Label that must be one of the ``TCGA_DESC`` categories
            of ``df_clean``.

    Raises:
        KeyError: If either label is not a known category.
    """
    drug_categories = df_clean['DRUG_NAME'].cat.categories
    cancer_categories = df_clean['TCGA_DESC'].cat.categories

    if drug_name not in drug_categories:
        raise KeyError(f"Unknown drug name: {drug_name!r}")
    if cancer_type not in cancer_categories:
        raise KeyError(f"Unknown cancer type: {cancer_type!r}")

    # A label's position in `.cat.categories` *is* its category code. The
    # previous version used that position as a row label into `.cat.codes`,
    # which returned the code of an unrelated row (or raised KeyError when
    # that row had been dropped), so the wrong drug/cancer was predicted.
    drug_code = drug_categories.get_loc(drug_name)
    cancer_code = cancer_categories.get_loc(cancer_type)

    # Single-row frame with the same column names the model was fitted on
    input_data = pd.DataFrame(
        [[drug_code, cancer_code]],
        columns=['DRUG_NAME', 'TCGA_DESC'],
    )

    # Predict the IC50 value (natural log scale)
    predicted_ln_ic50 = model.predict(input_data)

    # Undo the natural log to report IC50 on its original scale
    predicted_ic50 = np.exp(predicted_ln_ic50[0])

    # Print the input drug, cancer type, and predicted IC50 value
    print(f"Drug: {drug_name}")
    print(f"Cancer Type: {cancer_type}")
    print(f"Predicted IC50 (on original scale): {predicted_ic50:.4f}")

# Demo: report the predicted IC50 for Vinblastine on the SCLC cancer type
predict_ic50(drug_name='Vinblastine', cancer_type='SCLC')



Drug: Vinblastine
Cancer Type: SCLC
Predicted IC50 (on original scale): 0.2927


