In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder

Read the data

In [2]:
# Load the tab-separated dataset; the first line of the file supplies the column names.
data = pd.read_csv(
    "hw.0402.tsv",
    sep="\t",
    header=0,
)

In [3]:
# Remove the `pert_id` and `sig_id` columns so they are not fed to the model,
# then preview the first rows of what is left.
data = data.drop(columns=["pert_id", "sig_id"])
data.head(n=5)

Unnamed: 0,bead_batch,pert_dose,pert_dose_unit,pert_time,pert_time_unit,nsample,tas,pert_type,cell_iname,det_wells,cmap_name
0,b25,0.213856,uM,24.0,h,2.0,0.112658,trt_cp,YAPC,C04,MD-920
1,b23,1.50585,uM,24.0,h,3.0,0.127106,trt_cp,YAPC,O08,NSC-4644
2,b23,1.0,uM,24.0,h,3.0,0.356014,trt_cp,YAPC,D01,REV-5901
3,b25,1.11914,uM,24.0,h,3.0,0.087267,trt_cp,A549,E15,picolinic-acid
4,b24,3.8489,uM,24.0,h,3.0,0.053473,trt_cp,MCF7,M20,swainsonine


1) Identify categorical and numerical features

In [4]:
# Partition the columns by dtype: object (string) columns are treated as
# categorical, every numeric column as numerical.
numerical_features = data.select_dtypes(include="number").columns
categorical_features = data.select_dtypes(include="object").columns

# Regression target. It is numeric, so it is also part of `numerical_features`
# and has to be split off from the inputs before fitting.
target = "tas"

Encode categories

In [5]:
# One-hot encode every categorical column into a dense 0/1 matrix.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old spelling, so try the current name first and fall back to the
# legacy one; an unknown keyword makes the constructor raise TypeError.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    # scikit-learn < 1.2 only understands the old keyword.
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# NOTE(review): the encoder is fitted on the full dataset, i.e. before the
# train/test split, so `handle_unknown='ignore'` never triggers here.
# Missing values are encoded as their own indicator column (e.g. `cmap_name_nan`).
encoded_categorical_data = encoder.fit_transform(data[categorical_features])

# Reuse the index of `data` so that a later column-wise concat lines up row by
# row even when `data` does not carry a default 0..n-1 index.
encoded_categorical_df = pd.DataFrame(
    encoded_categorical_data,
    columns=encoder.get_feature_names_out(categorical_features),
    index=data.index,
)



In [6]:
# Preview the first five rows of the one-hot encoded categorical columns.
encoded_categorical_df.head(n=5)


Unnamed: 0,bead_batch_b22,bead_batch_b23,bead_batch_b24,bead_batch_b25,bead_batch_b27,bead_batch_b29,bead_batch_b32,pert_dose_unit_uM,pert_dose_unit_nan,pert_time_unit_h,...,cmap_name_zaprinast,cmap_name_zardaverine,cmap_name_zidovudine,cmap_name_zileuton,cmap_name_ziprasidone,cmap_name_zofenopril-calcium,cmap_name_zolpidem,cmap_name_zonisamide,cmap_name_zosuquidar,cmap_name_nan
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Place the numeric columns and the one-hot indicator columns side by side;
# rows are matched on the index.
data_encoded = pd.concat(
    [data[numerical_features], encoded_categorical_df],
    axis="columns",
)

In [9]:
# Preview the first five rows of the combined numeric + one-hot feature table.
data_encoded.head(n=5)

Unnamed: 0,pert_dose,pert_time,nsample,tas,bead_batch_b22,bead_batch_b23,bead_batch_b24,bead_batch_b25,bead_batch_b27,bead_batch_b29,...,cmap_name_zaprinast,cmap_name_zardaverine,cmap_name_zidovudine,cmap_name_zileuton,cmap_name_ziprasidone,cmap_name_zofenopril-calcium,cmap_name_zolpidem,cmap_name_zonisamide,cmap_name_zosuquidar,cmap_name_nan
0,0.213856,24.0,2.0,0.112658,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.50585,24.0,3.0,0.127106,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,24.0,3.0,0.356014,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.11914,24.0,3.0,0.087267,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.8489,24.0,3.0,0.053473,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


2) Split the data

In [10]:
# Separate the regression target from the model inputs.
y = data_encoded[target]
X = data_encoded.drop(columns=[target])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,pert_dose,pert_time,nsample,bead_batch_b22,bead_batch_b23,bead_batch_b24,bead_batch_b25,bead_batch_b27,bead_batch_b29,bead_batch_b32,...,cmap_name_zaprinast,cmap_name_zardaverine,cmap_name_zidovudine,cmap_name_zileuton,cmap_name_ziprasidone,cmap_name_zofenopril-calcium,cmap_name_zolpidem,cmap_name_zonisamide,cmap_name_zosuquidar,cmap_name_nan
64078,0.123457,24.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72721,0.123457,24.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43564,0.37037,24.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32684,1.11111,24.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
76032,3.33333,24.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


3) Fit a linear regression model

In [12]:
# Ordinary least-squares regression, fitted on the training split only.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

MSE on the train and test data

In [13]:
# Mean squared error of the fitted model on the rows it was trained on.
train_predictions = model.predict(X_train)
mean_squared_error(y_true=y_train, y_pred=train_predictions)

0.008216643460455485

In [27]:
# Keep only the test rows that are complete in BOTH the features and the target.
# Filtering X_test and y_test with two independent dropna() calls can remove
# different rows from each, leaving features and targets mismatched in length
# or paired with the wrong rows; a single shared mask keeps them aligned.
test_complete = X_test.notna().all(axis=1) & y_test.notna()
X_test = X_test.loc[test_complete]
y_test = y_test.loc[test_complete]

# Mean squared error of the fitted model on the held-out test rows.
mean_squared_error(y_test, model.predict(X_test))

0.008484129986118274

K-fold cross-validation

In [21]:
# 5-fold cross-validation on the training split (contiguous folds, no shuffling).
kf = KFold(n_splits=5)
cv_errors = []

for fold_train_idx, fold_val_idx in kf.split(X_train):
    # Fit a fresh model on four of the folds ...
    fold_model = LinearRegression()
    fold_model.fit(X_train.iloc[fold_train_idx], y_train.iloc[fold_train_idx])

    # ... and score it on the fold that was left out.
    fold_predictions = fold_model.predict(X_train.iloc[fold_val_idx])
    fold_mse = mean_squared_error(y_train.iloc[fold_val_idx], fold_predictions)
    cv_errors.append(fold_mse)

# Average the per-fold MSEs into a single cross-validation estimate.
cv_error = np.mean(cv_errors)

In [22]:
# Display `cv_error`, the mean of the per-fold validation MSEs computed on the training split.
cv_error

0.008713153726032813

In [25]:
# 5-fold cross-validation run on the *test* split: every model is fitted on
# four folds of the test data and scored on the remaining fold.
# NOTE(review): this trains on held-out data, so `test_cv_error` is not an
# out-of-sample score for `model` -- confirm this is what the exercise asks for.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_errors = []

# Drop incomplete rows with ONE shared mask. Two independent dropna() calls on
# X_test and y_test can remove different rows from each and silently misalign
# features and targets.
complete_rows = X_test.notna().all(axis=1) & y_test.notna()
X_test = X_test.loc[complete_rows]
y_test = y_test.loc[complete_rows]

for train_index, val_index in kf.split(X_test):
    X_train_cv, X_val_cv = X_test.iloc[train_index], X_test.iloc[val_index]
    y_train_cv, y_val_cv = y_test.iloc[train_index], y_test.iloc[val_index]

    model_cv = LinearRegression()
    model_cv.fit(X_train_cv, y_train_cv)

    y_val_pred = model_cv.predict(X_val_cv)
    # Use a fold-local name so the mean train-CV error stored in `cv_error`
    # by the previous cross-validation cell is not overwritten.
    fold_mse = mean_squared_error(y_val_cv, y_val_pred)
    cv_errors.append(fold_mse)

# Average of the per-fold MSEs.
test_cv_error = np.mean(cv_errors)

In [26]:
# Display `test_cv_error`, the mean of the per-fold MSEs from the cross-validation on the test split.
test_cv_error

0.009472923947804536