In [3]:
import pandas as pd

# Load the thyroid dataset from the local Excel workbook.
# NOTE(review): this is an absolute, machine-specific path; consider moving it
# to a configurable data directory so the notebook runs on other machines.
file_path = r"B:\sap\Capstone project\thyroid\thyroid_data.xlsx"
data = pd.read_excel(file_path)

# DataFrame.info() prints its summary and returns None, so call it as a
# statement instead of packing it into the output tuple (which rendered a
# stray `None` and forced the preview into a plain-text repr).
data.info()

# Keep the preview as the cell's last expression so it is displayed as a table.
data.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384 entries, 0 to 383
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Sno                   384 non-null    int64  
 1   Filename              384 non-null    object 
 2   number                384 non-null    int64  
 3   age                   300 non-null    float64
 4   sex                   301 non-null    object 
 5   composition           197 non-null    object 
 6   echogenicity          197 non-null    object 
 7   margins               294 non-null    object 
 8   calcifications        296 non-null    object 
 9   tirads                295 non-null    object 
 10  Malignant_percentage  384 non-null    float64
 11  Cancer_Risk           384 non-null    float64
dtypes: float64(3), int64(2), object(7)
memory usage: 36.1+ KB


(   Sno   Filename  number   age  sex          composition       echogenicity  \
 0    1  100_1.jpg       1   NaN  NaN                  NaN                NaN   
 1    2  101_1.jpg      10  74.0    F                solid  hyperechogenicity   
 2    3  102_1.jpg     100  39.0    F  predominantly solid    isoechogenicity   
 3    4  103_1.jpg     101  40.0    M                solid   hypoechogenicity   
 4    5  104_1.jpg     102  28.0    F                solid    isoechogenicity   
 
         margins       calcifications tirads  Malignant_percentage  Cancer_Risk  
 0           NaN                  NaN    NaN                  0.72         0.72  
 1    spiculated  microcalcifications     4b                  0.35         1.00  
 2  well defined  macrocalcifications     4a                  0.86         1.00  
 3  well defined  microcalcifications      5                  0.62         1.00  
 4  well defined  microcalcifications     4b                  0.55         0.83  ,
 None)

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pickle

# Drop the row-number and image-filename columns before modelling.
# errors="ignore" keeps this cell idempotent: `data` is overwritten here, so a
# second run of the cell would otherwise raise KeyError because the columns
# are already gone.
data = data.drop(columns=["Sno", "Filename"], errors="ignore")

# Define target and features
# NOTE(review): `number` looks like a case identifier and `Malignant_percentage`
# may be closely tied to the target; both are kept as features here. Confirm
# neither leaks information before trusting the very high test R^2.
target = "Cancer_Risk"
X = data.drop(columns=[target])
y = data[target]

# Split feature columns by dtype so each group gets its own preprocessing.
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns

# Numeric columns: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" lets prediction proceed when a
# category was not seen during fitting.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Combine preprocessors in a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing lives inside the pipeline, so imputers/scaler/encoder are
# fitted on the training rows only and reapplied unchanged at predict time.
model_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", DecisionTreeRegressor(random_state=42))
])

# Fit the model
model_pipeline.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = model_pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Persist the whole fitted pipeline (preprocessing + regressor) as a .pkl file.
# NOTE(review): absolute, machine-specific output path; also, pickle files
# should only ever be loaded from trusted sources.
model_path = r"B:\sap\Capstone project\thyroid\cancer_risk_model.pkl"
with open(model_path, "wb") as file:
    pickle.dump(model_pipeline, file)

# Display the evaluation metrics and where the model was written.
mse, mae, r2, model_path


(0.001068831168831169,
 0.015974025974025977,
 0.983302346856717,
 'B:\\sap\\Capstone project\\thyroid\\cancer_risk_model.pkl')