In [219]:
import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder


# Alzheimer's data and model training

In [222]:
# Train and persist the Alzheimer's classification pipeline.
#
# The Google Drive share link is turned into a direct-download URL by taking
# the file id, i.e. the second-to-last "/"-separated segment of the link.
url_alzheimer = "https://drive.google.com/file/d/1JSchS-yltPrM6Jp_1U76-uv-QS8kc-hA/view?usp=share_link"
path_alzheimer = 'https://drive.google.com/uc?export=download&id=' + url_alzheimer.split('/')[-2]

# Load the data, switch to the snake_case column names, and binarise the
# diagnosis. Subjects labelled "Converted" are counted as demented, so they
# receive the same code (1) as "Demented".
alzheimer = (
    pd.read_csv(path_alzheimer)
    .rename(columns={
        "Group": "alzheimer",
        "M/F": "sex",
        "Age": "age",
        "EDUC": "years_of_education",
        "SES": "socioeconomic_status",
        "MMSE": "mental_state_examination",
        "CDR": "clinical_dementia_rating",
        "eTIV": "intracranial_volume",
        "nWBV": "norm_brain_volume",
        "ASF": "atlas_scaling",
    })
    .assign(alzheimer=lambda frame: frame["alzheimer"].map(
        {"Demented": 1, "Converted": 1, "Nondemented": 0}))
)

# Snapshot of the cleaned table (target column still included).
alzheimer.to_csv("final_alzheimer.csv", index=False)

# Separate the target. `pop` removes the column in place, so X is the same
# frame object, now holding only the features.
y = alzheimer.pop("alzheimer")
X = alzheimer

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Ordinal features and the ordered levels declared for each of them.
ord_cols = ["socioeconomic_status", "mental_state_examination", "clinical_dementia_rating"]

qualities1 = [1.0, 2.0, 3.0, 4.0, 5.0]
qualities2 = [float(level) for level in range(31)]  # 0.0, 1.0, ..., 30.0
qualities3 = [0.0, 0.5, 1.0, 2.0, 3.0]

# One category list per entry of ord_cols, in the same order.
ord_cols_categories = [qualities1, qualities2, qualities3]

cat_cols = ["sex"]

# Every remaining feature column goes through the numeric branch.
num_cols = X_train.columns.drop(ord_cols + cat_cols)

# Ordinal branch: fill gaps with the most frequent level, then encode by
# position in the declared category lists (levels outside the lists -> -1).
ord_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('encoder', OrdinalEncoder(
        categories=ord_cols_categories,
        handle_unknown="use_encoded_value",
        unknown_value=-1,
    )),
])

# Nominal branch: one-hot encode, dropping the first level of each feature.
cat_pipe = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown="ignore", drop="first")),
])

# Numeric branch: KNN imputation followed by min-max scaling.
num_pipe = Pipeline(steps=[
    ('imputer', KNNImputer()),
    ('scaler', MinMaxScaler()),
])

# Route each column group through its branch.
preprocessor = ColumnTransformer(transformers=[
    ('ord', ord_pipe, ord_cols),
    ('nom', cat_pipe, cat_cols),
    ('num', num_pipe, num_cols),
])

reg = LogisticRegression(solver="liblinear", C=10.0, max_iter=1000, random_state=0)

# The final step keeps its historical name 'regressor' although the estimator
# is a classifier; the name is part of the pickled pipeline.
model_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', reg),
])

# Fit on the training split only and persist the fitted pipeline.
model_pipe.fit(X_train, y_train)
joblib.dump(model_pipe, 'model_pipeline_alzheimer.pkl')

['model_pipeline_alzheimer.pkl']

# Lung cancer data and model training

In [243]:
# Train and persist the lung-cancer classification pipeline.
#
# The Google Drive share link is turned into a direct-download URL by taking
# the file id, i.e. the second-to-last "/"-separated segment of the link.
url_lungcancer = "https://drive.google.com/file/d/1_MTIc5RM4zA3b9G1KIx3Qhbg2dy9wk8u/view?usp=share_link"
path_lungcancer = 'https://drive.google.com/uc?export=download&id=' + url_lungcancer.split('/')[-2]

# Load, drop incomplete rows, turn the YES/NO label into booleans, and
# normalise the column names (lower case, spaces -> underscores,
# "gender" -> "sex").
lung_cancer = (
    pd.read_csv(path_lungcancer)
    .dropna()
    .assign(LUNG_CANCER=lambda frame: frame["LUNG_CANCER"].map({"YES": True, "NO": False}))
    .rename(columns=lambda name: name.lower().replace(' ', '_'))
    .rename(columns={"gender": "sex"})
)

# Persist the cleaned table, then continue from the file that was just
# written so training uses exactly what is stored on disk.
lung_cancer.to_csv("final_lung_cancer.csv", index=False)
lung_cancer = pd.read_csv("final_lung_cancer.csv")

# Separate the target. `pop` removes the column in place, so X is the same
# frame object, now holding only the features.
y = lung_cancer.pop("lung_cancer")
X = lung_cancer

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Age is the only numeric feature; every other column is treated as nominal.
num_cols = ["age"]
cat_cols = X_train.columns.drop(num_cols)

# Nominal branch: one-hot encode, dropping the first level of each feature.
cat_pipe = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown="ignore", drop="first")),
])

# Numeric branch: KNN imputation followed by min-max scaling.
num_pipe = Pipeline(steps=[
    ('imputer', KNNImputer()),
    ('scaler', MinMaxScaler()),
])

# Route each column group through its branch.
preprocessor = ColumnTransformer(transformers=[
    ('nom', cat_pipe, cat_cols),
    ('num', num_pipe, num_cols),
])

reg = LogisticRegression(solver="liblinear", C=10.0, max_iter=1000, random_state=0)

# The final step keeps its historical name 'regressor' although the estimator
# is a classifier; the name is part of the pickled pipeline.
model_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', reg),
])

# Fit on the training split only and persist the fitted pipeline.
model_pipe.fit(X_train, y_train)
joblib.dump(model_pipe, 'model_pipeline_lung_cancer.pkl')

['model_pipeline_lung_cancer.pkl']

# Heart disease data and model training

In [224]:
# Train and persist the heart-disease classification pipeline.
#
# The Google Drive share link is turned into a direct-download URL by taking
# the file id, i.e. the second-to-last "/"-separated segment of the link.
url_heartdisease = "https://drive.google.com/file/d/1ZKgoOAmtinjh6MCVjBvf3Q67ISrF5GBe/view?usp=share_link"
path_heartdisease = 'https://drive.google.com/uc?export=download&id=' + url_heartdisease.split('/')[-2]

# Load the data and align the shared column names with the other datasets.
# String-valued columns are left untouched here: they are encoded inside the
# pipeline below (one-hot / ordinal), so no manual dummy-variable step is
# required.
final_HeartDisease = (
    pd.read_csv(path_heartdisease)
    .rename(columns={"Age": "age", "Sex": "sex"})
)

# Snapshot of the renamed table (target column still included).
final_HeartDisease.to_csv("final_HeartDisease.csv", index=False)

# Separate the target. `pop` removes the column in place, so X is the same
# frame object, now holding only the features.
y = final_HeartDisease.pop("HeartDisease")
X = final_HeartDisease

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Column groups: one ordinal feature with its declared level order, the
# nominal features, and everything else as numeric.
ord_cols = ["ST_Slope"]
qualities = ["Down", "Flat", "Up"]
ord_cols_categories = [qualities]
cat_cols = ["sex", "ChestPainType", "FastingBS", "RestingECG", "ExerciseAngina"]
num_cols = X_train.columns.drop(ord_cols + cat_cols)

# Ordinal branch: fill gaps with the most frequent level, then encode by
# position in the declared category list (levels outside the list -> -1).
ord_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('encoder', OrdinalEncoder(
        categories=ord_cols_categories,
        handle_unknown="use_encoded_value",
        unknown_value=-1,
    )),
])

# Nominal branch: one-hot encode, dropping the first level of each feature.
cat_pipe = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown="ignore", drop="first")),
])

# Numeric branch: KNN imputation followed by min-max scaling.
num_pipe = Pipeline(steps=[
    ('imputer', KNNImputer()),
    ('scaler', MinMaxScaler()),
])

# Route each column group through its branch.
preprocessor = ColumnTransformer(transformers=[
    ('ord', ord_pipe, ord_cols),
    ('nom', cat_pipe, cat_cols),
    ('num', num_pipe, num_cols),
])

reg = LogisticRegression(solver="liblinear", C=10.0, max_iter=1000, random_state=0)

# The final step keeps its historical name 'regressor' although the estimator
# is a classifier; the name is part of the pickled pipeline.
model_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', reg),
])

# Fit on the training split only and persist the fitted pipeline.
model_pipe.fit(X_train, y_train)
joblib.dump(model_pipe, 'model_pipeline_HeartDisease.pkl')

['model_pipeline_HeartDisease.pkl']

# Stroke data and model training

In [230]:
# Train and persist the stroke classification pipeline.
#
# Data source: https://www.kaggle.com/datasets/jillanisofttech/brain-stroke-dataset
# The Google Drive share link is turned into a direct-download URL by taking
# the file id, i.e. the second-to-last "/"-separated segment of the link.
url_stroke = "https://drive.google.com/file/d/1Kl6cTtJpMSJHwthPc1wR-06D3paapavb/view?usp=share_link"
path_stroke = 'https://drive.google.com/uc?export=download&id=' + url_stroke.split('/')[-2]

# Load and normalise the table in one chain instead of mutating it in place:
#   * age is cast to int (NOTE(review): if age is stored as a float this
#     truncates the fractional part -- confirm that is intended);
#   * every column name is capitalised, then Gender/Age are renamed to the
#     lower-case "sex"/"age" used by the other datasets;
#   * sex is shortened to "M"/"F" (any other label becomes NaN).
# The remaining string columns stay as-is; they are encoded by the pipeline.
stroke = (
    pd.read_csv(path_stroke)
    .assign(age=lambda frame: frame['age'].astype(int))
    .rename(columns=lambda name: name.capitalize())
    .rename(columns={'Gender': 'sex', "Age": "age"})
    .assign(sex=lambda frame: frame["sex"].map({"Male": "M", "Female": "F"}))
)

# Snapshot of the cleaned table (target column still included).
stroke.to_csv("final_Stroke.csv", index=False)

# Separate the target. `pop` removes the column in place, so X is the same
# frame object, now holding only the features.
y = stroke.pop("Stroke")
X = stroke

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

num_cols = ["age", "Avg_glucose_level", "Bmi"]

ord_cols = ["Smoking_status"]

# NOTE(review): "Unknown" sits between "formerly smoked" and "smokes" in this
# ordering -- confirm that position is intended before relying on the
# ordinal codes.
qualities = ["never smoked", "formerly smoked", "Unknown", "smokes"]
ord_cols_categories = [qualities]

# Every column that is neither ordinal nor numeric is treated as nominal.
# (A previous hard-coded `cat_cols = ["sex"]` was immediately overwritten by
# this derivation and has been removed.)
cat_cols = X_train.columns.drop(ord_cols + num_cols)

# Ordinal branch: fill gaps with the most frequent level, then encode by
# position in the declared category list (levels outside the list -> -1).
ord_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('encoder', OrdinalEncoder(
        categories=ord_cols_categories,
        handle_unknown="use_encoded_value",
        unknown_value=-1,
    )),
])

# Nominal branch: one-hot encode, dropping the first level of each feature.
cat_pipe = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown="ignore", drop="first")),
])

# Numeric branch: KNN imputation followed by min-max scaling.
num_pipe = Pipeline(steps=[
    ('imputer', KNNImputer()),
    ('scaler', MinMaxScaler()),
])

# Route each column group through its branch.
preprocessor = ColumnTransformer(transformers=[
    ('ord', ord_pipe, ord_cols),
    ('nom', cat_pipe, cat_cols),
    ('num', num_pipe, num_cols),
])

reg = LogisticRegression(solver="liblinear", C=10.0, max_iter=1000, random_state=0)

# The final step keeps its historical name 'regressor' although the estimator
# is a classifier; the name is part of the pickled pipeline.
model_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', reg),
])

# Fit on the training split only and persist the fitted pipeline.
model_pipe.fit(X_train, y_train)
joblib.dump(model_pipe, 'model_pipeline_Stroke.pkl')

['model_pipeline_Stroke.pkl']