## Data Modeling
***

In [2]:
import matplotlib.pyplot as plt
import pandas as pd

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score, recall_score, log_loss, roc_auc_score, make_scorer, fbeta_score
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV, train_test_split, RepeatedStratifiedKFold, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import pandas as pd

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score, recall_score, log_loss, roc_auc_score
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV, train_test_split, RepeatedStratifiedKFold, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Load the raw dataset and group its columns by how they will be encoded.
df = pd.read_csv('data/Cardiovascular_Diseases_Risk_Prediction_Dataset.csv')

# Object columns whose categories have a natural order.
ordinal = ['General_Health', 'Checkup', 'Age_Category']

# Every non-object column.
numeric = df.select_dtypes(exclude=object).columns.tolist()

# Remaining object columns: start from all of them, then take out the ordinal
# columns and the target. `remove` raises if a name is missing, which guards
# against a silently changed schema.
categorical = df.select_dtypes(object).columns.tolist()
for column in [*ordinal, 'Heart_Disease']:
    categorical.remove(column)

# Separate the target from the predictors and hold out a test set.
# The split is stratified on the target because the target classes are imbalanced.
X = df.drop(columns='Heart_Disease')
y = df['Heart_Disease']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=12,
)

# Encode the target labels as integers. The encoder is fitted on the training
# labels only and then reused, unchanged, on the test labels.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)


# Category orderings for each ordinal variable, lowest to highest. The three
# lists are passed to the encoder in the same order as the `ordinal` columns
# (General_Health, Checkup, Age_Category).
health = ['Poor', 'Fair', 'Good', 'Very Good', 'Excellent']
check = [
    'Never', '5 or more years ago', 'Within the past 5 years',
    'Within the past 2 years', 'Within the past year'
]
age = [
    '18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59',
    '60-64', '65-69', '70-74', '75-79', '80+'
]

# Encoder that maps each ordinal category to its position in the lists above.
oe = OrdinalEncoder(categories=[health, check, age])

# Encoder that expands each unordered categorical variable into indicator columns.
ohe = OneHotEncoder()

# Column transformer to be used as the first step of each modelling pipeline.
# It reuses the `oe` / `ohe` instances defined above instead of building a
# second, duplicate pair, so the encoder configuration lives in one place.
# Columns not listed in `ordinal` or `categorical` are passed through untouched.
col_transformer = ColumnTransformer(transformers=[
    ('oe', oe, ordinal),
    ('ohe', ohe, categorical)
], remainder="passthrough")

def model_scores(model_name, model, X_train, y_train, model_list=None, notes=''):
    """Cross-validate ``model`` and append its mean scores to a results table.

    The model is evaluated with 5-fold stratified cross-validation (shuffled,
    ``random_state=12``) on F2, recall, accuracy, F1, ROC AUC and precision.

    Parameters
    ----------
    model_name : str
        Label stored in the ``model`` column of the results table.
    model : estimator
        Estimator or pipeline handed to ``cross_validate``.
    X_train, y_train
        Training predictors and target used for cross-validation.
    model_list : list of list, optional
        Rows produced by earlier calls. A list passed in is appended to in
        place; when omitted, a new empty list is started for this call.
    notes : str, optional
        Free-text remark stored in the ``notes`` column.

    Returns
    -------
    tuple
        ``(model_list, df)``: the list of rows including the new one, and a
        DataFrame built from that list. Scores are percentages rounded to two
        decimals; time is the mean fit time plus mean score time per fold,
        rounded to the nearest whole number.
    """
    # A literal ``[]`` default is created once and shared by every call that
    # relies on it, so rows would leak from one call into the next.
    if model_list is None:
        model_list = []

    metrics = ('f2', 'recall', 'accuracy', 'f1', 'roc_auc', 'precision')
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=12)
    scoring = {'f2': make_scorer(fbeta_score, beta=2),
               'recall': 'recall',
               'accuracy': 'accuracy',
               'f1': 'f1',
               'roc_auc': 'roc_auc',
               'precision': 'precision'}
    scores = cross_validate(model,
                            X_train,
                            y_train,
                            scoring=scoring,
                            cv=skf,
                            n_jobs=-1)

    elapsed = round(scores['fit_time'].mean() + scores['score_time'].mean(), 0)
    # Scale to a percentage before rounding: rounding first and multiplying
    # afterwards can reintroduce floating-point noise in the last digits.
    percentages = [round(scores[f'test_{metric}'].mean() * 100, 2) for metric in metrics]

    model_list.append([model_name, elapsed, *percentages, notes])
    results = pd.DataFrame(
        model_list,
        columns=['model', 'time (in s)', *metrics, 'notes'],
    )
    return model_list, results


### Dummy Model

In [9]:
# Reference point: a classifier that always predicts the most frequent class.
pipe = Pipeline(steps=[
    ('ct', col_transformer),
    ('dm', DummyClassifier(strategy='most_frequent')),
])
dummy_model = pipe.fit(X_train, y_train)

# Start a fresh results list with this model as its first row.
ml1, df1 = model_scores(
    model_name='DummyClassifier',
    model=dummy_model,
    X_train=X_train,
    y_train=y_train,
    model_list=[],
    notes="Dummy Model",
)
df1

Unnamed: 0,model,time (in s),f2,recall,accuracy,f1,roc_auc,precision,notes
0,DummyClassifier,1.0,0.0,0.0,91.92,0.0,50.0,0.0,Dummy Model


### Baseline Model

In [10]:
# Baseline: encoded features are standardised and fed to a logistic regression.
pipe = Pipeline(steps=[('ct', col_transformer),
                       ('ss', StandardScaler()),
                       ('log_reg', LogisticRegression(solver='lbfgs', max_iter=1000, random_state=12))])
logreg_model = pipe.fit(X_train, y_train)

# Pass a copy of the previous results: `model_scores` appends to the list it
# is given, so handing over `ml1` itself would add this row to `ml1` as well
# and would add it again every time this cell is re-run.
ml2, df2 = model_scores('LogisticRegression', logreg_model, X_train, y_train, model_list=list(ml1), notes="Baseline Model")
df2

Unnamed: 0,model,time (in s),f2,recall,accuracy,f1,roc_auc,precision,notes
0,DummyClassifier,1.0,0.0,0.0,91.92,0.0,50.0,0.0,Dummy Model
1,LogisticRegression,2.0,7.5,6.18,91.94,11.03,83.41,51.39,Baseline Model


### Model Iteration


In [None]:
# NOTE(review): these look like best-parameter dictionaries recorded from
# hyperparameter searches over a pipeline step named 'xbg' - confirm, and note
# which search/scoring produced each one. Several entries are exact repeats.
# As bare expressions they are not stored anywhere; running this cell would
# only display the last dictionary.
{'xbg__reg_lambda': 1, 'xbg__reg_alpha': 1, 'xbg__n_estimators': 250, 'xbg__min_child_weight': 7, 'xbg__max_depth': 3, 'xbg__gamma': 0.1, 'xbg__eta': 0.15, 'xbg__colsample_bytree': 0.26530612244897955}
{'xbg__reg_lambda': 10, 'xbg__reg_alpha': 0.01, 'xbg__n_estimators': 500, 'xbg__min_child_weight': 3, 'xbg__max_depth': 14, 'xbg__gamma': 0.1, 'xbg__eta': 0.01, 'xbg__colsample_bytree': 0.5306122448979591}
{'xbg__reg_lambda': 1, 'xbg__reg_alpha': 10, 'xbg__n_estimators': 650, 'xbg__min_child_weight': 9, 'xbg__max_depth': 3, 'xbg__gamma': 10, 'xbg__eta': 0.05, 'xbg__colsample_bytree': 0.8775510204081632}
{'xbg__reg_lambda': 0, 'xbg__reg_alpha': 0.01, 'xbg__n_estimators': 800, 'xbg__min_child_weight': 4, 'xbg__max_depth': 1, 'xbg__gamma': 10, 'xbg__eta': 0.001, 'xbg__colsample_bytree': 0.8163265306122448}
{'xbg__reg_lambda': 1, 'xbg__reg_alpha': 0, 'xbg__n_estimators': 550, 'xbg__min_child_weight': 8, 'xbg__max_depth': 9, 'xbg__gamma': 0, 'xbg__eta': 0.3, 'xbg__colsample_bytree': 0.5102040816326531}
{'xbg__reg_lambda': 1, 'xbg__reg_alpha': 0, 'xbg__n_estimators': 550, 'xbg__min_child_weight': 8, 'xbg__max_depth': 9, 'xbg__gamma': 0, 'xbg__eta': 0.3, 'xbg__colsample_bytree': 0.5102040816326531}
{'xbg__reg_lambda': 1, 'xbg__reg_alpha': 1, 'xbg__n_estimators': 250, 'xbg__min_child_weight': 7, 'xbg__max_depth': 3, 'xbg__gamma': 0.1, 'xbg__eta': 0.15, 'xbg__colsample_bytree': 0.26530612244897955}
{'xbg__reg_lambda': 1, 'xbg__reg_alpha': 0, 'xbg__n_estimators': 550, 'xbg__min_child_weight': 8, 'xbg__max_depth': 9, 'xbg__gamma': 0, 'xbg__eta': 0.3, 'xbg__colsample_bytree': 0.5102040816326531}

### Final Model
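I decided to have my final model be the step where `greenbelt` was dropped (index = 8) because the Adjusted R-Squared was high, the Mean Absolute Error was relatively low, and the Condition Number didn't seem to improve much as the iteration continued.

**TODO:** This paragraph and the variables listed below describe a price-regression model (`greenbelt`, Adjusted R-Squared, `price`), which does not match the heart-disease classification models evaluated above; revise it to describe the final classifier.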

**Target Variable:** price <br>
**Predictor Variables:** sqft_living_norm, waterfront, and 65 different zip codes.

In [None]:
# This function does the same thing as baseline, but for the final model.
# NOTE(review): `dm` is not imported or defined in the visible part of this
# notebook, and this cell has no execution count, so on a fresh kernel this
# line would raise a NameError - confirm where `dm.final_res` is meant to
# come from, or replace the call with a `model_scores(...)` run.
final_results, final_df = dm.final_res(df)
final_df

# I still wonder if 68 features is too many, perhaps this is why the condition
# number is still so large.
# NOTE(review): a feature count of 68 and a condition number are not produced
# anywhere in the visible cells - verify this remark applies to this notebook.