# Assignment
1. Define, design, and apply Crisp-DM methodology from Business to Decision.
2. Clarify the Business understanding phase in your project.
3. Specify the data preparation tasks and elaborate on their needs in your project.
4. Apply three Machine learning models. Elaborate on the mathematical requirements and explain each model.
5. Evaluate and validate the models using an appropriate measure of performance.
6. Deploy the best model and elaborate on the insights and findings of your projec

# Loading Libraries

In [24]:

from pandas import read_csv, DataFrame, concat
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import  GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from statsmodels.api import GLM, add_constant, families
from sklearn.svm import SVC
from numpy import mean, std
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# Data loding and Identifying Input and Output

In [32]:
data = read_csv('dataset_CA/heart2.csv')
X = data.drop('HeartDisease', axis=1)  # input
y = data['HeartDisease']   # output
print(y.value_counts())
oversample = RandomOverSampler(sampling_strategy='minority')
X_over, y_over = oversample.fit_resample(X, y)

print(Counter(y_over))

1    508
0    410
Name: HeartDisease, dtype: int64
Counter({0: 508, 1: 508})


# Data Preparation

In [33]:
def data_prep(X_over):
    OHE = OneHotEncoder(handle_unknown='ignore')

    data_sex_OHE = OHE.fit_transform(X_over[['Sex']])
    data_sex_DF = DataFrame(data_sex_OHE.toarray())
    data_sex_DF.columns = OHE.get_feature_names_out()

    data_ChestPainType_OHE = OHE.fit_transform(X_over[['ChestPainType']])
    data_ChestPainType_DF = DataFrame(data_ChestPainType_OHE.toarray())
    data_ChestPainType_DF.columns = OHE.get_feature_names_out()

    data_RestingECG_OHE = OHE.fit_transform(X_over[['RestingECG']])
    data_RestingECG_DF = DataFrame(data_RestingECG_OHE.toarray())
    data_RestingECG_DF.columns = OHE.get_feature_names_out()

    data_ExerciseAngina_OHE = OHE.fit_transform(X_over[['ExerciseAngina']])
    data_ExerciseAngina_DF = DataFrame(data_ExerciseAngina_OHE.toarray())
    data_ExerciseAngina_DF.columns = OHE.get_feature_names_out()

    data_ST_Slope_OHE = OHE.fit_transform(X_over[['ST_Slope']])
    data_ST_Slope_DF = DataFrame(data_ST_Slope_OHE.toarray())
    data_ST_Slope_DF.columns = OHE.get_feature_names_out()

    #***********************Merging multiple DataFrames***********************

    X_binary = concat([data_sex_DF, data_ChestPainType_DF, data_RestingECG_DF, data_ExerciseAngina_DF, data_ST_Slope_DF, X_over[['FastingBS']]], axis=1)
    X_scalable = X_over[['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']]  # Orginal numeric columns

    #***********************Applying StandardScaler***********************

    # X_scaled = StandardScaler().fit_transform(X_scalable)
    # X_scaled_DF = DataFrame(X_scaled)
    # X_scaled_DF.columns = X_scalable.columns

    #***********************Applying MinMaxScaler***********************

    X_scaled = MinMaxScaler().fit_transform(X_scalable)
    X_scaled_DF = DataFrame(X_scaled)
    X_scaled_DF.columns = X_scalable.columns

    X_PREP = concat([X_scalable, X_binary], axis=1)  # Prepared Data

    X_over = add_constant(X_PREP)  # Add Intercept

    # X = X_PREP
    return X_over

In [34]:
X_over = data_prep(X_over)

# Train-Test Splitting

In [122]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)  # 80% training and 20% test

# get_base_models() Defination

In [35]:
def get_base_models():
    models = dict()
    models['dt_ent'] = DecisionTreeClassifier(criterion='entropy')
    models['dt_gini'] = DecisionTreeClassifier(criterion='gini')
    models['lr'] = LogisticRegression(max_iter=10000)
    models['svc_linear'] = SVC(kernel='linear')
    models['svc_rbf'] = SVC()
    models['svc_sigmoid'] = SVC(kernel='sigmoid')
    models['svc_poly'] = SVC(kernel='poly')
    return models

# Model validation based on k-fold cross validation

In [36]:
# Evaluate models by cross validation score
def evaluate_model_by_cv(X_over, y_over):
    models = get_base_models()
    score = dict()
    for name, model in models.items():
        scores = cross_val_score(model, X_over, y_over, scoring="recall_weighted")
        score[name] = scores
    return score

score = evaluate_model_by_cv(X_over, y_over)
print('*********Cross Validation Score for each Model*********')
for item in score:
    print('>Model: %s, Mean Score: %.3f, Standard Deviation: %.3f' % (item, mean(score[item]), std(score[item])))

*********Cross Validation Score for each Model*********
>Model: dt_ent, Mean Score: 0.809, Standard Deviation: 0.019
>Model: lr, Mean Score: 0.821, Standard Deviation: 0.037


# Conclusion: SVM with linear kernel provides us the best classifier

# Best Model Deployment

In [None]:
best_model = BaggingClassifier(estimator=DecisionTreeClassifier(criterion='entropy'), n_estimators=50, max_samples=0.8, max_features=0.8)
best_model.fit(X, y)

# Prediction

In [None]:
input = X.tail(1)
prediction = best_model.predict(input)
print('Prediction', prediction)
y.tail(1)