# Assignment
1. Define, design, and apply Crisp-DM methodology from Business to Decision.
2. Clarify the Business understanding phase in your project.
3. Specify the data preparation tasks and elaborate on their needs in your project.
4. Apply three Machine learning models. Elaborate on the mathematical requirements and explain each model.
5. Evaluate and validate the models using an appropriate measure of performance.
6. Deploy the best model and elaborate on the insights and findings of your project.

# Loading Libraries

In [1]:

from pandas import read_csv, DataFrame, concat
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import  GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from numpy import mean
from statsmodels.api import GLM, add_constant, families
from sklearn.svm import SVC
from numpy import mean
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Data loading and Identifying Input and Output

In [119]:
# Read the heart-disease dataset and separate the target from the predictors.
data = read_csv('dataset_CA/heart2.csv')
y = data['HeartDisease']  # output: the label column
X = data.drop(columns='HeartDisease')  # input: every remaining column
y.value_counts()  # number of rows per target class

1    508
0    410
Name: HeartDisease, dtype: int64

# Data Preparation

In [120]:
def data_prep(X):
    """Build the model-ready design matrix from the raw feature frame.

    The categorical columns ``Sex``, ``ChestPainType``, ``RestingECG``,
    ``ExerciseAngina`` and ``ST_Slope`` are one-hot encoded, the numeric
    columns ``Age``, ``RestingBP``, ``Cholesterol``, ``MaxHR`` and
    ``Oldpeak`` are standardised with ``StandardScaler``, ``FastingBS`` is
    passed through unchanged, and an intercept column is added with
    ``add_constant``.

    Parameters
    ----------
    X : DataFrame
        Raw features containing every column named above.

    Returns
    -------
    DataFrame
        Intercept column, scaled numeric columns, one-hot columns and
        ``FastingBS`` (in that order), indexed like ``X``.

    Notes
    -----
    The encoders and the scaler are fitted on whatever frame is passed in.
    When this is called on the full dataset before splitting, the scaling
    statistics therefore include rows that later end up in a test split.
    """
    categorical_columns = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
    numeric_columns = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']

    #***********************One-hot encoding***********************

    encoded_frames = []
    for column in categorical_columns:
        encoder = OneHotEncoder(handle_unknown='ignore')
        encoded = encoder.fit_transform(X[[column]])
        # Reuse X's index so the later concat aligns row-for-row even when
        # X does not carry a default RangeIndex.
        encoded_frames.append(
            DataFrame(
                encoded.toarray(),
                columns=encoder.get_feature_names_out(),
                index=X.index,
            )
        )

    #***********************Merging multiple DataFrames***********************

    X_binary = concat(encoded_frames + [X[['FastingBS']]], axis=1)
    X_scalable = X[numeric_columns]  # Original numeric columns

    #***********************Applying StandardScaler***********************
    # (MinMaxScaler is imported at file level and can be swapped in here to
    # compare scalings.)

    X_scaled_DF = DataFrame(
        StandardScaler().fit_transform(X_scalable),
        columns=X_scalable.columns,
        index=X.index,
    )

    # Use the *scaled* numeric columns; concatenating X_scalable here would
    # silently discard the scaling step.
    X_PREP = concat([X_scaled_DF, X_binary], axis=1)  # Prepared Data

    return add_constant(X_PREP)  # Add Intercept

In [121]:
# Replace the raw feature frame with its prepared counterpart.
# NOTE: X is rebound here, so re-running this cell without reloading the data
# passes an already-prepared frame (without the raw columns) to data_prep.
X = data_prep(X)

# Train-Test Splitting

In [122]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)  # 80% training and 20% test

# get_models() Definition

In [None]:
def get_models():
    """Return freshly constructed candidate classifiers keyed by short name.

    The dictionary holds two decision trees (entropy and gini criteria), a
    logistic regression, and four support-vector classifiers that differ
    only in their kernel.
    """
    return {
        'dt_ent': DecisionTreeClassifier(criterion='entropy'),
        'dt_gini': DecisionTreeClassifier(criterion='gini'),
        'lr': LogisticRegression(max_iter=10000),
        'svc_linear': SVC(kernel='linear'),
        'svc_rbf': SVC(),  # SVC's default kernel
        'svc_sigmoid': SVC(kernel='sigmoid'),
        'svc_poly': SVC(kernel='poly'),
    }

# Model validation using Monte Carlo sampling

In [2]:
def evaluate_model_mc(model, X, y, mc, split):
    """Estimate a model's accuracy over repeated random train/test splits.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is refitted
        on every repetition.
    X, y
        Features and labels to split.
    mc : int
        Number of Monte Carlo repetitions. Repetition ``i`` uses
        ``random_state=i``, so results are reproducible.
    split
        Passed to ``train_test_split`` as ``test_size``.

    Returns
    -------
    float
        Mean test-set accuracy across all ``mc`` repetitions.

    Raises
    ------
    ValueError
        If ``mc`` is smaller than 1, since no split could be evaluated.
    """
    if mc < 1:
        raise ValueError('mc must be at least 1')

    accuracy = []
    for i in range(mc):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=split, random_state=i
        )  # split dataset
        fitted = model.fit(X_train, y_train)  # fit the model
        prediction = fitted.predict(X_test)  # prediction
        accuracy.append(accuracy_score(y_test, prediction))  # compute & append accuracy

    # Average only after every repetition has run; returning from inside the
    # loop would report the accuracy of the first split alone.
    return mean(accuracy)
    
def model_score_mc(X, y, mc, split):
    """Evaluate every model from ``get_models()`` and collect the results.

    Each model is passed to ``evaluate_model_mc`` together with ``X``, ``y``,
    ``mc`` and ``split``; the returned dictionary maps the model's name to
    whatever that call returns.
    """
    return {
        label: evaluate_model_mc(estimator, X, y, mc, split)
        for label, estimator in get_models().items()
    }

In [None]:
# Score every candidate model with mc=100 repetitions and split=0.2
# (the test_size used for each train/test split).
score = model_score_mc(X, y, 100, 0.2)

# Conclusion: SVM with linear kernel provides us the best classifier

# Best Model Deployment

In [None]:
# Refit the selected classifier on the full prepared dataset for deployment.
best_model = SVC(kernel='linear')
best_model.fit(X, y)  # fit the object created above (the name `best` is never defined)

# Prediction

In [None]:
# Sanity-check the deployed model on the last row of the prepared data.
# NOTE: this row comes from the same X the deployed model is meant to be
# fitted on, so agreement with the actual label does not show generalisation.
sample = X.tail(1)  # not named `input`, which would shadow the built-in input()
prediction = best_model.predict(sample)
print('Prediction', prediction)
y.tail(1)  # actual label of the same row, for comparison

# Model validation based on k-fold cross validation

In [None]:
def cross_evaluator(model, X, y, k_fold):
    """Return the mean cross-validation score of ``model`` on ``(X, y)``.

    ``k_fold`` is forwarded unchanged as the ``cv`` argument of
    ``cross_val_score``; the per-fold scores are averaged with ``mean``.
    """
    return mean(cross_val_score(model, X, y, cv=k_fold))

In [63]:

# No train/test split is active earlier in the notebook (the only one is
# commented out), so the hold-out sets are created here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)  # 80% training and 20% test

gnb = GaussianNB().fit(X_train, y_train)  # Train the model using the training sets

prediction = gnb.predict(X_test)  # Predict the response for test dataset
df = DataFrame({'Prediction': prediction, "Actual": y_test})
df
accuracy = accuracy_score(y_test, prediction)
# NOTE: with average='micro' on single-label targets, recall is computed from
# global counts and therefore equals accuracy; use the default binary
# averaging to get the recall of the positive class instead.
recall = recall_score(y_test, prediction, average='micro')
[accuracy, recall]

[0.8695652173913043, 0.8695652173913043]