# Assignment
1. Define, design, and apply Crisp-DM methodology from Business to Decision.
2. Clarify the Business understanding phase in your project.
3. Specify the data preparation tasks and elaborate on their needs in your project.
4. Apply three Machine learning models. Elaborate on the mathematical requirements and explain each model.
5. Evaluate and validate the models using an appropriate measure of performance.
6. Deploy the best model and elaborate on the insights and findings of your projec

# Loading Libraries

In [1]:

from pandas import read_csv, DataFrame, concat
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import  GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from numpy import mean
from statsmodels.api import GLM, add_constant, families
from sklearn.svm import SVC
from numpy import mean
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Data loding and Identifying Input and Output

In [119]:
data = read_csv('dataset_CA/heart2.csv')
X = data.drop('HeartDisease', axis=1)  # input
y = data['HeartDisease']   # output
y.value_counts()

1    508
0    410
Name: HeartDisease, dtype: int64

# Data Preparation

In [120]:
def data_prep(X):
    OHE = OneHotEncoder(handle_unknown='ignore')

    data_sex_OHE = OHE.fit_transform(X[['Sex']])
    data_sex_DF = DataFrame(data_sex_OHE.toarray())
    data_sex_DF.columns = OHE.get_feature_names_out()

    data_ChestPainType_OHE = OHE.fit_transform(X[['ChestPainType']])
    data_ChestPainType_DF = DataFrame(data_ChestPainType_OHE.toarray())
    data_ChestPainType_DF.columns = OHE.get_feature_names_out()

    data_RestingECG_OHE = OHE.fit_transform(X[['RestingECG']])
    data_RestingECG_DF = DataFrame(data_RestingECG_OHE.toarray())
    data_RestingECG_DF.columns = OHE.get_feature_names_out()

    data_ExerciseAngina_OHE = OHE.fit_transform(X[['ExerciseAngina']])
    data_ExerciseAngina_DF = DataFrame(data_ExerciseAngina_OHE.toarray())
    data_ExerciseAngina_DF.columns = OHE.get_feature_names_out()

    data_ST_Slope_OHE = OHE.fit_transform(X[['ST_Slope']])
    data_ST_Slope_DF = DataFrame(data_ST_Slope_OHE.toarray())
    data_ST_Slope_DF.columns = OHE.get_feature_names_out()

    #***********************Merging multiple DataFrames***********************

    X_binary = concat([data_sex_DF, data_ChestPainType_DF, data_RestingECG_DF, data_ExerciseAngina_DF, data_ST_Slope_DF, X[['FastingBS']]], axis=1)
    X_scalable = X[['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']]  # Orginal numeric columns

    #***********************Applying StandardScaler***********************

    X_scaled = StandardScaler().fit_transform(X_scalable)
    X_scaled_DF = DataFrame(X_scaled)
    X_scaled_DF.columns = X_scalable.columns

    #***********************Applying MinMaxScaler***********************

    # X_scaled = MinMaxScaler().fit_transform(X_scalable)
    # X_scaled_DF = DataFrame(X_scaled)
    # X_scaled_DF.columns = X_scalable.columns

    X_PREP = concat([X_scalable, X_binary], axis=1)  # Prepared Data

    X = add_constant(X_PREP)  # Add Intercept

    # X = X_PREP
    return X

In [121]:
X = data_prep(X)

# Train-Test Splitting

In [122]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)  # 80% training and 20% test

# get_models() Defination

In [None]:
def get_models():
    models = dict()
    models['dt_ent'] = DecisionTreeClassifier(criterion='entropy')
    models['dt_gini'] = DecisionTreeClassifier(criterion='gini')
    models['lr'] = LogisticRegression(max_iter=10000)
    models['svc_l'] = SVC(kernel='linear')
    models['svc_r'] = SVC()
    models['svc_s'] = SVC(kernel='sigmoid')
    models['svc_p'] = SVC(kernel='poly')
    return models

# Model validation using Monte Calro sampling

In [None]:
def evaluate_model_mc(model, X, y, mc, split):
    accuracy = [] 
    for i in range(mc):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split)  # split dataset
        m = model.fit(X_train, y_train)  # fit the model
        prediction = m.predict(X_test)  # prediction
        accuracy.append(accuracy_score(y_test, prediction))  # compute & append accuracy
        return mean(accuracy)
    
def model_score_mc(file_url, y_value, mc, split):
    dataset = read_csv(file_url)
    X = dataset.drop(y_value, axis=1)  # matrix of input variables
    y = dataset[y_value]  # output variable
    models = get_models()
    # evaluate the models and store results
    results, names = list(), list()
    output = dict()
    for name, model in models.items():
        scores = evaluate_model_mc(model, X, y, mc, split)
        output[name] = scores
    return output

# Model validation based on k-fold cross validation

In [None]:
def cross_evaluator(model, X, y, k_fold):
    scores = cross_val_score(model, X, y, cv=k_fold)
    return mean(scores)

In [63]:

gnb = GaussianNB().fit(X_train, y_train) # Train the model using the training sets

prediction = gnb.predict(X_test)  # Predict the response for test dataset
df = DataFrame({'Prediction': prediction, "Actual": y_test})
df
accuracy = accuracy_score(y_test, prediction)
recall = recall_score(y_test, prediction, average='micro')
[accuracy, recall]

[0.8695652173913043, 0.8695652173913043]

# Model validation using Monte Calro sampling

In [46]:
accuracy = []
for r in range(1000):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    gnb = GaussianNB().fit(X_train, y_train)

    prediction = gnb.predict(X_test)  # Predict the response for test dataset
    accuracy.append(accuracy_score(y_test, prediction))
mean(accuracy)

0.8559782608695653

# Model validation based on Cross Validation score

In [47]:
gnb = GaussianNB()
scores = cross_val_score(gnb, X, y, cv=2)
mean(scores)

0.835511982570806

# Prediction: for the last sample in the dataset

In [53]:
X_input = X.tail(1)
gnb.fit(X, y)
print(y.tail(1))
gnb.predict(X_input)

917    0
Name: HeartDisease, dtype: int64


array([0], dtype=int64)

# Example 3: Use iris dataset
1.  Fit the Naive Bayes classifier  and evaluate the accuracy of the model in 100 mc runs. use 80% as the trainset and 20% testset.
2.  Recommend the type of flower for the following sample: X=[4,1,2,3].

In [77]:
from pandas import read_csv
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, recall_score
from numpy import mean

In [78]:
dataset = read_csv('dataset_CA/c2k_data_comma.csv')
X = data.drop('legs', axis=1)  # input
y = data['legs']   # output
# y.value_counts()

In [79]:
gnb = GaussianNB()
accuracy = []
for i in range(1000):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    gnb.fit(X_train, y_train)
    prediction = gnb.predict(X_test)
    accuracy.append(accuracy_score(prediction, y_test))
valid_acc = mean(accuracy)
valid_acc

ValueError: could not convert string to float: '?'

# 3-fold cross-validtion 
Validate the accuracy of GNB using 3 fold cross validation technique. 

In [5]:
cv = cross_val_score(gnb, X, y, cv=50)
mean(cv)

0.9533333333333335

# Model Deployment

In [6]:
model_gnb = gnb.fit(X.values, y.values) # Warning resoved
input = [[4, 1, 2, 3]]
model_gnb.predict(input)

array(['Iris-virginica'], dtype='<U15')

# Comparing with Logistic Regression

In [7]:
model_logistic = LogisticRegression(max_iter=10000) # Warning resolved
accuracy = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    model_logistic.fit(X_train, y_train)
    prediction = model_logistic.predict(X_test)
    accuracy.append(accuracy_score(prediction, y_test))
valid_acc = mean(accuracy)
valid_acc

0.96

Example: predict **Species** in the **Iris** dataset using Support Vector Machine. 
*   train the model using 80% and evaluate it based on 20% testset
*   apply svm for different kernels
*   validate the result in 100 MC runs
*   Recommend the type of flower for the first sample in the dataset using the best classifier 

In [56]:
from pandas import read_csv
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from numpy import mean

In [57]:
dataset = read_csv('dataset_CA/StudentsPerformanceEvaluation.csv')
X = dataset.drop(['GRADE', 'STUDENT ID'], axis=1)  # Input variables
y = dataset['GRADE']  # Output variable

# svm for different kernels

In [58]:
accuracy = []
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
for i in kernel:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)  # testset is 30%
    model = SVC(kernel=i)
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    accuracy.append(accuracy_score(y_test, prediction))
accuracy

[0.20454545454545456,
 0.3409090909090909,
 0.22727272727272727,
 0.20454545454545456]

# Validate the result of performance for different kernels of svc in 100 MC runs

In [11]:
accuracy = []
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
for i in kernel:
    accuracy_mc = []
    for j in range(100):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=j)  # testset is 30%
        model = SVC(kernel=i, C=1.1)
        model.fit(X_train, y_train)
        prediction = model.predict(X_test)
        accuracy_mc.append(accuracy_score(y_test, prediction))  # append accuracy score in each MC run
    accuracy.append(mean(accuracy_mc)) # mean of accuracy and append it in accuracy in kernel array
accuracy

[0.9740000000000002, 0.964, 0.9593333333333333, 0.2515555555555555]

# 10-fold cross validation score 

In [12]:
accuracy = []
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
for i in kernel:
    model = SVC(kernel=i)
    cross_v_score = cross_val_score(model, X, y, scoring='accuracy', cv=10) # cv=10: 10-fold cross validation
    # mean of accuracy and append it in accuracy in kernel array
    accuracy.append(mean(cross_v_score))
accuracy

[0.9733333333333334,
 0.9666666666666668,
 0.9733333333333334,
 0.06666666666666668]

# Repeat the above expriment for NB classifier and logistic regression 

In [13]:
accuracy_svc = []
accuracy_gnb = []
accuracy_lr = []

model_lr = LogisticRegression(max_iter=10000)
model_gnb = GaussianNB()

cross_v_score_lr = cross_val_score(model_lr, X, y, scoring='accuracy', cv=6)
cross_v_score_gnb = cross_val_score(gnb, X, y, scoring='accuracy', cv=6)
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
for i in kernel:
    model_svc = SVC(kernel=i)
    cross_v_score_svc = cross_val_score(model_svc, X, y, scoring='accuracy', cv=6)
    accuracy_svc.append(mean(cross_v_score_svc)) # mean of accuracy and append it in accuracy in kernel array
accuracy = [accuracy_svc, mean(cross_v_score_lr), mean(cross_v_score_gnb)]
accuracy

[[0.98, 0.96, 0.98, 0.16666666666666666],
 0.9666666666666667,
 0.9533333333333333]

# Use Gini index to specify the root node of the decision tree for playtennis dataset. 

# Example 1: 
use decision tree algorithm with gini and entropy rules to predict the species of flowers in the Iris dataset, and compare the result versus SVC and Multinomial logistic regression in 1000 mc runs. 

In [20]:
from pandas import read_csv
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from numpy import mean
from sklearn.tree import DecisionTreeClassifier # decision tee algorithm for classification
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC

In [21]:
dataset = read_csv('dataset/Iris.csv')
X = dataset.drop('Species', axis=1)  # matrix of input variables
y = dataset['Species']  # output variable

In [23]:
dt = DecisionTreeClassifier()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)  # split dataset
model_dt = dt.fit(X_train, y_train)  # fit the model
prediction = model_dt.predict(X_test)  # prediction
recall = recall_score(y_test, prediction, average='weighted')  # compute recall
recall
classification_report(y_test, prediction)


'                 precision    recall  f1-score   support\n\n    Iris-setosa       1.00      1.00      1.00        11\nIris-versicolor       0.89      1.00      0.94         8\n Iris-virginica       1.00      0.91      0.95        11\n\n       accuracy                           0.97        30\n      macro avg       0.96      0.97      0.96        30\n   weighted avg       0.97      0.97      0.97        30\n'

# Dictionary of Models

In [32]:
from pandas import read_csv
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from numpy import mean
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


def get_models():
    models = dict()
    models['dt_ent'] = DecisionTreeClassifier(criterion='entropy')
    models['dt_gini'] = DecisionTreeClassifier(criterion='gini')
    models['lr'] = LogisticRegression(max_iter=10000)
    models['svc_l'] = SVC(kernel='linear')
    models['svc_r'] = SVC()
    models['svc_s'] = SVC(kernel='sigmoid')
    models['svc_p'] = SVC(kernel='poly')
    return models

In [None]:
dataset = read_csv('dataset/Iris.csv')
X = dataset.drop('Species', axis=1)  # matrix of input variables
y = dataset['Species']  # output variable

In [33]:
def evaluate_model(model, X, y, mc_run, split):
    accuracy = [] 
    for i in range(mc_run):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split)  # split dataset
        m = model.fit(X_train, y_train)  # fit the model
        prediction = m.predict(X_test)  # prediction
        accuracy.append(accuracy_score(y_test, prediction))  # compute & append accuracy
        return mean(accuracy)

In [40]:
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, y, 100, 0.2)
    results.append(scores)
    names.append(name)
    print((name, mean(scores)))


('dt_ent', 0.9666666666666667)
('dt_gini', 0.9666666666666667)
('lr', 0.9666666666666667)
('svc_l', 1.0)
('svc_r', 0.9333333333333333)
('svc_s', 0.2)
('svc_p', 1.0)


# Evaluate the models in dictionary using k-fold cross validation

# Import Libraries

In [46]:
from pandas import read_csv
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from numpy import mean
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# get_models defination

In [47]:
def get_models():
    models = dict()
    models['dt_ent'] = DecisionTreeClassifier(criterion='entropy')
    models['dt_gini'] = DecisionTreeClassifier(criterion='gini')
    models['lr'] = LogisticRegression(max_iter=10000)
    models['svc_l'] = SVC(kernel='linear')
    models['svc_r'] = SVC()
    models['svc_s'] = SVC(kernel='sigmoid')
    models['svc_p'] = SVC(kernel='poly')
    return models

In [48]:
def cross_evaluator(model, X, y, k_fold):
    scores = cross_val_score(model, X, y, cv=k_fold)
    return mean(scores)

# Evaluation

In [49]:
models = get_models() # Get the model list to be evaluated
results, names = list(), list()
print('The following result is the accuracy of different model in the dictionary of SML based on 6-fold cross validation')
for name, model in models.items():
    scores = cross_evaluator(model, X, y, 6)
    results.append(scores)
    names.append(name)
    print((name, mean(scores)))
# plot model performance for comparison


The following result is the accuracy of different model in the dictionary of SML based on 6-fold cross validation
('dt_ent', 0.9533333333333333)
('dt_gini', 0.96)
('lr', 0.9666666666666667)
('svc_l', 0.98)
('svc_r', 0.98)
('svc_s', 0.16666666666666666)
('svc_p', 0.96)


# Working example for classification:
1. create a dictionary of models using svc, logistic regression, GuassianNB and Decision tree with Entropy kernel. 
2. evaluate the dictionary of models using recall and validate the result in 5-fold cross validation. 
3. deploy the best model to classify the last sample in the original dataset

In [None]:
# Import Libraries
from pandas import read_csv
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from numpy import mean
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Model Defination
def get_models():
    models = dict()
    models['dt_ent'] = DecisionTreeClassifier(criterion='entropy')
    models['dt_gini'] = DecisionTreeClassifier(criterion='gini')
    models['lr'] = LogisticRegression(max_iter=10000)
    models['svc_l'] = SVC(kernel='linear')
    models['svc_r'] = SVC()
    models['svc_s'] = SVC(kernel='sigmoid')
    models['svc_p'] = SVC(kernel='poly')
    return models

In [None]:
# data preprocesser
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)  # testset is 10%


def data_prep(X_train, X_test, y_train):

    # fit and apply the transform
    X_train_under, y_train_under = undersample.fit_resample(X_train, y_train)
    scalar = MinMaxScaler()
    N = scalar.fit_transform(X_train_under)
    X_strain = DataFrame(N)
    header = X_train.columns  # header from the original dataset
    X_strain.columns = header
    #############################
    N = scalar.fit_transform(X_test)
    X_stest = DataFrame(N)
    header = X.columns  # header from the original dataset
    X_stest.columns = header
    # undersampler

    return X_strain, X_stest, y_train_under
