# Example
use diamonds dataset to predict **price** in terms of other variables. For this, 
a. estimate the parameters
b. select the imporant input features 
c. write down the predictive model 

since price is practically continous therefore we need to apply linear regression:
to do the following tasks:
1.   parameter learning
2.   feature selection
3.   predictive model (use steps 1 and 2 and link function)
4.   prediction 
5.   Evaluate the performance of prediction (split the dataset into trainset and testset)

We can do Linear regression using 2 different ways:
1.  using statistical learning
2.  using sklearn

data modeling:
1. import the function of the model
2. create the model
3.  fit the model
4. prediction using the model

# Statistical way to recall GLM

In [114]:
#Import neccessary library
from statistics import mean
from statsmodels.api import GLM, add_constant, families
from pandas import read_csv, DataFrame, concat
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

# Data Preparation

In [116]:
data = read_csv('dataset/diamonds.csv')

X = data.drop(['price', 'Unnamed: 0'], axis=1)  # Input variables
y = data['price']  # Output variable

OE = OrdinalEncoder(categories=[['Fair', 'Good', 'Very Good', 'Ideal', 'Premium']])

data_cut_OE = OE.fit_transform(X[['cut']])
data_cut_DF = DataFrame(data_cut_OE)
data_cut_DF.columns = ['encoded cut']

OHE = OneHotEncoder(handle_unknown='ignore')

data_color_OHE = OHE.fit_transform(X[['color']])
data_color_DF = DataFrame(data_color_OHE.toarray())
data_color_DF.columns = OHE.get_feature_names_out()

data_clarity_OHE = OHE.fit_transform(X[['clarity']])
data_clarity_DF = DataFrame(data_clarity_OHE.toarray())
data_clarity_DF.columns = OHE.get_feature_names_out()

# Merging multiple DataFrames

In [117]:
X_binary = concat([data_color_DF, data_clarity_DF], axis=1) # Excluding data_cut_DF since it is ordinal
X_scalable = X[['carat', 'depth', 'table', 'length_mm', 'width_mm', 'depth_mm']]  # Orginal numeric columns
X_scalable = concat([X_scalable, data_cut_DF], axis=1)  # Including encoded ordinal columns

# Applying StandardScaler

In [118]:
X_scaled = StandardScaler().fit_transform(X_scalable)
X_scaled_DF = DataFrame(X_scaled)
X_scaled_DF.columns = X_scalable.columns

In [119]:
X_PREP = concat([X_scaled_DF, X_binary], axis=1)  # Prepared Data
X_PREP

Unnamed: 0,carat,depth,table,length_mm,width_mm,depth_mm,encoded cut,color_D,color_E,color_F,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,-2.206418,-0.107785,-1.142937,-2.460814,-2.455435,-2.465387,0.225867,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,-2.305673,-1.092137,1.333186,-2.552417,-2.673932,-2.761965,1.099829,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,-2.206418,-2.771325,2.983935,-2.308143,-2.314972,-2.761965,-1.522057,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-1.908655,0.413343,0.095125,-2.079136,-2.065261,-1.971090,1.099829,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-1.809400,0.934470,0.095125,-1.865396,-1.877977,-1.674511,-1.522057,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
854,0.225316,-0.107785,-0.730250,0.241466,0.369425,0.277963,0.225867,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
855,0.721588,-1.265846,-0.317563,0.852150,0.837634,0.500397,0.225867,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
856,0.274943,-0.223591,0.507812,0.378870,0.353818,0.302678,1.099829,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
857,1.813387,-0.049882,1.333186,1.386500,1.399485,1.365417,1.099829,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Data Modelling for feature selection / reduced model

In [119]:
X = add_constant(X_PREP)  # Ading constant column to handle bias; 
model_full = GLM(y, X, family=families.Gaussian(families.links.log())).fit()

In [120]:
pvalues = dict(model_full.pvalues)  # Identifying the columns to drop
drop_columns = []
for x in pvalues:
    if pvalues[x] >= 0.05:
        drop_columns.append(x)
# print(drop_columns)
X_red = X.drop(drop_columns, axis=1)

Predictive Model:  yhat=5.455245+0.001909*ZN+0.022373*INDUS+0.028275*RAD-0.002238*MEDV    

In [121]:
rmse_full = []
rmse_red = []
for i in range(20):
    X = add_constant(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    #*************Full Data Modelling*************
    
    # Creating the Linear Regression Model
    model_full = GLM(y_train, X_train, family=families.Gaussian(families.links.log())).fit()

    # Predicting using Test-set
    pred_full = model_full.predict(X_test)
    rmse_full.append(sqrt(mean_squared_error(y_test, pred_full)))
    
    #*************Reduced Data Modelling*************
    
    # Creating the Linear Regression Model
    X = add_constant(X_red)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    model_reduced = GLM(y_train, X_train, family=families.Gaussian(families.links.log())).fit()
    
    # Predicting using Test-set
    pred_red = model_reduced.predict(X_test)
    rmse_red.append(sqrt(mean_squared_error(y_test, pred_red)))

[mean(rmse_full), mean(rmse_red)]

[225.3099879525569, 228.72941658063053]

# Deployment phase 

In [122]:
X = add_constant(X_red)
model_dep = GLM(y, X, family=families.Gaussian(families.links.log())).fit()  # create the linear reg model using

# Predicting using testset

In [123]:
new = X.tail(1)
pred_red = model_dep.predict(new)
print(pred_red)
y.tail(1)

858    2824.675874
dtype: float64


858    2871
Name: price, dtype: int64

# Statistical & sklearn way to recall GLM

In [1]:
# Import neccessary library
from statistics import mean
from statsmodels.api import GLM, add_constant, families
from pandas import read_csv, DataFrame, concat
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

# Data Preparation

In [3]:
data = read_csv('dataset/diamonds.csv')

X = data.drop(['price', 'Unnamed: 0'], axis=1)  # Input variables
y = data['price']  # Output variable

OE = OrdinalEncoder(categories=[['Fair', 'Good', 'Very Good', 'Ideal', 'Premium']])
data_cut_OE = OE.fit_transform(X[['cut']])
data_cut_DF = DataFrame(data_cut_OE)
data_cut_DF.columns = ['encoded cut']

OHE = OneHotEncoder(handle_unknown='ignore')

data_color_OHE = OHE.fit_transform(X[['color']])
data_color_DF = DataFrame(data_color_OHE.toarray())
data_color_DF.columns = OHE.get_feature_names_out()

data_clarity_OHE = OHE.fit_transform(X[['clarity']])
data_clarity_DF = DataFrame(data_clarity_OHE.toarray())
data_clarity_DF.columns = OHE.get_feature_names_out()

['Unnamed: 0', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'length_mm', 'width_mm', 'depth_mm']
<class 'pandas.core.indexes.base.Index'>


# Merging multiple DataFrames

In [126]:
X_binary = concat([data_color_DF, data_clarity_DF], axis=1) # Excluding data_cut_DF since it is ordinal
X_scalable = X[['carat', 'depth', 'table', 'length_mm', 'width_mm', 'depth_mm']]  # Orginal numeric columns
X_scalable = concat([X_scalable, data_cut_DF], axis=1) # Including encoded ordinal columns

# Applying standard scalar

In [127]:
X_scaled = StandardScaler().fit_transform(X_scalable)
X_scaled_DF = DataFrame(X_scaled)
X_scaled_DF.columns = X_scalable.columns

In [128]:
X_PREP = concat([X_scaled_DF, X_binary], axis=1)  # Prepared Data

# Data Modelling for feature selection / reduced model

In [129]:
X = add_constant(X_PREP)  # Ading constant column to handle bias
model_full = GLM(y, X, family=families.Gaussian(families.links.log())).fit()

In [130]:
pvalues = dict(model_full.pvalues)  # Identifying the columns to drop
drop_columns = []
for x in pvalues:
    if pvalues[x] >= 0.05:
        drop_columns.append(x)
# print(drop_columns)
X_red = X.drop(drop_columns, axis=1)

Predictive Model:  yhat=5.455245+0.001909*ZN+0.022373*INDUS+0.028275*RAD-0.002238*MEDV

In [131]:
rmse_st_full = []
rmse_st_red = []
rmse_skl_full = []
rmse_skl_red = []

model_skl = LinearRegression()
for i in range(20):
    #************************* Full Data Modelling  ***************************#
    X = add_constant(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    # statsmodels
    model_st_full = GLM(y_train, X_train, family=families.Gaussian(families.links.log())).fit()

    # Predicting using Test-set
    pred_st_full = model_st_full.predict(X_test)
    rmse_st_full.append(sqrt(mean_squared_error(y_test, pred_st_full)))
    
    # sklearn
    model_skl.fit(X_train, y_train)
    
    # Predicting using Test-set
    pred_skl_full = model_skl.predict(X_test)
    rmse_skl_full.append(sqrt(mean_squared_error(y_test, pred_skl_full)))
    
    #************************* Reduced Data Modelling  ***************************#
    X = add_constant(X_red)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    # statsmodels
    model_st_red = GLM(y_train, X_train, family=families.Gaussian(families.links.log())).fit()  # create the linear reg model using
    
    # Predicting using Test-set
    pred_red = model_reduced.predict(X_test)
    rmse_st_red.append(sqrt(mean_squared_error(y_test, pred_red)))
    
    # sklearn
    model_skl.fit(X_train, y_train)
    
    # Predicting using Test-set
    pred_skl_red = model_skl.predict(X_test)
    rmse_skl_red.append(sqrt(mean_squared_error(y_test, pred_skl_red)))
    

[mean(rmse_st_full), mean(rmse_st_red), mean(rmse_skl_full), mean(rmse_skl_red)]

[231.3826222352573, 221.00936319241126, 164.57963498013402, 158.71981793809638]

# Deployment Phase

In [132]:
X = add_constant(X_red)
model_skl_red_dep = model_skl.fit(X, y)

# Predicting using Test-set
# Predict the tax using the linear regression for the last row in the boston_house-prices dataset

In [133]:
prediction = model_skl_red_dep.predict(X.tail(1))
print(prediction)
y.tail(1)

[2809.03468253]


858    2871
Name: price, dtype: int64

# Logistic regression
Use boston_house price dataset, 
1.   Consider CHAS as the output variable 
2.   the remaining variables are inputs 
apply logistic regression to do the following tasks:
*  parameter estimation 
*   attribute selection 
*  predictive model
*   prediction

In [94]:
# Import neccessary library
from statistics import mean
from statsmodels.api import GLM, add_constant, families
from pandas import read_csv, DataFrame
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import accuracy_score, recall_score

# Data Preparation

In [95]:
data = read_csv('dataset_CA/heart.csv')
data.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [96]:
data['output'].unique()

array([1, 0], dtype=int64)

In [97]:
X = data.drop('output', axis=1)  # input
y = data['output']   # output

# Data Modelling for feature selection / reduced model

# Logistic regression using statsmodels

In [98]:
X = add_constant(X)  # add intercept
model_st = GLM(y, X, family=families.Binomial()).fit()

In [99]:
pvalues = dict(model_st.pvalues)  # Identifying the columns to drop
drop_columns = []
for x in pvalues:
    if pvalues[x] >= 0.05:
        drop_columns.append(x)
print("Columns to be romoved:", drop_columns)

Columns to be romoved: ['const', 'age', 'trtbps', 'chol', 'fbs', 'restecg', 'slp']


In [100]:
X_red = X.drop(drop_columns, axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=3)  # testset is 10%

Predictive model based on above results:

that_i = 0.0952 * INDUS_i + 0.2191 * RAD_i - 0.0087 * TAX_i + 0.0693 * MEDV_i

1.   phat_i = 1 / (1 + exp(-that_i))
2.   yhat_i = 1 if phat_i >= 0.5 else yhat_i = 0

In [101]:
model_st_red = GLM(y_train, X_train, family=families.Binomial()).fit() # logistic regression

In [102]:
prediction_st_raw = model_st_red.predict(X_test)

# Converting predictions to binary predicted values

In [103]:
prediction_st = [1 if prediction > 0.5 else 0 for prediction in prediction_st_raw]
df = DataFrame({"Prediction": prediction_st, "Actual": y_test})
df.head()


Unnamed: 0,Prediction,Actual
245,0,0
162,1,1
10,1,1
161,1,1
73,1,1


# Logistic regression using sklearn

In [104]:
model_skl = LogisticRegression(max_iter=10000).fit(X_train, y_train)
prediction_skl = model_skl.predict(X_test)

In [105]:
df = DataFrame({'Prediction': prediction_skl, "Actual": y_test})
df

Unnamed: 0,Prediction,Actual
245,0,0
162,1,1
10,1,1
161,1,1
73,1,1
16,1,1
180,0,0
195,0,0
200,1,0
47,1,1


# Compute recall and accuracy

In [106]:
# Prediction_st
acc_st = accuracy_score(y_test, prediction_st)
re_st = recall_score(y_test, prediction_st)
acc_skl = accuracy_score(y_test, prediction_skl)
re_skl = recall_score(y_test, prediction_skl)
print([acc_st, acc_skl])
[re_st, re_skl]


[0.8709677419354839, 0.8709677419354839]


[0.9473684210526315, 0.9473684210526315]

#Example

Use **winequality-red** dataset, apply NB classifier to predict **quality** variable. to do so, 

1. split the dataset into 70/30%

2. evaluate the model using **accurary** and **recall**.

3. validate the result using MC sampling with mc=100. 

4. predict the quality for a give set of input variables. 

5. redo the classification task using logistic regression and compare the result versus Naive Bayes classifier in 100 MC runs. Specify the best classifier

In [1]:
import pandas as pd
from pandas import read_csv
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import  GaussianNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, recall_score
from numpy import mean

In [2]:
dataset = pd.read_csv('dataset_CA/Customers.csv')
X = data.drop('output', axis=1)  # input
y = data['output']   # output
y.value_counts()

NameError: name 'data' is not defined

# Model evaluation

In [109]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)  # 70% training and 30% testgnb = GaussianNB()
gnb = GaussianNB().fit(X_train, y_train) # Train the model using the training sets

prediction = gnb.predict(X_test)  # Predict the response for test dataset
df = DataFrame({'Prediction': prediction, "Actual": y_test})
df
# accuracy = accuracy_score(y_test, prediction)
# recall = recall_score(y_test, prediction, average='micro')
# [accuracy, recall]

Unnamed: 0,Prediction,Actual
105,1,1
154,1,1
133,1,1
223,0,0
198,0,0
...,...,...
82,1,1
108,1,1
249,0,0
176,0,0


# Model validation using Monte Calro sampling

In [110]:
accuracy = []
for r in range(1000):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    gnb = GaussianNB().fit(X_train, y_train)

    prediction = gnb.predict(X_test)  # Predict the response for test dataset
    accuracy.append(accuracy_score(y_test, prediction))
mean(accuracy)

0.8168791208791211

# Model validation based on Cross Validation score

In [111]:
gnb = GaussianNB()
scores = cross_val_score(gnb, X, y, cv=2)
mean(scores)

0.8118464621819449

# Prediction: for the last sample in the dataset

In [112]:
X_input = X.tail(1)
gnb.fit(X, y)
gnb.predict(X_input)

array([1], dtype=int64)

# Example 3: Use iris dataset
1.  Fit the Naive Bayes classifier  and evaluate the accuracy of the model in 100 mc runs. use 80% as the trainset and 20% testset.
2.  Recommend the type of flower for the following sample: X=[4,1,2,3].

In [77]:
from pandas import read_csv
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, recall_score
from numpy import mean

In [78]:
dataset = read_csv('dataset_CA/c2k_data_comma.csv')
X = data.drop('legs', axis=1)  # input
y = data['legs']   # output
# y.value_counts()

In [79]:
gnb = GaussianNB()
accuracy = []
for i in range(1000):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    gnb.fit(X_train, y_train)
    prediction = gnb.predict(X_test)
    accuracy.append(accuracy_score(prediction, y_test))
valid_acc = mean(accuracy)
valid_acc

ValueError: could not convert string to float: '?'

# 3-fold cross-validtion 
Validate the accuracy of GNB using 3 fold cross validation technique. 

In [5]:
cv = cross_val_score(gnb, X, y, cv=50)
mean(cv)

0.9533333333333335

# Model Deployment

In [6]:
model_gnb = gnb.fit(X.values, y.values) # Warning resoved
input = [[4, 1, 2, 3]]
model_gnb.predict(input)

array(['Iris-virginica'], dtype='<U15')

# Comparing with Logistic Regression

In [7]:
model_logistic = LogisticRegression(max_iter=10000) # Warning resolved
accuracy = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    model_logistic.fit(X_train, y_train)
    prediction = model_logistic.predict(X_test)
    accuracy.append(accuracy_score(prediction, y_test))
valid_acc = mean(accuracy)
valid_acc

0.96

Example: predict **Species** in the **Iris** dataset using Support Vector Machine. 
*   train the model using 80% and evaluate it based on 20% testset
*   apply svm for different kernels
*   validate the result in 100 MC runs
*   Recommend the type of flower for the first sample in the dataset using the best classifier 

In [56]:
from pandas import read_csv
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from numpy import mean

In [57]:
dataset = read_csv('dataset_CA/StudentsPerformanceEvaluation.csv')
X = dataset.drop(['GRADE', 'STUDENT ID'], axis=1)  # Input variables
y = dataset['GRADE']  # Output variable

# svm for different kernels

In [58]:
accuracy = []
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
for i in kernel:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)  # testset is 30%
    model = SVC(kernel=i)
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    accuracy.append(accuracy_score(y_test, prediction))
accuracy

[0.20454545454545456,
 0.3409090909090909,
 0.22727272727272727,
 0.20454545454545456]

# Validate the result of performance for different kernels of svc in 100 MC runs

In [11]:
accuracy = []
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
for i in kernel:
    accuracy_mc = []
    for j in range(100):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=j)  # testset is 30%
        model = SVC(kernel=i, C=1.1)
        model.fit(X_train, y_train)
        prediction = model.predict(X_test)
        accuracy_mc.append(accuracy_score(y_test, prediction))  # append accuracy score in each MC run
    accuracy.append(mean(accuracy_mc)) # mean of accuracy and append it in accuracy in kernel array
accuracy

[0.9740000000000002, 0.964, 0.9593333333333333, 0.2515555555555555]

# 10-fold cross validation score 

In [12]:
accuracy = []
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
for i in kernel:
    model = SVC(kernel=i)
    cross_v_score = cross_val_score(model, X, y, scoring='accuracy', cv=10) # cv=10: 10-fold cross validation
    # mean of accuracy and append it in accuracy in kernel array
    accuracy.append(mean(cross_v_score))
accuracy

[0.9733333333333334,
 0.9666666666666668,
 0.9733333333333334,
 0.06666666666666668]

# Repeat the above expriment for NB classifier and logistic regression 

In [13]:
accuracy_svc = []
accuracy_gnb = []
accuracy_lr = []

model_lr = LogisticRegression(max_iter=10000)
model_gnb = GaussianNB()

cross_v_score_lr = cross_val_score(model_lr, X, y, scoring='accuracy', cv=6)
cross_v_score_gnb = cross_val_score(gnb, X, y, scoring='accuracy', cv=6)
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
for i in kernel:
    model_svc = SVC(kernel=i)
    cross_v_score_svc = cross_val_score(model_svc, X, y, scoring='accuracy', cv=6)
    accuracy_svc.append(mean(cross_v_score_svc)) # mean of accuracy and append it in accuracy in kernel array
accuracy = [accuracy_svc, mean(cross_v_score_lr), mean(cross_v_score_gnb)]
accuracy

[[0.98, 0.96, 0.98, 0.16666666666666666],
 0.9666666666666667,
 0.9533333333333333]

# Use Gini index to specify the root node of the decision tree for playtennis dataset. 

# Example 1: 
use decision tree algorithm with gini and entropy rules to predict the species of flowers in the Iris dataset, and compare the result versus SVC and Multinomial logistic regression in 1000 mc runs. 

In [20]:
from pandas import read_csv
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from numpy import mean
from sklearn.tree import DecisionTreeClassifier # decision tee algorithm for classification
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC

In [21]:
dataset = read_csv('dataset/Iris.csv')
X = dataset.drop('Species', axis=1)  # matrix of input variables
y = dataset['Species']  # output variable

In [23]:
dt = DecisionTreeClassifier()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)  # split dataset
model_dt = dt.fit(X_train, y_train)  # fit the model
prediction = model_dt.predict(X_test)  # prediction
recall = recall_score(y_test, prediction, average='weighted')  # compute recall
recall
classification_report(y_test, prediction)


'                 precision    recall  f1-score   support\n\n    Iris-setosa       1.00      1.00      1.00        11\nIris-versicolor       0.89      1.00      0.94         8\n Iris-virginica       1.00      0.91      0.95        11\n\n       accuracy                           0.97        30\n      macro avg       0.96      0.97      0.96        30\n   weighted avg       0.97      0.97      0.97        30\n'

# Dictionary of Models

In [32]:
from pandas import read_csv
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from numpy import mean
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


def get_models():
    models = dict()
    models['dt_ent'] = DecisionTreeClassifier(criterion='entropy')
    models['dt_gini'] = DecisionTreeClassifier(criterion='gini')
    models['lr'] = LogisticRegression(max_iter=10000)
    models['svc_l'] = SVC(kernel='linear')
    models['svc_r'] = SVC()
    models['svc_s'] = SVC(kernel='sigmoid')
    models['svc_p'] = SVC(kernel='poly')
    return models

In [None]:
dataset = read_csv('dataset/Iris.csv')
X = dataset.drop('Species', axis=1)  # matrix of input variables
y = dataset['Species']  # output variable

In [33]:
def evaluate_model(model, X, y, mc_run, split):
    accuracy = [] 
    for i in range(mc_run):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split)  # split dataset
        m = model.fit(X_train, y_train)  # fit the model
        prediction = m.predict(X_test)  # prediction
        accuracy.append(accuracy_score(y_test, prediction))  # compute & append accuracy
        return mean(accuracy)

In [40]:
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, y, 100, 0.2)
    results.append(scores)
    names.append(name)
    print((name, mean(scores)))


('dt_ent', 0.9666666666666667)
('dt_gini', 0.9666666666666667)
('lr', 0.9666666666666667)
('svc_l', 1.0)
('svc_r', 0.9333333333333333)
('svc_s', 0.2)
('svc_p', 1.0)


# Evaluate the models in dictionary using k-fold cross validation

# Import Libraries

In [46]:
from pandas import read_csv
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from numpy import mean
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# get_models defination

In [47]:
def get_models():
    models = dict()
    models['dt_ent'] = DecisionTreeClassifier(criterion='entropy')
    models['dt_gini'] = DecisionTreeClassifier(criterion='gini')
    models['lr'] = LogisticRegression(max_iter=10000)
    models['svc_l'] = SVC(kernel='linear')
    models['svc_r'] = SVC()
    models['svc_s'] = SVC(kernel='sigmoid')
    models['svc_p'] = SVC(kernel='poly')
    return models

In [48]:
def cross_evaluator(model, X, y, k_fold):
    scores = cross_val_score(model, X, y, cv=k_fold)
    return mean(scores)

# Evaluation

In [49]:
models = get_models() # Get the model list to be evaluated
results, names = list(), list()
print('The following result is the accuracy of different model in the dictionary of SML based on 6-fold cross validation')
for name, model in models.items():
    scores = cross_evaluator(model, X, y, 6)
    results.append(scores)
    names.append(name)
    print((name, mean(scores)))
# plot model performance for comparison


The following result is the accuracy of different model in the dictionary of SML based on 6-fold cross validation
('dt_ent', 0.9533333333333333)
('dt_gini', 0.96)
('lr', 0.9666666666666667)
('svc_l', 0.98)
('svc_r', 0.98)
('svc_s', 0.16666666666666666)
('svc_p', 0.96)


# Working example for classification:
1. create a dictionary of models using svc, logistic regression, GuassianNB and Decision tree with Entropy kernel. 
2. evaluate the dictionary of models using recall and validate the result in 5-fold cross validation. 
3. deploy the best model to classify the last sample in the original dataset

In [None]:
# Import Libraries
from pandas import read_csv
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from numpy import mean
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Model Defination
def get_models():
    models = dict()
    models['dt_ent'] = DecisionTreeClassifier(criterion='entropy')
    models['dt_gini'] = DecisionTreeClassifier(criterion='gini')
    models['lr'] = LogisticRegression(max_iter=10000)
    models['svc_l'] = SVC(kernel='linear')
    models['svc_r'] = SVC()
    models['svc_s'] = SVC(kernel='sigmoid')
    models['svc_p'] = SVC(kernel='poly')
    return models

In [None]:
# data preprocesser
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)  # testset is 10%


def data_prep(X_train, X_test, y_train):

    # fit and apply the transform
    X_train_under, y_train_under = undersample.fit_resample(X_train, y_train)
    scalar = MinMaxScaler()
    N = scalar.fit_transform(X_train_under)
    X_strain = DataFrame(N)
    header = X_train.columns  # header from the original dataset
    X_strain.columns = header
    #############################
    N = scalar.fit_transform(X_test)
    X_stest = DataFrame(N)
    header = X.columns  # header from the original dataset
    X_stest.columns = header
    # undersampler

    return X_strain, X_stest, y_train_under
