In [7]:
#Importing packages
from sklearn import datasets
import pandas as pd
import statsmodels.api as sm

# Import Gaussian Naive Bayes Model
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
import warnings
warnings.filterwarnings("ignore")
import numpy as np

In [8]:
# IRIS Dataset
iriss = datasets.load_iris()
iris = pd.DataFrame(iriss.data)
iris.columns = iriss.feature_names
iris['species'] = iriss.target
iris.columns = iris.columns.str.replace(' ', '')
iris.columns = iris.columns.str.replace('(','')
iris.columns = iris.columns.str.replace(')','')
iris.head()

Unnamed: 0,sepallengthcm,sepalwidthcm,petallengthcm,petalwidthcm,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [9]:
# Target Column Distribution
iris['species'].value_counts()

species
0    50
1    50
2    50
Name: count, dtype: int64

from the above output, we see that data is balanced

In [10]:
# Distribution ( mean) of Independent Columns respect to Dependent Column
iris.groupby('species').mean().round(2)

Unnamed: 0_level_0,sepallengthcm,sepalwidthcm,petallengthcm,petalwidthcm
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5.01,3.43,1.46,0.25
1,5.94,2.77,4.26,1.33
2,6.59,2.97,5.55,2.03


In [11]:
iris.describe()

Unnamed: 0,sepallengthcm,sepalwidthcm,petallengthcm,petalwidthcm,species
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


from the above output, we can find if there are any errors in input.

In [12]:
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   sepallengthcm  150 non-null    float64
 1   sepalwidthcm   150 non-null    float64
 2   petallengthcm  150 non-null    float64
 3   petalwidthcm   150 non-null    float64
 4   species        150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB


from the above ouput, we can draw the information about the data

In [13]:
# Independent variables
Independent_Variable_Base_Set = iris[iris.columns[0:4]]
Independent_Variable_Base_Set.head()

Unnamed: 0,sepallengthcm,sepalwidthcm,petallengthcm,petalwidthcm
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [14]:
# Dependent variable
Dependent_Variable = iris[iris.columns[-1:iris.columns.size]]
Dependent_Variable.head()

Unnamed: 0,species
0,0
1,0
2,0
3,0
4,0


In [15]:
# Split the Dataset

# Model 1
# Lets start with selecting one variable
Independent_Variable_Set_v1 = iris[iris.columns[0:1]]
X_train, X_test, y_train, y_test = train_test_split(Independent_Variable_Set_v1, Dependent_Variable, test_size = 0.3, random_state = 21)

# Create a Gaussian Classifier
model = GaussianNB()

# Train the model using the training sets
result = model.fit(X_train, y_train)
dir(result)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__firstlineno__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__sklearn_tags__',
 '__static_attributes__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_build_request_for_signature',
 '_check_X',
 '_check_feature_names',
 '_check_n_features',
 '_doc_link_module',
 '_doc_link_template',
 '_doc_link_url_param_generator',
 '_estimator_type',
 '_get_default_requests',
 '_get_doc_link',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_joint_log_likelihood',
 '_more_tags',
 '_parameter_constraints',
 '_partial_fit',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_update

In [22]:
# Model Prediction
print("Sample Prediction of Model 1")
pred = result.predict(X_test)
model_prediction = pd.DataFrame(pred.round(2), columns = ['Prediction'])
model_prediction['Prediction'].head()

Sample Prediction of Model 1


0    1
1    0
2    0
3    0
4    1
Name: Prediction, dtype: int64

In [21]:
# Test Set Target Column Distribution
print("\nTest Set Distribution")
y_test['species'].value_counts()


Test Set Distribution


species
1    16
2    15
0    14
Name: count, dtype: int64

In [23]:
# Predicted Set Target Column Distribution
print("\nPredicted Set Distribution")
model_prediction['Prediction'].value_counts()


Predicted Set Distribution


Prediction
2    17
1    14
0    14
Name: count, dtype: int64

Inference: We cannot infer anything with a single column (as there is difference between test set and predicted set), hence, we will use all the columns

In [27]:
# Model Performance

y_pred = model_prediction[['Prediction']]
model_1_accuracy = accuracy_score(y_test, y_pred)
print("Model 1 Performance")
print("\nModel 1, Accuracy :",model_1_accuracy)
model_1_precision = precision_score(y_test, y_pred, average='micro')
print("Model 1, Precision :",model_1_precision)
model_1_recall = recall_score(y_test, y_pred, average='micro')
print("Model 1, Recall :",model_1_recall)
model_1_fscore = f1_score(y_test, y_pred, average='micro')
print("Model 1, F1 Score :",model_1_fscore)
print("\nConfusion Matrix, Model 1")
model_1_cm = confusion_matrix(y_test,y_pred)
print(model_1_cm)
print("\nClassification Report, Model 1")

Model 1 Performance

Model 1, Accuracy : 0.9555555555555556
Model 1, Precision : 0.9555555555555556
Model 1, Recall : 0.9555555555555556
Model 1, F1 Score : 0.9555555555555556

Confusion Matrix, Model 1
[[14  0  0]
 [ 0 14  2]
 [ 0  0 15]]

Classification Report, Model 1


In [28]:
# Model 2
# Lets build model with all variables

X_train, X_test, y_train, y_test = train_test_split(Independent_Variable_Base_Set, Dependent_Variable, test_size = 0.3, random_state = 21)

# Train the model using the training sets
result = model.fit(X_train, y_train)

# Model Prediction
print("Sample Prediction of Model 2")
pred = result.predict(X_test)
model_prediction = pd.DataFrame(pred.round(2), columns = ['Prediction'])
print(model_prediction['Prediction'].head())

# Test Set Target Column Distribution
print("\nTest Set Distribution")
print(y_test['species'].value_counts())

# Model Performance
y_pred = model_prediction[['Prediction']]
model_1_accuracy = accuracy_score(y_test, y_pred)
print("Model 1 Performance")
print("\nModel 1, Accuracy :",model_1_accuracy)
model_1_precision = precision_score(y_test, y_pred, average='micro')
print("Model 1, Precision :",model_1_precision)
model_1_recall = recall_score(y_test, y_pred, average='micro')
print("Model 1, Recall :",model_1_recall)
model_1_fscore = f1_score(y_test, y_pred, average='micro')
print("Model 1, F1 Score :",model_1_fscore)
print("\nConfusion Matrix, Model 1")
model_1_cm = confusion_matrix(y_test,y_pred)
print(model_1_cm)
print("\nClassification Report, Model 1")
model_1_cr = classification_report(y_test,y_pred)
print(model_1_cr)

print("Inference : Good Fit, can we make it better ?")

Sample Prediction of Model 2
0    1
1    0
2    0
3    0
4    1
Name: Prediction, dtype: int64

Test Set Distribution
species
1    16
2    15
0    14
Name: count, dtype: int64
Model 1 Performance

Model 1, Accuracy : 0.9555555555555556
Model 1, Precision : 0.9555555555555556
Model 1, Recall : 0.9555555555555556
Model 1, F1 Score : 0.9555555555555556

Confusion Matrix, Model 1
[[14  0  0]
 [ 0 14  2]
 [ 0  0 15]]

Classification Report, Model 1
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       1.00      0.88      0.93        16
           2       0.88      1.00      0.94        15

    accuracy                           0.96        45
   macro avg       0.96      0.96      0.96        45
weighted avg       0.96      0.96      0.96        45

Inference : Good Fit, can we make it better ?
