# Working With Scikit Learn

In [25]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

# Load Data

In [26]:
# Read the diabetes dataset from the CSV file into a DataFrame.
data_frame = pd.read_csv('data_sets/data.csv')

In [27]:
# Preview the first five rows to sanity-check the columns that were loaded.
data_frame.head(n=5)

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,skin,diabetes
0,6,148,72,35,0,33.6,0.627,50,1.379,True
1,1,85,66,29,0,26.6,0.351,31,1.1426,False
2,8,183,64,0,0,23.3,0.672,32,0.0,True
3,1,89,66,23,94,28.1,0.167,21,0.9062,False
4,0,137,40,35,168,43.1,2.288,33,1.379,True


## Delete Correlated Feature
As we saw before, **skin** and **thickness** are correlated with each other. Keeping both features adds no new information for the model; the redundancy can only confuse it and hurt performance. So here we remove one of the two columns from our dataset.

In [28]:
# Remove the 'skin' column completely.
# drop(..., errors='ignore') returns a new frame and does nothing if the
# column is already gone, so this cell can be re-run safely; the previous
# `del data_frame['skin']` raised KeyError on a second execution.
data_frame = data_frame.drop(columns=['skin'], errors='ignore')
print(data_frame.shape)
data_frame.head()

(768, 9)


Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,True
1,1,85,66,29,0,26.6,0.351,31,False
2,8,183,64,0,0,23.3,0.672,32,True
3,1,89,66,23,94,28.1,0.167,21,False
4,0,137,40,35,168,43.1,2.288,33,True


## Molding Dataset


In [29]:
# Lookup table that encodes the boolean target as integers.
map_diabetes = {True: 1, False: 0}

# Translate every value of the 'diabetes' column through the lookup table,
# then store the encoded column back on the frame.
diabetes_encoded = data_frame['diabetes'].map(map_diabetes)
data_frame['diabetes'] = diabetes_encoded

data_frame.head()

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Split Train and Test data
In this step we split our dataset into two segments. Here we use the common split of 70% of the data for training our model and
30% of the data for testing it.

Here we use **random_state=42** to guarantee that the split is reproducible: every time we split the dataset, the same rows end up in the training set and in the test set.

In [30]:
from sklearn.model_selection import train_test_split

# Columns used as model inputs, and the column the model should predict.
feature_column_names = [
    'num_preg', 'glucose_conc', 'diastolic_bp', 'thickness',
    'insulin', 'bmi', 'diab_pred', 'age',
]
predicted_class_name = ['diabetes']

# Pull the underlying NumPy arrays out of the frame. Because the target is
# selected with a one-element list, y is a 2-D single-column array.
x = data_frame[feature_column_names].values
y = data_frame[predicted_class_name].values

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible from run to run.
split_test_size = 0.30
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=split_test_size,
    random_state=42,
)

# Print The Result

In [31]:
# Share of all rows that landed in each split, as a percentage.
total_rows = len(data_frame.index)
train_percent = (len(X_train) / total_rows) * 100
test_percent = (len(X_test) / total_rows) * 100

print("{0:0.2f}% in training set".format(train_percent))
print("{0:0.2f}% in test set".format(test_percent))

69.92% in training set
30.08% in test set


# Check Whether Our Dataset Contains 0, Null, or Missing Values

In [32]:
# Count, per column, the rows whose reading is exactly 0. Such readings are
# reported as "missing" below. One loop replaces seven copy-pasted print
# statements; the printed text is unchanged.
columns_to_check = [
    'glucose_conc', 'diastolic_bp', 'thickness', 'insulin',
    'bmi', 'diab_pred', 'age',
]

print("# rows in dataframe {0}".format(len(data_frame)))
for column_name in columns_to_check:
    zero_rows = data_frame.loc[data_frame[column_name] == 0]
    print("# rows missing {0}: {1}".format(column_name, len(zero_rows)))

# rows in dataframe 768
# rows missing glucose_conc: 5
# rows missing diastolic_bp: 35
# rows missing thickness: 227
# rows missing insulin: 374
# rows missing bmi: 11
# rows missing diab_pred: 0
# rows missing age: 0


# Replace 0 or Null Values with the Column Mean

In [19]:
from sklearn.impute import SimpleImputer, KNNImputer

# Alternative (not used here): a nearest-neighbour imputer, e.g.
# KNNImputer(n_neighbors=2, weights="uniform").

# Treat every 0 reading as missing and replace it with the column mean.
# NOTE(review): missing_values=0 applies to every feature column, including
# num_preg, where 0 may be a legitimate value -- confirm this is intended.
fill_0 = SimpleImputer(missing_values=0, strategy="mean")

# Learn the column means from the training set only, then reuse those same
# means to fill the test set. Calling fit_transform on X_test would re-fit
# the imputer on test data: the two sets would be filled with different
# values and test-set statistics would leak into the preprocessing.
X_train = fill_0.fit_transform(X_train)
X_test = fill_0.transform(X_test)

# Training model Using Gaussian Naive Bayes

In [33]:
from sklearn.naive_bayes import GaussianNB

# Build the Gaussian Naive Bayes classifier with its default settings.
nb_model = GaussianNB()

# Fit on the training split; ravel() flattens the label array to 1-D first.
training_labels = y_train.ravel()
nb_model.fit(X_train, training_labels)

GaussianNB(priors=None, var_smoothing=1e-09)

# Predicting Trained Result

In [34]:
# Predict a label for every training row; the result is an array of
# predictions, one per row of X_train.
prediction_from_trained_data = nb_model.predict(X=X_train)

# Performance on Training Data

In [35]:
from sklearn import metrics

# Compare the true training labels against the predictions made on the
# training data and report the resulting accuracy score.
accuracy = metrics.accuracy_score(
    y_true=y_train,
    y_pred=prediction_from_trained_data,
)
print("Accuracy of our naive bayes model on training data is : {0:.4f}".format(accuracy))

Accuracy of our naive bayes model on training data is : 0.7672


# Performance on Testing Data

In [36]:
# Predict a label for every held-out test row.
prediction_from_test_data = nb_model.predict(X=X_test)

# Compare the true test labels against those predictions and report the
# resulting accuracy score.
accuracy = metrics.accuracy_score(
    y_true=y_test,
    y_pred=prediction_from_test_data,
)
print("Accuracy of our naive bayes model on testing data is : {0:.4f}".format(accuracy))

Accuracy of our naive bayes model on testing data is : 0.7446


# Confusion Matrix

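With `labels=[1, 0]`, rows are the actual classes and columns are the predicted classes (Predicted True first, then Predicted False):

Actual True     row -> 0    TP     FN

Actual False    row -> 1    FP     TN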

TP = কতগুলা ঘটনা ঘটেছে এবং  ঘটেছে হিসেবে ডিটেক্ট করেছে

FP = কতগুলা ঘটনা ঘটে নাই কিন্তু ঘটেছে হিসেবে ডিটেক্ট করেছে

FN = কতগুলা ঘটনা ঘটেছে কিন্তু ঘটে নাই হিসেবে ডিটেক্ট করেছে

TN = কতগুলা ঘটনা ঘটে নাই এবং ডিটেক্ট ও করে নাই

In [42]:
from sklearn import metrics

print("Confusion Matrix")
# labels=[1, 0] puts class 1 first and class 0 second in the matrix.
confusion_matrix = metrics.confusion_matrix(
    y_true=y_test,
    y_pred=prediction_from_test_data,
    labels=[1, 0],
)
print(confusion_matrix)

Confusion Matrix
[[ 53  27]
 [ 32 119]]


# Classification Report

* Precision =TP/(TP + FP), The Higher The Better
* Recall = TP/(TP + FN), The Higher The Better

In [43]:
from sklearn import metrics

print("Classification Report")
# Build the per-class text report for the test predictions, listing
# class 1 before class 0.
classification_report = metrics.classification_report(
    y_true=y_test,
    y_pred=prediction_from_test_data,
    labels=[1, 0],
)
print(classification_report)

Classification Report
              precision    recall  f1-score   support

           1       0.62      0.66      0.64        80
           0       0.82      0.79      0.80       151

    accuracy                           0.74       231
   macro avg       0.72      0.73      0.72       231
weighted avg       0.75      0.74      0.75       231



# How Can We Improve Our Accuracy
* By adjusting or modifying the algorithm.
* By collecting more data, then retraining.
* By improving the training procedure.
* By changing the algorithm.