# Naive Bayes 1

### Import libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

## Example #1

### Load data

In [2]:
# Read the diabetes CSV (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer="data/diabetes.csv")

### Examine data

In [3]:
# Dimensions of the loaded frame as a (rows, columns) tuple
df.shape

(995, 3)

In [4]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,glucose,bloodpressure,diabetes
0,40,85,0
1,40,92,0
2,45,63,1
3,45,80,0
4,40,73,1


In [5]:
# Number of missing values in each column
df.isna().sum()

glucose          0
bloodpressure    0
diabetes         0
dtype: int64

### Prepare data for model training

In [6]:
# Target vector: the "diabetes" label column
y = df["diabetes"]
# Feature matrix: every remaining column
X = df.drop(columns=["diabetes"])

In [7]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### Train Gaussian Naive Bayes classifier

In [8]:
# Create a Gaussian Naive Bayes classifier with default settings
model = GaussianNB()
# Fit it on the training split only
model.fit(X=X_train, y=y_train)

In [9]:
# Predicted class labels for the held-out rows.
# The evaluation cell below calls model.score directly rather than using this array.
predict = model.predict(X=X_test)

### Evaluate Gaussian Naive Bayes model performance

In [10]:
# Mean accuracy of the fitted model on each split
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)

print('Accuracy on training set: {:.4f}'.format(train_accuracy))
print('Accuracy on test set: {:.4f}'.format(test_accuracy))

Accuracy on training set: 0.9339
Accuracy on test set: 0.9331


## Example #2

### Load data

In [11]:
# Read the breast cancer diagnosis CSV; this rebinds `df`, replacing the diabetes frame
df = pd.read_csv(filepath_or_buffer="data/breast_cancer_diagnosis.csv")

### Examine data

In [12]:
# Dimensions of the loaded frame as a (rows, columns) tuple
df.shape

(569, 13)

In [13]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,id,name,radius,texture,perimeter,area,smoothness,compactness,concavity,symmetry,fractal_dimension,age,diagnosis
0,ID842302,Glynnis Munson,,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.2419,0.07871,35,1
1,ID842517,Lana Behrer,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.1812,0.05667,27,1
2,ID84300903,Devondra Vanvalkenburgh,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.2069,0.05999,31,1
3,ID84348301,Glory Maravalle,,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.2597,0.09744,49,1
4,ID84358402,Mellie Mccurdy,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1809,0.05883,20,1


In [14]:
# Number of missing values in each column
df.isna().sum()

id                    0
name                  0
radius               71
texture               0
perimeter             0
area                  0
smoothness            0
compactness           0
concavity             0
symmetry              0
fractal_dimension     0
age                   0
diagnosis             0
dtype: int64

In [15]:
# Replace missing values (NaN) in "radius" with the average of the column.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selection: that form is chained assignment,
# which warns on recent pandas and, under Copy-on-Write, modifies only a
# temporary object and leaves df unchanged.
# NOTE: the mean is computed over every row, i.e. before the train/test split
# performed later, so test rows contribute to the imputed value.
df['radius'] = df['radius'].fillna(df['radius'].mean())

In [16]:
# Re-count missing values per column to confirm the imputation left none
df.isna().sum()

id                   0
name                 0
radius               0
texture              0
perimeter            0
area                 0
smoothness           0
compactness          0
concavity            0
symmetry             0
fractal_dimension    0
age                  0
diagnosis            0
dtype: int64

### Prepare data for model training

In [17]:
# Drop unnecessary (non-numeric) identifier columns.
# Reassigning with errors="ignore" rather than using inplace=True keeps this
# cell re-runnable: executing it a second time no longer raises KeyError
# because "id" and "name" have already been removed.
df = df.drop(columns=["id", "name"], errors="ignore")

# Separate dependent and independent variables
X = df.drop(columns=["diagnosis"])
y = df["diagnosis"]

In [18]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### Train Gaussian Naive Bayes classifier

In [19]:
# Instantiate a fresh classifier for this dataset so that Example #2 runs on
# its own and does not depend on the `model` object left over from Example #1.
model = GaussianNB()
model.fit(X_train, y_train)

In [20]:
# Predicted class labels for the held-out rows.
# The evaluation cell below calls model.score directly rather than using this array.
predict = model.predict(X=X_test)

### Evaluate Gaussian Naive Bayes model performance

In [21]:
# Mean accuracy of the fitted model on each split
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)

print('Accuracy on training set: {:.4f}'.format(train_accuracy))
print('Accuracy on test set: {:.4f}'.format(test_accuracy))

Accuracy on training set: 0.9045
Accuracy on test set: 0.9357
