In [None]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np # linear algebra
import matplotlib.pyplot as plt # for data visualization purposes
import seaborn as sns # for statistical data visualization
import warnings

# Ignore every warning raised for the rest of the session.
# NOTE(review): this is a blanket filter, so library deprecation warnings are
# hidden as well — consider narrowing it to specific warning categories.
warnings.filterwarnings('ignore')

In [None]:
# Location of the input CSV. The value is an empty placeholder here and has to
# be pointed at the real dataset file before this cell can run.
data = ''

# header=0 (first row holds the column names) and sep=',' are spelled out
# explicitly even though they match what pandas does by default
df = pd.read_csv(filepath_or_buffer=data, sep=',', header=0)

# (number of rows, number of columns) of the loaded frame
df.shape

In [None]:
# preview the dataset: show its first rows (pandas' default for head() is five)
df.head()

In [None]:
# list the column labels of the dataset
df.columns

In [None]:
# view summary of dataset (column dtypes and non-null counts)
df.info()

In [None]:
# collect the names of all object-dtype ('O') columns; these are treated as
# the categorical variables of the dataset
categorical = []
for column_name in df.columns:
    if df[column_name].dtype == 'O':
        categorical.append(column_name)

n_categorical = len(categorical)
print('There are {} categorical variables\n'.format(n_categorical))
print('The categorical variables are :\n\n', categorical)

In [None]:
# show the first rows of the categorical columns only
df.loc[:, categorical].head()

In [None]:
# print the absolute frequency of every distinct value,
# one categorical column at a time
for column_name in categorical:
    value_frequencies = df[column_name].value_counts()
    print(value_frequencies)

In [None]:
# print the relative frequency of every distinct value: the counts divided by
# the total number of rows in the dataset
n_rows = np.float64(len(df))
for column_name in categorical:
    print(df[column_name].value_counts() / n_rows)

In [None]:
# distinct labels present in the 'classe' column (the target used later on)
df['classe'].unique()

In [None]:
# number of rows per label in the 'classe' column
df['classe'].value_counts()

In [None]:
# collect the names of all columns whose dtype is not object ('O');
# these are treated as the numerical variables of the dataset
numerical = []
for column_name in df.columns:
    if df[column_name].dtype != 'O':
        numerical.append(column_name)

n_numerical = len(numerical)
print('There are {} numerical variables\n'.format(n_numerical))
print('The numerical variables are :', numerical)

In [None]:
# show the first rows of the numerical columns only
df.loc[:, numerical].head()

In [None]:
from sklearn.model_selection import train_test_split

# target: the 'classe' column; features: every other column of the dataset
y = df['classe']
X = df.drop(columns=['classe'])

# hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.3)

# (rows, columns) of the training and of the test feature set
X_train.shape, X_test.shape

In [None]:
# (rows, columns) of the training features
X_train.shape

In [None]:
# first rows of the training features
X_train.head()

In [None]:
# (rows, columns) of the test features
X_test.shape

In [None]:
# first rows of the test features
X_test.head()

In [None]:
# feature column labels: every column of the dataset except the target 'classe'
# (used further down to re-label the scaled arrays)
cols = df.columns.drop(labels='classe')

In [None]:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()

# Fit the scaler on the training split only and re-use the fitted scaler for
# the test split, so no information from the test rows leaks into the scaling.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Rebuild DataFrames from the scaled arrays.
# - `cols` is passed as-is: wrapping it in a list (`[cols]`) makes pandas build
#   MultiIndex columns, so plain look-ups such as X_train['name'] stop
#   behaving like ordinary single-level column access.
# - The original row index is kept so the features stay aligned with
#   y_train / y_test, which still carry the index produced by the split.
X_train = pd.DataFrame(X_train_scaled, columns=cols, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=cols, index=X_test.index)
X_train.head()

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes classifier with default settings, trained on the
# training split; the fitted estimator is the cell's displayed output
gnb = GaussianNB()
gnb.fit(X=X_train, y=y_train)


In [None]:
import pickle

# Serialize the fitted classifier to disk. The file name is an empty
# placeholder here and has to be replaced by a real output path before this
# cell can run.
# NOTE(review): only `gnb` is written; the fitted `scaler` is not stored in
# this file — confirm whether it needs to be persisted too.
with open('', 'wb') as model_file:
    model_file.write(pickle.dumps(gnb))

In [None]:
# predicted class label for every row of the test features
y_pred = gnb.predict(X=X_test)
y_pred

In [None]:
from sklearn.metrics import accuracy_score

# share of test rows whose predicted label equals the true label
test_accuracy = accuracy_score(y_test, y_pred)
print('Model accuracy score: {0:0.4f}'.format(test_accuracy))

In [None]:
# predicted class label for every row of the training features
y_pred_train = gnb.predict(X=X_train)
y_pred_train

In [None]:
from sklearn.metrics import accuracy_score

# same metric as for the test split, this time measured on the training rows
train_accuracy = accuracy_score(y_train, y_pred_train)
print('Model accuracy score: {0:0.4f}'.format(train_accuracy))

In [None]:
# Compare the score on the training split with the score on the test split;
# a large gap between the two hints at over- or underfitting.
train_score = gnb.score(X_train, y_train)
test_score = gnb.score(X_test, y_test)

print('Training set score: {:.4f}'.format(train_score))

print('Test set score: {:.4f}'.format(test_score))

In [None]:
# check class distribution in test set (number of test rows per label)
y_test.value_counts()

In [None]:
# Null accuracy: the accuracy obtained by always predicting the most frequent
# class of the test set. It is computed from y_test itself rather than from
# hand-copied counts, so it stays correct when the data or the split changes
# and covers every class that occurs in the test set.
test_class_counts = y_test.value_counts()
null_accuracy = test_class_counts.max() / len(y_test)
print('Null accuracy score: {0:0.4f}'. format(null_accuracy))

In [None]:
# Print the confusion matrix and the per-class TP / TN / FP / FN counts

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix\n\n', cm)

# scikit-learn puts the actual classes on the rows and the predicted classes
# on the columns. With more than two classes the matrix cannot be cut into
# four single cells, so each class is treated one-vs-rest:
#   TP = diagonal cell, FN = rest of its row, FP = rest of its column,
#   TN = everything outside that row and column.
# Each array below holds one value per class, in the matrix's label order.
tp_per_class = np.diag(cm)
fn_per_class = cm.sum(axis=1) - tp_per_class
fp_per_class = cm.sum(axis=0) - tp_per_class
tn_per_class = cm.sum() - tp_per_class - fn_per_class - fp_per_class

print('\nTrue Positives(TP) per class = ', tp_per_class)
print('\nTrue Negatives(TN) per class = ', tn_per_class)
print('\nFalse Positives(FP) per class = ', fp_per_class)
print('\nFalse Negatives(FN) per class = ', fn_per_class)

In [None]:
# visualize confusion matrix with seaborn heatmap

# confusion_matrix (called without `labels`) orders its rows and columns by the
# sorted set of labels that occur in y_test or y_pred, so the same ordering is
# rebuilt here instead of hard-coding class names.
class_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))

# Rows of `cm` are the actual classes and columns are the predicted classes.
cm_matrix = pd.DataFrame(data=cm,
                         index=['Actual: {}'.format(label) for label in class_labels],
                         columns=['Predicted: {}'.format(label) for label in class_labels])

sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')

In [None]:
from sklearn.metrics import classification_report

# text report of the per-class metrics for the test-set predictions
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
# One-vs-rest counts for the FIRST class of the confusion matrix (row/column 0),
# which is treated as the "positive" class; all other classes together form
# the "negative" class. scikit-learn puts actual classes on the rows and
# predicted classes on the columns, hence:
TP = cm[0, 0]            # actual positive, predicted positive
FN = cm[0, 1:].sum()     # actual positive, predicted as any other class
FP = cm[1:, 0].sum()     # actual other class, predicted positive
TN = cm[1:, 1:].sum()    # actual other class, predicted as some other class

In [None]:
# classification accuracy: share of (TP + TN) among all four counts
all_outcomes = float(TP + TN + FP + FN)
classification_accuracy = (TP + TN) / all_outcomes

print('Classification accuracy : {0:0.4f}'.format(classification_accuracy))

In [None]:
# classification error: share of (FP + FN) among all four counts
all_outcomes = float(TP + TN + FP + FN)
classification_error = (FP + FN) / all_outcomes

print('Classification error : {0:0.4f}'.format(classification_error))

In [None]:
# precision: TP divided by everything counted as positive prediction (TP + FP)
positive_predictions = float(TP + FP)
precision = TP / positive_predictions

print('Precision : {0:0.4f}'.format(precision))

In [None]:
# recall (sensitivity): TP divided by (TP + FN)
actual_positives = float(TP + FN)
recall = TP / actual_positives

print('Recall or Sensitivity : {0:0.4f}'.format(recall))

In [None]:
# True Positive Rate: the same ratio as recall, TP / (TP + FN)
tpr_denominator = float(TP + FN)
true_positive_rate = TP / tpr_denominator

print('True Positive Rate : {0:0.4f}'.format(true_positive_rate))

In [None]:
# False Positive Rate: FP divided by (FP + TN)
fpr_denominator = float(FP + TN)
false_positive_rate = FP / fpr_denominator

print('False Positive Rate : {0:0.4f}'.format(false_positive_rate))

In [None]:
# Specificity: TN divided by (TN + FP)
specificity_denominator = TN + FP
specificity = TN / specificity_denominator
print('Specificity : {0:0.4f}'.format(specificity))

In [None]:
# Predicted class probabilities for the first 10 test rows. predict_proba
# returns one column per class, ordered like gnb.classes_.
all_test_probabilities = gnb.predict_proba(X_test)
y_pred_prob = all_test_probabilities[:10]

y_pred_prob

In [None]:
# store the probabilities in dataframe

# The columns returned by predict_proba follow the order of gnb.classes_, so
# the column names are derived from it. Hard-coded names would silently
# mislabel the probabilities whenever their order differs from the fitted
# class order, and would break if the number of classes changed.
probability_columns = ['Prob of - {}'.format(label) for label in gnb.classes_]
y_pred_prob_df = pd.DataFrame(data=y_pred_prob, columns=probability_columns)
y_pred_prob_df

In [None]:
 # first 10 predicted probabilities of the class in column 1 of predict_proba,
# i.e. the second entry of gnb.classes_

gnb.predict_proba(X_test)[:10, 1]

In [None]:
# keep, for every test row, the predicted probability of the class in
# column 1 of predict_proba (the second entry of gnb.classes_)
class1_column = 1
y_pred1 = gnb.predict_proba(X_test)[:, class1_column]

In [None]:
# plot histogram of predicted probabilities

# y_pred1 holds column 1 of predict_proba, and predict_proba columns follow
# gnb.classes_, so the plotted class is gnb.classes_[1]. Taking the name from
# the model keeps the title and the x-axis label in agreement with each other
# and with the data that is actually plotted.
plotted_class = gnb.classes_[1]

# adjust the font size
plt.rcParams['font.size'] = 12

# plot histogram with 10 bins
plt.hist(y_pred1, bins = 10)

# set the title of predicted probabilities
plt.title('Histogram of predicted probabilities of {}'.format(plotted_class))

# probabilities live in [0, 1]; fix the x-axis to that range
plt.xlim(0,1)

# set the axis labels
plt.xlabel('Predicted probabilities of {}'.format(plotted_class))
plt.ylabel('Frequency')