# Importing important libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Loading data

In [None]:
# Path to the mushroom dataset CSV; it is handed to pd.read_csv in the next cell.
DIR="../input/mushroom-classification/mushrooms.csv"

In [None]:
# Load the CSV located at DIR into a pandas DataFrame.
df=pd.read_csv(DIR)

In [None]:
# Display the first 20 rows of the raw (not yet encoded) dataset.
df.head(20)

# Data Wrangling

In [None]:
# Build a boolean DataFrame with the same shape as df: True where a value is
# missing (NaN/None), False where a value is present.
missing_value=df.isnull()

In [None]:
# For every column, print its name followed by the tally of its missing-value
# mask: the count under False is the number of present values and the count
# under True (if any) is the number of missing values.
for column_name in missing_value.columns:
    print(column_name)
    print(missing_value[column_name].value_counts())

In [None]:
# Summary statistics for each column of the raw (not yet encoded) data.
df.describe()

In [None]:
# Show the (rows, columns) shape of the dataset; the original run reported
# 8124 rows and 23 columns.
df.shape

In [None]:
# Encode the category values of every column as integers so that numeric
# routines (correlation, NumPy comparisons) can work with them.
# The same LabelEncoder object is re-fitted on each column in turn, so every
# column gets its own independent integer codes and no fitted encoder is kept
# for mapping the codes back to the original values.
# The target column 'class' is encoded together with the features.
df1=df.apply(LabelEncoder().fit_transform)

In [None]:
# Display the first 10 rows of the integer-encoded data.
df1.head(10)

In [None]:
# describe() reports, for every encoded column, the count, mean, standard
# deviation, minimum, 25th/50th/75th percentiles and maximum.
df1.describe()

In [None]:
# corr() calculates the pairwise correlation of every encoded column with
# every other encoded column.
df1.corr()

In [None]:
# Plot the pairwise correlation matrix as a heat map to visualise how the
# encoded features relate to each other and to the target.
sns.heatmap(df1.corr())
plt.show()

In [None]:
# 'veil-type' is removed before modelling because it shows no useful
# correlation with any other feature, so it cannot help the classification.
# NOTE(review): presumably the column holds a single constant value -- confirm
# with df['veil-type'].nunique().
df2 = df1.drop(columns=['veil-type'])

In [None]:
# Shape of the dataset after dropping the 'veil-type' feature (one column
# fewer than df1).
df2.shape

In [None]:
# Switch seaborn to its 'darkgrid' style (a global setting).
sns.set_style('darkgrid')
# Scatter plot with a fitted regression line of the encoded target 'class'
# against the encoded 'cap-shape'. x and y are passed by keyword because
# seaborn >= 0.12 accepts only `data` positionally and raises TypeError for
# positional x/y.
sns.regplot(x=df2['cap-shape'],y=df2['class'])
plt.show()

In [None]:
# Pearson correlation coefficient and its p-value between the encoded
# 'cap-shape' feature and the target 'class'.
# The original run reported a coefficient of 0.0529, i.e. only a very weak
# positive linear relationship with 'class'.
# The p-value is the probability of observing a correlation at least this
# extreme if the two variables were in fact uncorrelated. A p-value < 0.001
# therefore means the (weak) correlation is statistically significant; it
# does NOT mean the correlation is strong.
scipy.stats.pearsonr(df2['cap-shape'],df2['class'])

In [None]:
# Regression plot of the encoded target 'class' against 'cap-surface'.
# x and y are passed by keyword because seaborn >= 0.12 accepts only `data`
# positionally and raises TypeError for positional x/y.
sns.regplot(x=df2['cap-surface'],y=df2['class'])

In [None]:
# The original run reported a Pearson coefficient of 0.178: positive but far
# from 1, so 'cap-surface' has only a weak linear relationship with 'class'.
# The p-value is < 0.001, which means this weak correlation is statistically
# significant (unlikely to be due to chance), not that it is strong.
scipy.stats.pearsonr(df2['cap-surface'],df2['class'])

In [None]:
# Regression plot of the encoded target 'class' against 'gill-spacing'.
# x and y are passed by keyword because seaborn >= 0.12 accepts only `data`
# positionally and raises TypeError for positional x/y.
sns.regplot(x=df2['gill-spacing'],y=df2['class'])

In [None]:
# The original run reported a Pearson coefficient of -0.348: negative and
# noticeably closer to -1 than the coefficients of the two features above, so
# 'gill-spacing' has a moderate negative linear relationship with 'class'.
# The p-value is < 0.0001, which means the correlation is statistically
# significant (very unlikely to be due to chance).
scipy.stats.pearsonr(df2['gill-spacing'],df2['class'])

# Data Preprocessing

In [None]:
# Divide the dataset into X (features) and y (target).
# The target column is dropped from df2 by name. Passing a DataFrame taken
# from df1 as the labels only worked because iterating a DataFrame yields its
# column names.
X=df2.drop(columns=['class'])
# Double brackets keep y as a one-column DataFrame; it is flattened to a 1-D
# array when the splits are converted to NumPy further down.
y=df2[['class']]

In [None]:
# Split the data into training and test sets: test_size=0.33 holds out 33% of
# the rows for testing, and the fixed random_state makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Report the (rows, columns) shapes of the training and test splits.
print ('size of X train data and y train data is {} and {} respectively.'.format(X_train.shape, y_train.shape))
print ('size of X test data and y test data is {} and {} respectively'.format(X_test.shape, y_test.shape))

In [None]:
# Convert the pandas train/test splits into plain NumPy arrays for the
# hand-written Naive Bayes code below.
X_Train, X_Test = np.array(X_train), np.array(X_test)
# The targets are one-column frames, so flatten them into 1-D label vectors.
y_Train = np.array(y_train).reshape(-1)
y_Test = np.array(y_test).reshape(-1)

## CUSTOM NAIVE BAYES IMPLEMENTATION

# PRIOR PROBABILITY


In [None]:
def prior_probability(y_t,labels):
    """Return the empirical prior probability of one class.

    The prior is the fraction of entries in ``y_t`` that are equal to
    ``labels``.

    Parameters
    ----------
    y_t : array-like
        Class labels of the training samples (NumPy array or plain sequence).
    labels : scalar
        The single class value whose prior is wanted.

    Returns
    -------
    float
        ``count(y_t == labels) / len(y_t)``.

    Raises
    ------
    ValueError
        If ``y_t`` is empty, because a prior cannot be estimated from no data.
    """
    y_t = np.asarray(y_t)
    n_samples = y_t.shape[0]
    if n_samples == 0:
        raise ValueError("cannot estimate a prior probability from an empty label array")
    # One vectorised comparison counts the matches; no per-sample loop is
    # needed because the count does not depend on the loop index.
    return (y_t == labels).sum() / n_samples


In [None]:
# Prior probability of the encoded class 0 in the training labels.
prior_probability(y_Train,0)

# Posterior Probability 

In [None]:
def prior_probability(y_Train,label):
    """Return the empirical prior probability of ``label``.

    The prior is the fraction of entries in ``y_Train`` equal to ``label``.

    Parameters
    ----------
    y_Train : array-like
        Class labels of the training samples (NumPy array or plain sequence).
    label : scalar
        The class value whose prior is wanted.

    Returns
    -------
    float
        ``count(y_Train == label) / len(y_Train)``.

    Raises
    ------
    ValueError
        If ``y_Train`` is empty, because a prior cannot be estimated from no
        data.
    """
    y_Train = np.asarray(y_Train)
    n_samples = y_Train.shape[0]
    if n_samples == 0:
        raise ValueError("cannot estimate a prior probability from an empty label array")
    # One vectorised comparison counts the matches; no per-sample loop is
    # needed because the count does not depend on the loop index.
    return (y_Train == label).sum() / n_samples



# Conditional probability of a single feature value given the class.
def cond_prob(X_Train,y_Train,feature_col,feature_val,label):
    """Estimate P(feature == feature_val | class == label) by counting.

    Parameters
    ----------
    X_Train : np.ndarray
        2-D training feature matrix (rows are samples, columns are features).
    y_Train : np.ndarray
        1-D array of class labels aligned with the rows of ``X_Train``.
    feature_col : int
        Column index of the feature being examined.
    feature_val : scalar
        Value of that feature whose likelihood is wanted.
    label : scalar
        Class the probability is conditioned on.

    Returns
    -------
    float
        Number of training rows of class ``label`` whose ``feature_col`` entry
        equals ``feature_val``, divided by the number of rows of that class.
    """
    belongs_to_class = y_Train == label
    # Restrict to the rows of the requested class, then test the one column.
    column_matches = X_Train[belongs_to_class][:, feature_col] == feature_val
    return np.sum(column_matches) / float(np.sum(belongs_to_class))

def prediction(X_Train,y_Train,X_Test):
    """Predict the class of one sample with a categorical Naive Bayes rule.

    For every distinct label in ``y_Train`` the unnormalised posterior
    ``P(label) * prod_k P(x_k | label)`` is estimated from relative
    frequencies in the training data, and the label with the largest
    posterior is returned.

    Parameters
    ----------
    X_Train : array-like
        2-D training feature matrix (rows are samples, columns are features).
    y_Train : array-like
        1-D array of class labels aligned with the rows of ``X_Train``.
    X_Test : array-like
        1-D feature vector of the single sample to classify; only its first
        ``X_Train.shape[1]`` entries are used.

    Returns
    -------
    scalar
        The winning class label itself (an element of ``np.unique(y_Train)``),
        not its position. For labels that are exactly 0..k-1 this equals the
        index that used to be returned. Ties go to the smallest label.

    Raises
    ------
    ValueError
        If ``y_Train`` contains no labels.
    """
    X_Train = np.asarray(X_Train)
    y_Train = np.asarray(y_Train)
    labels = np.unique(y_Train)
    if labels.size == 0:
        raise ValueError("cannot predict from an empty training set")

    n_features = X_Train.shape[1]
    sample = np.asarray(X_Test)[:n_features]
    n_samples = y_Train.shape[0]

    posteriors = []
    for label in labels:
        # Select the rows of this class once, instead of once per feature.
        in_class = y_Train == label
        class_rows = X_Train[in_class]
        class_count = float(np.sum(in_class))
        # Per-feature conditional probabilities P(x_k | label): the share of
        # class rows whose k-th feature equals the sample's k-th feature.
        conditionals = np.sum(class_rows == sample, axis=0) / class_count
        likelihood = np.prod(conditionals)
        prior = class_count / n_samples
        posteriors.append(likelihood * prior)

    # Map the best position back to the actual label value.
    return labels[np.argmax(posteriors)]

# Accuracy of the classifier on a labelled test set.
def score(X_Train,y_Train,X_Test,y_Test):
    """Return the fraction of test samples that are classified correctly.

    Each row of ``X_Test`` is classified with ``prediction`` using the given
    training data, and the predictions are compared element-wise with
    ``y_Test``.

    Parameters
    ----------
    X_Train, y_Train : np.ndarray
        Training features and labels forwarded to ``prediction``.
    X_Test : np.ndarray
        2-D matrix of samples to classify, one per row.
    y_Test : np.ndarray
        True labels for the rows of ``X_Test``.

    Returns
    -------
    float
        Number of matching predictions divided by ``y_Test.shape[0]``.
    """
    predicted = np.array(
        [prediction(X_Train, y_Train, X_Test[row]) for row in range(X_Test.shape[0])]
    )
    n_correct = np.sum(predicted == y_Test)
    return n_correct / y_Test.shape[0]
    
    

In [None]:
# Spot check: print the predicted class for the test sample at index 4,
# followed by its true label.
print (prediction(X_Train,y_Train,X_Test[4]))
print (y_Test[4])

In [None]:
# Print the fraction of test samples classified correctly; the original run
# reported about 0.99 (99 percent accuracy).
print (score(X_Train,y_Train,X_Test,y_Test))