In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [7]:
import pandas as pd
file_name = "https://docs.google.com/spreadsheets/d/1vNECWT2ihqOFL8JPb5d6vZvBpzvJrm1kaZ9VHmzMHKw"+ '/export?gid=0&format=xlsx'
df = pd.read_excel(file_name,index_col=0, sheet_name="Sheet1",engine='openpyxl')
print(df)

             color    type    origin stolen?
example no                                  
1              red  sports  domestic     yes
2              red  sports  domestic      no
3              red  sports  domestic     yes
4           yellow  sports  domestic      no
5           yellow  sports  imported     yes
6           yellow     suv  imported      no
7           yellow     suv  imported     yes
8           yellow     suv  domestic      no
9              red     suv  imported      no
10             red  sports  imported     yes


In [8]:
X, y=df.drop(["stolen?"],axis=1),df["stolen?"]
print(X.shape)
print(y)

(10, 3)
example no
1     yes
2      no
3     yes
4      no
5     yes
6      no
7     yes
8      no
9      no
10    yes
Name: stolen?, dtype: object


In [9]:
def accuracy_score(y_true, y_pred):
	return round(float(sum(y_pred == y_true))/float(len(y_true)) * 100 ,2)

In [10]:
class  NaiveBayes:

	"""
		Bayes Theorem:
										Likelihood * Class prior probability
				Posterior Probability = -------------------------------------
											Predictor prior probability

							  			 P(x|c) * p(c)
							   P(c|x) = ------------------
											  P(x)
	"""

	def __init__(self):

		"""
			Attributes:
				likelihoods: Likelihood of each feature per class
				class_priors: Prior probabilities of classes
				pred_priors: Prior probabilities of features
				features: All features of dataset
		"""
		self.features = list
		self.likelihoods = {}
		self.class_priors = {}
		self.pred_priors = {}

		self.X_train = np.array
		self.y_train = np.array
		self.train_size = int
		self.num_feats = int

	def fit(self, X, y):

		self.features = list(X.columns)
		self.X_train = X
		self.y_train = y
		self.train_size = X.shape[0]
		self.num_feats = X.shape[1]

		for feature in self.features:
			self.likelihoods[feature] = {}
			self.pred_priors[feature] = {}

			for feat_val in np.unique(self.X_train[feature]):
				self.pred_priors[feature].update({feat_val: 0})

				for outcome in np.unique(self.y_train):
					self.likelihoods[feature].update({feat_val+'_'+outcome:0})
					self.class_priors.update({outcome: 0})

		self._calc_class_prior()
		self._calc_likelihoods()
		self._calc_predictor_prior()

	def _calc_class_prior(self):

		""" P(c) - Prior Class Probability """

		for outcome in np.unique(self.y_train):
			outcome_count = sum(self.y_train == outcome)
			self.class_priors[outcome] = outcome_count / self.train_size

	def _calc_likelihoods(self):

		""" P(x|c) - Likelihood """

		for feature in self.features:

			for outcome in np.unique(self.y_train):
				outcome_count = sum(self.y_train == outcome)
				feat_likelihood = self.X_train[feature][self.y_train[self.y_train == outcome].index.values.tolist()].value_counts().to_dict()

				for feat_val, count in feat_likelihood.items():
					self.likelihoods[feature][feat_val + '_' + outcome] = count/outcome_count


	def _calc_predictor_prior(self):

		""" P(x) - Evidence """

		for feature in self.features:
			feat_vals = self.X_train[feature].value_counts().to_dict()

			for feat_val, count in feat_vals.items():
				self.pred_priors[feature][feat_val] = count/self.train_size


	def predict(self, X):

		""" Calculates Posterior probability P(c|x) """

		results = []
		X = np.array(X)

		for query in X:
			probs_outcome = {}
			for outcome in np.unique(self.y_train):
				prior = self.class_priors[outcome]
				likelihood = 1
				evidence = 1

				for feat, feat_val in zip(self.features, query):
					likelihood *= self.likelihoods[feat][feat_val + '_' + outcome]
					evidence *= self.pred_priors[feat][feat_val]

				# posterior = (likelihood * prior) / (evidence)
				posterior = (likelihood * prior)

				probs_outcome[outcome] = posterior

			result = max(probs_outcome, key = lambda x: probs_outcome[x])
			print(probs_outcome)
			results.append(result)

		return np.array(results)

In [12]:
import numpy as np

In [13]:
nb_clf = NaiveBayes()
nb_clf.fit(X, y)

print("Train Accuracy: {}".format(accuracy_score(y, nb_clf.predict(X))))
query = np.array([['red','sports','domestic']])
print("Query:- {} ---> {}".format(query, nb_clf.predict(query)))


{'no': 0.04800000000000001, 'yes': 0.096}
{'no': 0.04800000000000001, 'yes': 0.096}
{'no': 0.04800000000000001, 'yes': 0.096}
{'no': 0.072, 'yes': 0.06400000000000002}
{'no': 0.048, 'yes': 0.09600000000000002}
{'no': 0.072, 'yes': 0.024000000000000004}
{'no': 0.072, 'yes': 0.024000000000000004}
{'no': 0.108, 'yes': 0.016000000000000004}
{'no': 0.048, 'yes': 0.036}
{'no': 0.03200000000000001, 'yes': 0.144}
Train Accuracy: 80.0
{'no': 0.04800000000000001, 'yes': 0.096}
Query:- [['red' 'sports' 'domestic']] ---> ['yes']


In [14]:
nb_clf.pred_priors

{'color': {'red': 0.5, 'yellow': 0.5},
 'type': {'sports': 0.6, 'suv': 0.4},
 'origin': {'domestic': 0.5, 'imported': 0.5}}

In [15]:
nb_clf.class_priors

{'no': 0.5, 'yes': 0.5}

In [16]:
nb_clf.likelihoods

{'color': {'red_no': 0.4, 'red_yes': 0.6, 'yellow_no': 0.6, 'yellow_yes': 0.4},
 'type': {'sports_no': 0.4, 'sports_yes': 0.8, 'suv_no': 0.6, 'suv_yes': 0.2},
 'origin': {'domestic_no': 0.6,
  'domestic_yes': 0.4,
  'imported_no': 0.4,
  'imported_yes': 0.6}}

In [17]:
list(y.unique())

['yes', 'no']

In [18]:
df_new = df

In [20]:
df_new = df_new.replace(["red","sports","domestic","yes"], 1)
df_new = df_new.replace(["yellow","suv","imported","no"], 0)
df_new = df_new.replace(["suv","imported"], 2)
df_new

Unnamed: 0_level_0,color,type,origin,stolen?
example no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,1,1,1
2,1,1,1,0
3,1,1,1,1
4,0,1,1,0
5,0,1,0,1
6,0,0,0,0
7,0,0,0,1
8,0,0,1,0
9,1,0,0,0
10,1,1,0,1


In [21]:
X, y=df_new.drop(["stolen?"],axis=1),df_new["stolen?"]
print(X.shape)
print(y)

(10, 3)
example no
1     1
2     0
3     1
4     0
5     1
6     0
7     1
8     0
9     0
10    1
Name: stolen?, dtype: int64


In [22]:
import numpy as np

class NaiveBayesClassifier:
    def __init__(self):
        self.priors = None
        self.conditional_probs = None

    def fit(self, X, y):
        """
        Fits the Naive Bayes classifier to the training data.

        Args:
            X: A numpy array of shape (n_samples, n_features) representing the features.
            y: A numpy array of shape (n_samples,) representing the target labels.
        """

        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))

        # Calculate priors
        self.priors = np.zeros(n_classes)
        for i in range(n_classes):

            self.priors[i] = np.sum(y == i) / n_samples

        # Calculate conditional probabilities
        self.conditional_probs = np.zeros((n_classes, n_features))
        for i in range(n_classes):
            X_class = X[y == i]
            for j in range(n_features):
                print(np.mean(X_class),y,i)
                # self.conditional_probs[i, j] = np.mean(X_class[:, j])
                print("hello")

    def predict(self, X):
        """
        Predicts the class labels for new data.

        Args:
            X: A numpy array of shape (n_samples, n_features) representing the new data.

        Returns:
            A numpy array of shape (n_samples,) representing the predicted class labels.
        """

        n_samples, n_features = X.shape
        n_classes = len(self.priors)

        # Calculate posterior probabilities
        posteriors = np.zeros((n_samples, n_classes))
        for i in range(n_classes):
            for j in range(n_samples):
                posteriors[j, i] = np.log(self.priors[i]) + np.sum(
                    np.log(self.conditional_probs[i, :]) * X[j, :]
                )

        # Predict the class with the highest posterior probability
        return np.argmax(posteriors, axis=1)

In [26]:
clf = NaiveBayesClassifier()
clf.fit(X, y)

0.4666666666666667 example no
1     1
2     0
3     1
4     0
5     1
6     0
7     1
8     0
9     0
10    1
Name: stolen?, dtype: int64 0
hello
0.4666666666666667 example no
1     1
2     0
3     1
4     0
5     1
6     0
7     1
8     0
9     0
10    1
Name: stolen?, dtype: int64 0
hello
0.4666666666666667 example no
1     1
2     0
3     1
4     0
5     1
6     0
7     1
8     0
9     0
10    1
Name: stolen?, dtype: int64 0
hello
0.6 example no
1     1
2     0
3     1
4     0
5     1
6     0
7     1
8     0
9     0
10    1
Name: stolen?, dtype: int64 1
hello
0.6 example no
1     1
2     0
3     1
4     0
5     1
6     0
7     1
8     0
9     0
10    1
Name: stolen?, dtype: int64 1
hello
0.6 example no
1     1
2     0
3     1
4     0
5     1
6     0
7     1
8     0
9     0
10    1
Name: stolen?, dtype: int64 1
hello
