In [None]:
from google.colab import drive

# Mount Google Drive; the message file read later lives under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


Importing Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.datasets import load_digits
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import string
from bs4 import BeautifulSoup
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Fetch the NLTK resources used by the preprocessing step. 'punkt_tab' is
# requested in addition to 'punkt' because recent NLTK releases load the
# tokenizer data from that package when word_tokenize is called.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# English stop words as a set for fast membership tests, and the stemmer
# shared by every call to the preprocessing function.
stop_words = set(stopwords.words('english'))
porter = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Reading file containing Spam/Ham messages

In [None]:
# Tab-separated file without a header row: label first, message text second.
SMS_PATH = "/content/drive/My Drive/ColabFiles/message.txt"
df = pd.read_csv(
    SMS_PATH,
    sep="\t",
    header=None,
    names=['Label', 'SMS'],
)
print(df.head())

  Label                                                SMS
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


Preprocessing to clean text data

In [None]:
def preprocess(s):
    """Clean one raw message and return it as a space-separated string of stems.

    Steps: strip any HTML markup, lower-case the text, tokenize it, keep only
    purely alphabetic tokens that are not English stop words, and reduce each
    surviving token to its Porter stem.

    Relies on the module-level ``stop_words`` set and ``porter`` stemmer.

    Parameters
    ----------
    s : str
        Raw message text.

    Returns
    -------
    str
        The kept stems joined by single spaces (empty if nothing survives).
    """
    text = BeautifulSoup(s, "lxml").get_text().lower()
    stems = []
    for token in word_tokenize(text):
        # isalpha() already rejects punctuation and digits, so no separate
        # punctuation test is needed.
        if token.isalpha() and token not in stop_words:
            stems.append(porter.stem(token))
    return ' '.join(stems)
# Build a cleaned copy of the frame so the raw messages in df stay untouched.
df1 = df.assign(SMS=df['SMS'].apply(preprocess))
print(df1['SMS'].head(5))

  soup = BeautifulSoup(s, "lxml")


0    go jurong point crazi avail bugi n great world...
1                                ok lar joke wif u oni
2    free entri wkli comp win fa cup final tkt may ...
3                  u dun say earli hor u c alreadi say
4                 nah think goe usf live around though
Name: SMS, dtype: object


CountVectorizer for calculating bag of words

In [None]:
# Learn the vocabulary from the cleaned messages and count term occurrences.
vectorizer = CountVectorizer()
term_counts = vectorizer.fit_transform(df1['SMS'])
X = term_counts.toarray()  # dense document-term matrix
print(X)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


Using Multinomial Naive Bayes to predict the type of each message

In [None]:
# The label column of the raw frame is the prediction target.
y = df['Label']
clf = MultinomialNB().fit(X, y)
# NOTE(review): predictions are made on the same rows the model was fitted on.
y_pred = clf.predict(X)
print(y_pred)

['ham' 'ham' 'spam' ... 'ham' 'ham' 'ham']


Calculating accuracy of predicted type of messages

In [None]:
# Count the positions where the true label and the prediction agree.
c = sum(1 for actual, predicted in zip(y, y_pred) if actual == predicted)
print(f"Accuracy: {c / len(y) * 100}%")

Accuracy: 98.7078248384781%


Loading Iris dataset

In [None]:
iris = load_iris(as_frame=True)
df = iris.frame  # feature columns followed by a trailing 'target' column
# Split the frame into a feature matrix and a label vector.
feature_frame, target_series = df.iloc[:, :-1], df.iloc[:, -1]
X = feature_frame.to_numpy()
y = target_series.to_numpy()
classes = np.unique(y)
print(df.head())

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  


Calculating the per-class mean, standard deviation and priors

In [None]:
# Per-class statistics for the Gaussian model, one row per class label.
# The GroupBy reductions are used directly instead of apply(np.mean) /
# apply(np.std) / apply(len): this keeps every reduction per column and avoids
# the pandas deprecation warnings those apply calls trigger.
features_by_class = df.iloc[:, :-1].groupby(y)
mean = features_by_class.mean().to_numpy()
# ddof=0 keeps the population standard deviation that np.std computed.
std = features_by_class.std(ddof=0).to_numpy()
# Prior of a class = share of rows carrying that label.
priors = features_by_class.size().to_numpy() / len(df)
print(mean, std, priors)

[[5.006 3.428 1.462 0.246]
 [5.936 2.77  4.26  1.326]
 [6.588 2.974 5.552 2.026]] [[0.34894699 0.37525458 0.17191859 0.10432641]
 [0.51098337 0.31064449 0.46518813 0.19576517]
 [0.62948868 0.31925538 0.54634787 0.27188968]] [0.33333333 0.33333333 0.33333333]


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


Calculating Naive Bayes probabilities using the Gaussian approach

In [None]:
def calculate_probability(X, mean, std):
    """Return the joint Gaussian likelihood of one or more feature vectors.

    Every feature is scored with the normal density defined by ``mean`` and
    ``std``; the per-feature densities are then multiplied together along the
    last (feature) axis, which is the naive-Bayes independence assumption.

    Parameters
    ----------
    X : array-like, shape (n_features,) or (n_samples, n_features)
        Feature values to score.
    mean, std : array-like, shape (n_features,)
        Per-feature mean and standard deviation of one class.

    Returns
    -------
    numpy scalar or numpy.ndarray
        A single likelihood for a 1-D ``X``, or one likelihood per row for a
        2-D ``X``.
    """
    X, mean, std = (np.asarray(a, dtype=float) for a in (X, mean, std))
    exponent = -0.5 * ((X - mean) / std) ** 2
    density = np.exp(exponent) / (std * np.sqrt(2 * np.pi))
    # Multiply over the feature axis so a batch of rows is never multiplied
    # across samples.
    return np.prod(density, axis=-1)

Comparing each dataset entry with classes mean and standard deviation

In [None]:
def predict_prob(X):
    """Return the unnormalised posterior score of every class for one sample.

    For each class the Gaussian likelihood of ``X`` is multiplied by that
    class's prior, so the scores follow Bayes' rule up to a common constant.
    Reads the module-level ``classes``, ``mean``, ``std`` and ``priors``.

    Parameters
    ----------
    X : numpy.ndarray, shape (n_features,)
        Feature vector of a single sample.

    Returns
    -------
    list
        One score per class, in the same order as ``classes``.
    """
    scores = []
    # Index the statistics by position rather than by label value so labels
    # need not be 0..k-1. Assumes the rows of mean/std/priors follow the
    # sorted label order that np.unique gives classes.
    for idx in range(len(classes)):
        likelihood = calculate_probability(X, mean[idx], std[idx])
        scores.append(priors[idx] * likelihood)
    return scores

Using best probability value to predict class of each dataset entry

In [None]:
# For every sample keep the position of the highest class score.
y_pred = [np.argmax(predict_prob(sample)) for sample in X]
print(y_pred)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]


Calculating accuracy of Step-by-step implemented approach

In [None]:
# Count the positions where the true label and the prediction agree.
c = sum(1 for actual, predicted in zip(y, y_pred) if actual == predicted)
print(f"Accuracy: {c / len(y) * 100}%")

Accuracy: 96.0%


Using in-built function to predict class of each dataset entry

In [None]:
# Fit scikit-learn's Gaussian naive Bayes on the same data for comparison
# with the step-by-step implementation.
nb = GaussianNB().fit(X, y)
y_pred = nb.predict(X)
print(y_pred)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

Calculating accuracy of in-built function

In [None]:
# Count the positions where the true label and the prediction agree.
c = sum(1 for actual, predicted in zip(y, y_pred) if actual == predicted)
print(f"Accuracy: {c / len(y) * 100}%")

Accuracy: 96.0%


Loading Digits dataset and using Standard Scaler to scale them

In [None]:
digits = load_digits()
y = digits.target
# Fit the scaler on the raw features, then replace X with the scaled values.
scaler = StandardScaler().fit(digits.data)
X = scaler.transform(digits.data)
print(X, y)

[[ 0.         -0.33501649 -0.04308102 ... -1.14664746 -0.5056698
  -0.19600752]
 [ 0.         -0.33501649 -1.09493684 ...  0.54856067 -0.5056698
  -0.19600752]
 [ 0.         -0.33501649 -1.09493684 ...  1.56568555  1.6951369
  -0.19600752]
 ...
 [ 0.         -0.33501649 -0.88456568 ... -0.12952258 -0.5056698
  -0.19600752]
 [ 0.         -0.33501649 -0.67419451 ...  0.8876023  -0.5056698
  -0.19600752]
 [ 0.         -0.33501649  1.00877481 ...  0.8876023  -0.26113572
  -0.19600752]] [0 1 2 ... 8 9 8]


Creating a parameter grid to test a wide range of KNN configurations

In [None]:
# Candidate settings: odd neighbour counts from 3 to 19, both weighting
# schemes and two distance metrics.
grid_params = {
    'n_neighbors': list(range(3, 20, 2)),
    'weights': ['distance', 'uniform'],
    'metric': ['euclidean', 'manhattan'],
}
gs = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=grid_params,
    n_jobs=-1,  # use all available cores
)
gs_results = gs.fit(X, y)

Printing the best parameters (including k) and their accuracy

In [None]:
# best_score_ is the score of the best parameter combination found by the search.
best_score, best_params = gs_results.best_score_, gs_results.best_params_
print(f'Accuracy: {best_score}')
print(f'Best Parameters: {best_params}')

Accuracy: 0.9521556793562365
Best Parameters: {'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'}
