In [None]:
# Copy the dataset archive from the Drive folder into /content.
# NOTE(review): the following cell unzips directly from the Drive path, so this
# local copy does not appear to be used afterwards - confirm it is still needed.
# NOTE(review): `shutil` is imported here rather than in the main import cell.
import shutil
shutil.copy2("/content/drive/My Drive/ML Offline/Data.zip?dl=0", "/content")

'/content/Data.zip?dl=0'

In [None]:
# Extract the archive (read from the Drive folder) into the current working
# directory, producing Data/Training, Data/Test and Data/topics.txt.
# NOTE(review): later cells read /content/Data/Training, so this presumably runs with /content as cwd - confirm.
!unzip /content/drive/"My Drive"/"ML Offline"/Data.zip?dl=0

Archive:  /content/drive/My Drive/ML Offline/Data.zip?dl=0
   creating: Data/Test/
  inflating: Data/Test/3d_Printer.xml  
  inflating: Data/Test/Anime.xml     
  inflating: Data/Test/Arduino.xml   
  inflating: Data/Test/Astronomy.xml  
  inflating: Data/Test/Biology.xml   
  inflating: Data/Test/Chess.xml     
  inflating: Data/Test/Coffee.xml    
  inflating: Data/Test/Cooking.xml   
  inflating: Data/Test/Law.xml       
  inflating: Data/Test/Space.xml     
  inflating: Data/Test/Windows_Phone.xml  
  inflating: Data/Test/Wood_Working.xml  
  inflating: Data/topics.txt         
   creating: Data/Training/
  inflating: Data/Training/3d_Printer.xml  
  inflating: Data/Training/Anime.xml  
  inflating: Data/Training/Arduino.xml  
  inflating: Data/Training/Astronomy.xml  
  inflating: Data/Training/Biology.xml  
  inflating: Data/Training/Chess.xml  
  inflating: Data/Training/Coffee.xml  
  inflating: Data/Training/Cooking.xml  
  inflating: Data/Training/Law.xml   
  inflating: Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import re
import time
from datetime import datetime
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
from urllib.request import urlopen
from bs4 import BeautifulSoup
import json
from tqdm import tqdm
from sys import stdout
import os
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

In [None]:
# Fetch the NLTK resources needed by preprocess(): the tokenizer models used by
# word_tokenize, the stop-word lists, and the WordNet data used by the lemmatizer.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Resources that do not depend on the input text. They are built on first use and
# then reused, instead of being re-created for every document.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
  """Lazily build and cache the stop-word set, punctuation table, lemmatizer and stemmer."""
  if not _PREPROCESS_RESOURCES:
    # Build everything first so a failure (e.g. corpus not downloaded yet)
    # leaves the cache empty and the next call retries.
    built = {
      'stop_words': frozenset(stopwords.words('english')),
      'punctuation_table': str.maketrans('', '', string.punctuation),
      'lemmatizer': WordNetLemmatizer(),
      'stemmer': PorterStemmer(),
    }
    _PREPROCESS_RESOURCES.update(built)
  return _PREPROCESS_RESOURCES


def preprocess(text):
  """Turn raw text into a list of normalized tokens.

  Steps, in order: lowercase, drop (optionally signed) digit runs, strip ASCII
  punctuation, tokenize, remove English stop words, lemmatize, then stem.

  Args:
    text: the document as a single string.

  Returns:
    A list of token strings; may be empty if nothing survives the filters.
  """
  resources = _get_preprocess_resources()

  # Lowercase the text
  text = text.lower()

  # Number removal
  text = re.sub(r'[-+]?\d+', '', text)

  # Remove punctuation
  # NOTE(review): because punctuation is stripped before stop-word filtering,
  # contractions such as "don't" reach the filter as "dont" - confirm intended.
  text = text.translate(resources['punctuation_table'])

  # Tokenize
  tokens = word_tokenize(text)

  # Remove stop words
  stop_words = resources['stop_words']
  tokens = [word for word in tokens if word not in stop_words]

  # Lemmatize tokens
  lemmatizer = resources['lemmatizer']
  tokens = [lemmatizer.lemmatize(word) for word in tokens]

  # Stem tokens
  stemmer = resources['stemmer']
  tokens = [stemmer.stem(word) for word in tokens]
  return tokens

In [None]:
# Build the train / validation / test splits from the per-topic XML files.
# All three splits are taken from the Training folder: for every topic file the
# first 500 usable documents go to train, the next 200 to validation and the
# next 500 to test (1200 usable documents per topic at most).
data_path = "/content/Data/Training"
data_files_list = os.listdir(data_path)
# This topic is excluded from the experiment.
# NOTE(review): the reason for the exclusion is not recorded here.
data_files_list.remove('3d_Printer.xml')
#data_files_list = ['Coffee.xml', 'Arduino.xml', 'Anime.xml']
training_data_dict_list = []
valid_data_dict_list = []
test_data_dict_list = []
for data_file in tqdm(data_files_list):
  data_file_path = data_path + "/" + data_file
  # The topic label is the file name up to the first dot.
  topic = data_file.split(".")[0]
  #print(topic)
  # NOTE(review): no explicit encoding is passed, so the platform default is used.
  with open(data_file_path, "r") as f:
    # Counts only documents that were kept (non-empty after preprocessing).
    line_count = 0
    for line in f:
      line = line.lstrip()
      # Only lines that start with a <row element are treated as documents.
      if line.startswith("<row"):
        #line_count += 1
        data_dict = {}
        # The document text is the row's "Body" attribute, which itself holds
        # markup; it is parsed a second time to extract plain text.
        soup = BeautifulSoup(line, "xml")
        body_temp = soup.find("row")["Body"]
        soup2 = BeautifulSoup(body_temp, "html")
        body = soup2.get_text().replace("\n", " ")
        # Skip documents with no text at all.
        if body.strip() == "":
          continue
        words_list = preprocess(body)
        # Skip documents whose tokens were all filtered out.
        if len(words_list) == 0:
          continue
        line_count += 1
        data_dict['doc'] = words_list
        data_dict['topic'] = topic
        # Route the document to a split by its position among the kept documents.
        if line_count <= 500:
          training_data_dict_list.append(data_dict)
        elif line_count > 500 and line_count <=700:
          valid_data_dict_list.append(data_dict)
        elif line_count > 700 and line_count <= 1200:
          test_data_dict_list.append(data_dict)   
      # Stop reading this file once 1200 documents have been kept.
      if line_count >= 1200: 
        break

100%|██████████| 11/11 [00:38<00:00,  3.52s/it]


In [None]:
# Split sizes, printed in the order: train, test, validation.
print(len(training_data_dict_list))
print(len(test_data_dict_list))
print(len(valid_data_dict_list))

5500
5500
2200


In [None]:
# Persist each split as a JSON file in the working directory.
for split_path, split_docs in (
    ("train.json", training_data_dict_list),
    ("test.json", test_data_dict_list),
    ("valid.json", valid_data_dict_list),
):
  with open(split_path, "w") as f:
    json.dump(split_docs, f)

In [None]:
# Read the three splits back from their JSON files.
with open("train.json", "r") as train_f, \
     open("valid.json", "r") as valid_f, \
     open("test.json", "r") as test_f:
  train_data = json.load(train_f)
  valid_data = json.load(valid_f)
  test_data = json.load(test_f)

In [None]:
def get_vocabulary(train_data):
  """Return the sorted array of distinct tokens found in the 'doc' lists of train_data."""
  tokens = [token for entry in train_data for token in entry['doc']]
  return np.unique(tokens)

In [None]:
def get_classes(train_data):
  """Return the sorted array of distinct 'topic' labels found in train_data."""
  topics = []
  for entry in train_data:
    topics.append(entry['topic'])
  return np.unique(topics)

In [None]:
def calculate_total_words_in_each_class(train_data, classes):
  """Count the total number of tokens per class.

  Args:
    train_data: iterable of dicts with a 'doc' token list and a 'topic' label.
    classes: the class labels; every label starts with a count of 0.

  Returns:
    Dict mapping each class label to the summed length of its documents.
  """
  totals = dict.fromkeys(classes, 0)
  for entry in tqdm(train_data):
    label = entry['topic']
    totals[label] = totals[label] + len(entry['doc'])
  return totals

In [None]:
def calculate_number_of_each_word_in_each_class(train_data, classes, vocabulary):
  """Count how often each vocabulary word occurs in the documents of each class.

  The counts are accumulated in a single pass over the tokens of every
  document, rather than by scanning every document once per vocabulary word.
  Because that pass is short, no progress bar is shown.

  Args:
    train_data: iterable of dicts with a 'doc' token list and a 'topic' label.
    classes: the class labels; every (word, class) pair starts at 0.
    vocabulary: the words to count; tokens outside it are ignored.

  Returns:
    Nested dict ``{word: {class: count}}`` covering every vocabulary word
    and every class.
  """
  word_in_each_class = {word: {cls: 0 for cls in classes} for word in vocabulary}

  for d in train_data:
    topic = d['topic']
    for word in d['doc']:
      per_class = word_in_each_class.get(word)
      # Tokens that are not part of the vocabulary are not counted.
      if per_class is not None:
        per_class[topic] += 1
  return word_in_each_class

In [None]:
def renew_probabilities(classes, vocabulary):
  """Return a fresh ``{word: {class: 0}}`` table for every vocabulary word and class."""
  return {word: {cls: 0 for cls in classes} for word in vocabulary}

In [None]:
def train(alpha, vocabulary, classes, word_in_each_class, total_words_in_class):
  """Compute additively smoothed word-given-class probabilities.

  For each word w and class c the value is
  ``(count(w, c) + alpha) / (total_words(c) + alpha * |vocabulary|)``,
  evaluated in ``np.longdouble``.

  Args:
    alpha: smoothing constant added to every count.
    vocabulary: the words to compute probabilities for.
    classes: the class labels.
    word_in_each_class: nested dict ``{word: {class: count}}``.
    total_words_in_class: dict ``{class: total token count}``.

  Returns:
    Nested dict ``{word: {class: probability}}``.
  """
  vocab_size = len(vocabulary)
  p_word_class = {}
  for word in tqdm(vocabulary):
    per_class = {}
    for cls in classes:
      numerator = np.longdouble(word_in_each_class[word][cls] + alpha)
      denominator = np.longdouble(total_words_in_class[cls] + (alpha * vocab_size))
      per_class[cls] = np.longdouble(numerator / denominator)
    p_word_class[word] = per_class
  return p_word_class

In [None]:
def predict(predict_data, p_word_class, classes, p_unknown_word):
  """Return the class with the highest likelihood for one document.

  Each class is scored by the sum of the log-probabilities of the document's
  tokens. Working in log space gives the same ranking as multiplying the
  probabilities, but does not underflow to zero on long documents (which
  would make every class tie and the first class win).

  Args:
    predict_data: dict with a 'doc' list of tokens. A 'topic' key is not
      required.
    p_word_class: nested dict ``{word: {class: probability}}``.
    classes: the candidate class labels, tried in order.
    p_unknown_word: dict ``{class: probability}`` used for tokens that are
      missing from ``p_word_class``.

  Returns:
    The best-scoring label; the earliest label wins ties. Returns "" when
    ``classes`` is empty.
  """
  words = predict_data['doc']

  predicted_topic = ""
  best_score = None
  for cls in classes:
    score = 0.0
    for word in words:
      word_probs = p_word_class.get(word)
      if word_probs is not None:
        p = word_probs[cls]
      else:
        p = p_unknown_word[cls]
      # A zero probability rules the class out; avoid calling log(0).
      score = score + (np.log(p) if p > 0 else -np.inf)
    if best_score is None or score > best_score:
      best_score = score
      predicted_topic = cls

  return predicted_topic
    

In [None]:
# Distinct topic labels present in the training split.
classes = get_classes(train_data)

In [None]:
# Total number of tokens per topic over the training split.
total_words_in_class = calculate_total_words_in_each_class(train_data, classes)

100%|██████████| 5500/5500 [00:00<00:00, 536805.32it/s]


In [None]:
# Distinct tokens of the training split.
vocabulary = get_vocabulary(train_data)

In [None]:
# Occurrence count of every vocabulary word within each topic of the training split.
word_in_each_class = calculate_number_of_each_word_in_each_class(train_data, classes, vocabulary)

100%|██████████| 20620/20620 [03:18<00:00, 103.76it/s]


In [None]:
def calculate_accuracy(test_data, p_word_class, classes, p_unknown_word):
  """Return the percentage of documents whose predicted topic equals the labelled one.

  Args:
    test_data: sequence of dicts with 'doc' and 'topic' keys.
    p_word_class: nested dict ``{word: {class: probability}}``.
    classes: the candidate class labels.
    p_unknown_word: dict ``{class: probability}`` for out-of-vocabulary tokens.

  Returns:
    Accuracy as a float in the range 0-100.
  """
  hits = 0
  for sample in tqdm(test_data):
    expected = str(sample['topic'])
    guessed = str(predict(sample, p_word_class, classes, p_unknown_word))
    # Labels are compared as plain strings.
    hits += int(expected == guessed)

  return (hits / len(test_data)) * 100

In [None]:
# Additive smoothing constant used by train() and for the unseen-word probability.
alpha = 0.2

In [None]:
# Smoothed probability of every vocabulary word given each topic.
p_word_class = train(alpha, vocabulary, classes, word_in_each_class, total_words_in_class)

100%|██████████| 20620/20620 [00:00<00:00, 30183.97it/s]


In [None]:
# Per-class probability for a token with a zero count: the same smoothed
# formula as train(), with the word count fixed at 0.
p_unknown_word = {
  cls: alpha/(total_words_in_class[cls] + (alpha * len(vocabulary)))
  for cls in classes
}

In [None]:
# Accuracy (%) on the validation split.
calculate_accuracy(valid_data, p_word_class, classes, p_unknown_word)

100%|██████████| 2200/2200 [00:01<00:00, 1873.48it/s]


91.31818181818183

In [None]:
# Fifty empty buckets, keyed 1..50, to hold the small test subsets.
small_test_sets = {set_id: [] for set_id in range(1, 51)}

In [None]:
# Partition the saved test split into 50 small evaluation subsets.
# test.json stores the test documents topic by topic, 500 per topic; each
# consecutive run of 10 documents within a topic goes to the next subset, so
# every subset receives 10 documents from every topic.
DOCS_PER_TOPIC = 500   # consecutive test documents belonging to one topic
DOCS_PER_CHUNK = 10    # documents one topic contributes to one subset

# Start from empty buckets so re-running this cell does not append duplicates.
small_test_sets = {i: [] for i in range(1, DOCS_PER_TOPIC // DOCS_PER_CHUNK + 1)}

with open("test.json", "r") as f:
  data = json.load(f)

for line_count, line in enumerate(data, start=1):
  # Position inside the current topic's run, mapped to a subset id in 1..50.
  test_set_count = ((line_count - 1) % DOCS_PER_TOPIC) // DOCS_PER_CHUNK + 1
  small_test_sets[test_set_count].append(line)

In [None]:
# Accuracy (%) of the trained model on each of the 50 small test subsets.
# A dedicated loop variable is used so that `test_data` (the full test split
# loaded from test.json) is not overwritten by the last subset.
accuracy_list = []
for i in range(1, 51):
  subset = small_test_sets[i]
  accuracy_list.append(calculate_accuracy(subset, p_word_class, classes, p_unknown_word))

100%|██████████| 110/110 [00:00<00:00, 1469.38it/s]
100%|██████████| 110/110 [00:00<00:00, 2069.00it/s]
100%|██████████| 110/110 [00:00<00:00, 2057.87it/s]
100%|██████████| 110/110 [00:00<00:00, 1923.05it/s]
100%|██████████| 110/110 [00:00<00:00, 1495.29it/s]
100%|██████████| 110/110 [00:00<00:00, 1585.21it/s]
100%|██████████| 110/110 [00:00<00:00, 1678.60it/s]
100%|██████████| 110/110 [00:00<00:00, 2013.26it/s]
100%|██████████| 110/110 [00:00<00:00, 1903.93it/s]
100%|██████████| 110/110 [00:00<00:00, 1879.07it/s]
100%|██████████| 110/110 [00:00<00:00, 1894.29it/s]
100%|██████████| 110/110 [00:00<00:00, 1780.92it/s]
100%|██████████| 110/110 [00:00<00:00, 1754.20it/s]
100%|██████████| 110/110 [00:00<00:00, 1712.06it/s]
100%|██████████| 110/110 [00:00<00:00, 1675.77it/s]
100%|██████████| 110/110 [00:00<00:00, 1777.49it/s]
100%|██████████| 110/110 [00:00<00:00, 1930.29it/s]
100%|██████████| 110/110 [00:00<00:00, 1737.69it/s]
100%|██████████| 110/110 [00:00<00:00, 1430.16it/s]
100%|███████

In [None]:
# One line per subset: "<1-based subset number>: <accuracy>".
for position, value in enumerate(accuracy_list, start=1):
  print(f"{position}: {value}")

1: 92.72727272727272
2: 95.45454545454545
3: 96.36363636363636
4: 90.0
5: 96.36363636363636
6: 93.63636363636364
7: 95.45454545454545
8: 93.63636363636364
9: 90.0
10: 90.9090909090909
11: 92.72727272727272
12: 95.45454545454545
13: 90.0
14: 88.18181818181819
15: 90.9090909090909
16: 90.0
17: 91.81818181818183
18: 92.72727272727272
19: 91.81818181818183
20: 88.18181818181819
21: 84.54545454545455
22: 97.27272727272728
23: 94.54545454545455
24: 93.63636363636364
25: 89.0909090909091
26: 89.0909090909091
27: 87.27272727272727
28: 90.0
29: 87.27272727272727
30: 94.54545454545455
31: 90.9090909090909
32: 93.63636363636364
33: 92.72727272727272
34: 94.54545454545455
35: 90.0
36: 92.72727272727272
37: 90.0
38: 90.9090909090909
39: 96.36363636363636
40: 96.36363636363636
41: 89.0909090909091
42: 95.45454545454545
43: 90.0
44: 90.0
45: 93.63636363636364
46: 93.63636363636364
47: 89.0909090909091
48: 88.18181818181819
49: 88.18181818181819
50: 94.54545454545455


In [None]:
# Store the per-subset accuracies under the key 'NB' in nb_acc.json.
temp_dict = {'NB': accuracy_list}
with open("nb_acc.json", "w") as f:
  json.dump(temp_dict, f)
  

In [None]:
# Copy nb_acc.json into the Drive folder.
shutil.copy2("/content/nb_acc.json", "/content/drive/My Drive/ML Offline")

'/content/drive/My Drive/ML Offline/nb_acc.json'

In [None]:
# t-test section: start from the accuracies computed in this session.
# NOTE(review): nb_acc is reassigned from nb_acc.json a few cells below, so this binding is superseded.
nb_acc = accuracy_list

In [None]:
# Fetch knn_acc.json from the Drive folder and load it.
# NOTE(review): this notebook never writes knn_acc.json; it is presumably produced by a separate k-NN notebook - confirm.
shutil.copy2("/content/drive/My Drive/ML Offline/knn_acc.json", "/content")
with open("knn_acc.json", "r") as f:
  knn_acc = json.load(f)

In [None]:
# Keep only the value stored under 'knn'.
# NOTE(review): this rebinds knn_acc to a different kind of object, so the cell
# presumably cannot be re-run on its own without reloading the JSON first.
knn_acc = knn_acc['knn']

In [None]:
# Reload the accuracies written to nb_acc.json earlier in this notebook.
with open("nb_acc.json", "r") as f:
  nb_acc = json.load(f)

In [None]:
# Keep only the list stored under 'NB' (the key used when nb_acc.json was written).
nb_acc = nb_acc['NB']

In [None]:
# Convert both accuracy sequences to NumPy arrays for the statistical test.
nb_acc = np.array(nb_acc)
knn_acc = np.array(knn_acc)

In [None]:
# Two-sample t-test comparing the two sets of per-subset accuracies.
# NOTE(review): ttest_ind treats the two samples as independent. If the k-NN
# accuracies were measured on the same 50 subsets, a paired test
# (stats.ttest_rel) would match the design - confirm.
# NOTE(review): this import sits mid-notebook rather than in the main import cell.
from scipy import stats
t_stat, p_value = stats.ttest_ind(nb_acc , knn_acc)

In [None]:
# t statistic returned by stats.ttest_ind(nb_acc, knn_acc).
print(t_stat)

29.271637869313974


In [None]:
# p-value returned by stats.ttest_ind(nb_acc, knn_acc).
print(p_value)

3.0336649885380764e-50
