In [1]:
from post_parser_record import PostParserRecord
from collections import Counter

## Getting the top-20 frequent tags in LawSE -- There is a reason for passing 21
def get_frequent_tags(post_parser, topk=21):
  lst_tags = []
  for question_id in post_parser.map_questions:
    question = post_parser.map_questions[question_id]
    creation_date_year = int(question.creation_date.split("-")[0])
    tag = question.tags[0]
    lst_tags.append(tag)
  tag_freq_dic = dict(Counter(lst_tags))
  tag_freq_dic = dict(sorted(tag_freq_dic.items(), key=lambda item: item[1], reverse=True))
  return list(tag_freq_dic.keys())[:topk]

In [2]:
from bs4 import BeautifulSoup
import string
import re

def preprocess_text(text):
    #preprocess text
    soup = BeautifulSoup(text)
    soup_text = soup.findAll(string=True)
    joined_soup_text = ' '.join(soup_text)
    translator = str.maketrans('', '', string.punctuation)
    removed_punctuation = joined_soup_text.translate(translator)
    final_text = re.sub(r'[\n\t]', ' ', removed_punctuation)
    return final_text



In [3]:

# Getting dictionary of train and test samples in form of
# key: tag value: list of tuples in form of (title, body)
def build_train_test(post_parser, lst_frequent_tags):
  dic_training = {}
  dic_test = {}
  for question_id in post_parser.map_questions:
    question = post_parser.map_questions[question_id]
    creation_date_year = int(question.creation_date.split("-")[0])
    tag = question.tags[0]
    if tag in lst_frequent_tags:
      #title = preprocess_text(question.title.lower())  
      #body = preprocess_text(question.body.lower())
      title = question.title.lower()
      body = question.body.lower()
      if creation_date_year > 2021:
        if tag in dic_test:
          dic_test[tag].append((title, body))
        else:
          dic_test[tag] = [(title, body)]
      else:
        if tag in dic_training:
          dic_training[tag].append((title, body))
        else:
          dic_training[tag] = [(title, body)]
  return dic_test, dic_training

In [4]:
def build_dictionaries(dic):
    title = {}
    body = {}
    both = {}
    for key, values in dic.items():
        for value in values:
            title_text = preprocess_text(str(value[0]))
            body_text = preprocess_text(str(value[1]))
            both_text = title_text + " " + body_text
            if key in title:
                title[key].append(title_text)
            else:
                title[key] = [title_text]
            if key in body:
                body[key].append(body_text)
            else:
                body[key] = [body_text]
            if key in both:
                both[key].append(both_text)
            else:
                both[key] = [both_text]
    return title, body, both


In [5]:
post_parser = PostParserRecord("Posts_law.xml")
lst_frequent_tags = get_frequent_tags(post_parser)
# We removed contract as it had no post after 2021
lst_frequent_tags.remove("contract")
dic_test, dic_training = build_train_test(post_parser, lst_frequent_tags)
print("class\t#training\t#test")
for item in dic_training:
  print(str(item) + "\t" +str(len(dic_training[item]))+"\t"+str(len(dic_test[item])))
dic_training_title, dic_training_body, dic_training_both = build_dictionaries(dic_training)
dic_test_title, dic_test_body, dic_test_both = build_dictionaries(dic_test)

class	#training	#test
criminal-law	948	78
copyright	2016	181
united-states	5668	863
united-kingdom	1195	271
employment	238	36
international	316	43
canada	382	35
intellectual-property	301	29
england-and-wales	165	138
european-union	219	30
licensing	241	29
california	391	41
internet	416	39
business	171	7
rental-property	158	20
software	292	33
contract-law	1065	111
privacy	351	23
constitutional-law	177	21
gdpr	435	63


  soup = BeautifulSoup(text)


In [59]:
import numpy as np
from collections import defaultdict

class NaiveBayesClassifier:
    def __init__(self):
        self.vocab = set()
        self.priors = {}
        self.likelihoods = defaultdict(lambda: defaultdict(int))
    
    def train(self, train_dict):
        # Compute the prior probabilities
        total_docs = sum(len(train_dict[tag]) for tag in train_dict)
        for tag in train_dict:
            self.priors[tag] = len(train_dict[tag]) / total_docs

        # Set the prior probabilities to be uniform if they are all the same
        if len(set(self.priors.values())) == 1:
            uniform_prior = 1 / len(self.priors)
            for tag in self.priors:
                self.priors[tag] = uniform_prior

        # Build the vocabulary
        for tag in train_dict:
            for text in train_dict[tag]:
                words = text.split()
                self.vocab.update(words)
        # Compute the conditional probabilities with Laplace smoothing
        alpha = 0.76  # Laplace smoothing parameter
        for tag in train_dict:
            tag_docs = train_dict[tag]
            tag_word_counts = defaultdict(int)
            for text in tag_docs:
                words = text.split()
                for word in words:
                    tag_word_counts[word] += 1
            total_words = sum(tag_word_counts.values())
            for word in self.vocab:
                self.likelihoods[tag][word] = (tag_word_counts[word] + alpha) / (total_words + alpha * len(self.vocab))

    def predict(self, text):
        words = text.split()
        probs = {tag: np.log(self.priors[tag]) for tag in self.priors}
        for word in words:
            if word not in self.vocab:
                continue
            for tag in self.likelihoods:
                probs[tag] += np.log(self.likelihoods[tag][word])
        return max(probs, key=probs.get)


In [60]:
def classify_dic(train, test):
    classifier = NaiveBayesClassifier()
    classifier.train(train)
    total = 0
    cnt = 0
    uni = 0
    for tag in test:
        for text in test[tag]:
            #print(text)
            predicted_tag = classifier.predict(text)
            total += 1
            if predicted_tag == 'united-states':
                uni += 1
            if predicted_tag == tag:
                #print(f"CORRECT Predicted tag: {predicted_tag}, Actual tag: {tag}")
                cnt += 1
            #else:
                #print(f"ERROR Predicted tag: {predicted_tag}, Actual tag: {tag}")
    print("correct predictions " + str(cnt))
    print("wrong predictions " + str(total - cnt)) 
    print("total predictions " + str(total))    
    print("united states predictions " + str(uni))

In [61]:
print("title")
classify_dic(dic_training_title, dic_test_title)
print("body")
classify_dic(dic_training_body, dic_test_body)
print("title+body")
classify_dic(dic_training_both, dic_test_both)


title
correct predictions 1052
wrong predictions 1039
total predictions 2091
united states predictions 1505
body
correct predictions 1054
wrong predictions 1037
total predictions 2091
united states predictions 1342
title+body
correct predictions 1076
wrong predictions 1015
total predictions 2091
united states predictions 1268
