Importing required packages.

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
from os import listdir
import numpy as np

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Processing Datasets

---


Tokenising raw datasets. enron1, enron2 and enron4 will correspond to index 0, 1 and 2 going forward. 

In [None]:
common_filedir = "/content/drive/MyDrive/DATASET/enron"
tokenized_train_datasets = [[],[],[]]
tokenized_test_datasets = [[],[],[]]

for i in ['1', '2', '4']:
  for txt_file in listdir(common_filedir + i + "_train/enron" + i + "/train/spam/"):
    with open(common_filedir + i + "_train/enron" + i + "/train/spam/"+txt_file, 'r',encoding = 'unicode_escape') as file:
      data = file.read()
      tokenized_train_datasets[int(i)-1 if int(i) != 4 else 2].append((word_tokenize(data), 1))
    

  for txt_file in listdir(common_filedir + i + "_train/enron" + i + "/train/ham/"):  
    with open(common_filedir + i + "_train/enron" + i + "/train/ham/"+txt_file, 'r') as file:
      data = file.read()
      tokenized_train_datasets[int(i)-1 if int(i) != 4 else 2].append((word_tokenize(data), 0))

  for txt_file in listdir(common_filedir + i + "_test/enron" + i + "/test/spam/"):
    with open(common_filedir + i + "_test/enron" + i + "/test/spam/"+txt_file, 'r',encoding = 'unicode_escape') as file:
      data = file.read()
      tokenized_test_datasets[int(i)-1 if int(i) != 4 else 2].append((word_tokenize(data), 1))

  for txt_file in listdir(common_filedir + i + "_test/enron" + i + "/test/ham/"):  
    with open(common_filedir + i + "_test/enron" + i + "/test/ham/"+txt_file, 'r') as file:
      data = file.read()
      tokenized_test_datasets[int(i)-1 if int(i) != 4 else 2].append((word_tokenize(data), 0))

Creating separate vocabulary list based on the training datasets.

In [None]:
vocab = [set(), set(), set()]
for i, dataset in enumerate(tokenized_train_datasets):
  for mail, category in dataset:
    for word in mail:
      if word not in stopwords.words('english'):
        vocab[i].add(word)

In [None]:
len(vocab[0]),len(vocab[1]),len(vocab[2])

(9973, 10308, 17747)

Saving each email in dataset as count of words in a dictionary with the vocab list as keys. Using same vocab list for testing set vectorization.

In [None]:
train_data_word_count_n_output = [[],[],[]]
test_data_word_count_n_output = [[],[],[]]

for i, dataset in enumerate(tokenized_train_datasets):
  dataset_bow = []
  for mail, category in dataset:
    dict_vector = dict.fromkeys(vocab[i], 0)
    for word in mail:
      if word in vocab[i]:
        dict_vector[word] += 1
    
    dataset_bow.append([dict_vector, category])
  train_data_word_count_n_output[i] = dataset_bow

for i, dataset in enumerate(tokenized_test_datasets):
  dataset_bow = []
  for mail, category in dataset:
    dict_vector = dict.fromkeys(vocab[i], 0)
    for word in mail:
      if word in vocab[i]:
        dict_vector[word] += 1
    
    dataset_bow.append([dict_vector, category])
  test_data_word_count_n_output[i] = dataset_bow

In [None]:
(len(train_data_word_count_n_output[0][0][0].keys()),
len(train_data_word_count_n_output[1][0][0].keys()),
len(train_data_word_count_n_output[2][0][0].keys()))

(9973, 10308, 17747)

# Multinomial Naive Bayes
---


In [None]:
prior_prob_ham = [0,0,0]
prior_prob_spam = [0,0,0]
for i, dataset in enumerate(train_data_word_count_n_output):
  prior_prob_spam[i] = len([1 for count_dict, category in train_data_word_count_n_output[i] if category == 1])/len(train_data_word_count_n_output[i])
  prior_prob_ham[i] = 1 - prior_prob_spam[i]

prior_count_word_ham = [dict.fromkeys(vocab[0], 0), dict.fromkeys(vocab[1], 0), dict.fromkeys(vocab[2], 0)] 
prior_count_word_spam = [dict.fromkeys(vocab[0], 0), dict.fromkeys(vocab[1], 0), dict.fromkeys(vocab[2], 0)] 
for i, dataset in enumerate(train_data_word_count_n_output):
  for word in vocab[i]:
    for vect, cat in train_data_word_count_n_output[i]:
      if cat == 0:
        prior_count_word_ham[i][word] += vect[word]
      else:
        prior_count_word_spam[i][word] += vect[word]

sum_of_words_ham = [0,0,0]
sum_of_words_spam = [0,0,0]
for i, dataset in enumerate(train_data_word_count_n_output):
  for word,v in prior_count_word_ham[i].items(): 
    sum_of_words_ham[i] += v
  for word,v in prior_count_word_spam[i].items(): 
    sum_of_words_spam[i] += v

prior_prob_word_ham =  [dict.fromkeys(vocab[0], 0), dict.fromkeys(vocab[1], 0), dict.fromkeys(vocab[2], 0)] 
prior_prob_word_spam = [dict.fromkeys(vocab[0], 0), dict.fromkeys(vocab[1], 0), dict.fromkeys(vocab[2], 0)]
for i, dataset in enumerate(train_data_word_count_n_output):
  for word in vocab[i]:
    prior_prob_word_ham[i][word] = (prior_count_word_ham[i][word]+1)/(sum_of_words_ham[i] + sum_of_words_ham[i] -prior_count_word_ham[i][word] - 1)
    prior_prob_word_spam[i][word] = (prior_count_word_spam[i][word]+1)/(sum_of_words_spam[i] + sum_of_words_spam[i] - prior_count_word_spam[i][word]-1)

correct = [0, 0, 0]
wrong = [0, 0, 0]
for i, tokenized_ds in enumerate(tokenized_train_datasets):
  for mail, cat in tokenized_ds:
    c_ham = 0
    c_spam = 0
    for word in mail:
      if word in prior_prob_word_ham[i].keys():
        c_ham += np.log(prior_prob_ham[i]) + np.log(prior_prob_word_ham[i][word])
        c_spam += np.log(prior_prob_spam[i]) + np.log(prior_prob_word_spam[i][word])
    if((cat ==0 and c_ham > c_spam) or (cat == 1 and c_spam >= c_ham)):
      correct[i] += 1
    else:
      wrong[i] += 1

  print("Accuracy for training Dataset",i,":",correct[i]/(wrong[i]+correct[i]))

correct = [0, 0, 0]
wrong = [0, 0, 0]
tp = [0,0,0]
tn = [0,0,0]
fp = [0,0,0]
fn = [0,0,0]
print('')
for i, tokenized_ds in enumerate(tokenized_test_datasets):
  for mail, cat in tokenized_ds:
    c_ham = 0
    c_spam = 0
    for word in mail:
      if word in prior_prob_word_ham[i].keys():
        c_ham += np.log(prior_prob_ham[i]) + np.log(prior_prob_word_ham[i][word])
        c_spam += np.log(prior_prob_spam[i]) + np.log(prior_prob_word_spam[i][word])
    if((cat ==0 and c_ham > c_spam) or (cat == 1 and c_spam >= c_ham)):
      correct[i] += 1
      if(cat ==0):
        tn[i] += 1
      else:
        tp[i] += 1
    else:
      wrong[i] += 1
      if(cat ==0):
        fp[i] += 1
      else:
        fn[i] += 1
  print("\nFor Test Dataset",i,":-")
  print("Accuracy:",correct[i]/(wrong[i]+correct[i]))
  print("F1:",(2 * (tp[i]/(tp[i]+fp[i])) * (tp[i]/(tp[i]+fn[i]))) / ((tp[i]/(tp[i]+fp[i])) + (tp[i]/(tp[i]+fn[i]))))
  print("Precision:",tp[i]/(tp[i]+fp[i]))
  print("Recall:",tp[i]/(tp[i]+fn[i]))
  

Accuracy for training Dataset 0 : 0.9088888888888889
Accuracy for training Dataset 1 : 0.9071274298056156
Accuracy for training Dataset 2 : 0.9401869158878504


For Test Dataset 0 :-
Accuracy: 0.75
F1: 0.3804347826086956
Precision: 1.0
Recall: 0.2348993288590604

For Test Dataset 1 :-
Accuracy: 0.7656903765690377
F1: 0.24324324324324326
Precision: 1.0
Recall: 0.13846153846153847

For Test Dataset 2 :-
Accuracy: 0.8950276243093923
F1: 0.9320619785458879
Precision: 0.8727678571428571
Recall: 1.0


# Discrete Naive Bayes


---



In [None]:
from typing_extensions import final
from IPython.lib.display import IFrame
prior_prob_ham = [0,0,0]
prior_prob_spam = [0,0,0]
for i, dataset in enumerate(train_data_word_count_n_output):
  prior_prob_spam[i] = len([1 for count_dict, category in train_data_word_count_n_output[i] if category == 1])/len(train_data_word_count_n_output[i])
  prior_prob_ham[i] = 1 - prior_prob_spam[i]

prior_count_word_ham = [dict.fromkeys(vocab[0], 0), dict.fromkeys(vocab[1], 0), dict.fromkeys(vocab[2], 0)] 
prior_count_word_spam = [dict.fromkeys(vocab[0], 0), dict.fromkeys(vocab[1], 0), dict.fromkeys(vocab[2], 0)] 
for i, dataset in enumerate(train_data_word_count_n_output):
  for word in vocab[i]:
    for vect, cat in train_data_word_count_n_output[i]:
      if cat == 0:
        prior_count_word_ham[i][word] += 1 if vect[word] > 0 else 0
      else:
        prior_count_word_spam[i][word] += 1 if vect[word] > 0 else 0

sum_of_words_ham = [0,0,0]
sum_of_words_spam = [0,0,0]
for i, dataset in enumerate(train_data_word_count_n_output):
  for word,v in prior_count_word_ham[i].items(): 
    sum_of_words_ham[i] += v
  for word,v in prior_count_word_spam[i].items(): 
    sum_of_words_spam[i] += v

prior_prob_word_ham =  [dict.fromkeys(vocab[0], 0), dict.fromkeys(vocab[1], 0), dict.fromkeys(vocab[2], 0)] 
prior_prob_word_spam = [dict.fromkeys(vocab[0], 0), dict.fromkeys(vocab[1], 0), dict.fromkeys(vocab[2], 0)]
for i, dataset in enumerate(train_data_word_count_n_output):
  for word in vocab[i]:
    prior_prob_word_ham[i][word] = (prior_count_word_ham[i][word]+1)/(sum_of_words_ham[i] + sum_of_words_ham[i] -prior_count_word_ham[i][word] - 1)
    prior_prob_word_spam[i][word] = (prior_count_word_spam[i][word]+1)/(sum_of_words_spam[i] + sum_of_words_spam[i] - prior_count_word_spam[i][word]-1)

correct = [0, 0, 0]
wrong = [0, 0, 0]
for i, tokenized_ds in enumerate(tokenized_train_datasets):
  for mail, cat in tokenized_ds:
    c_ham = 0
    c_spam = 0
    for word in mail:
      if word in prior_prob_word_ham[i].keys():
        c_ham += np.log(prior_prob_ham[i]) + np.log(prior_prob_word_ham[i][word])
        c_spam += np.log(prior_prob_spam[i]) + np.log(prior_prob_word_spam[i][word])
    if((cat ==0 and c_ham > c_spam) or (cat == 1 and c_spam >= c_ham)):
      correct[i] += 1
    else:
      wrong[i] += 1

  print("Accuracy for training Dataset",i,":",correct[i]/(wrong[i]+correct[i]))

correct = [0, 0, 0]
wrong = [0, 0, 0]
tp = [0,0,0]
tn = [0,0,0]
fp = [0,0,0]
fn = [0,0,0]
for i, tokenized_ds in enumerate(tokenized_test_datasets):
  for mail, cat in tokenized_ds:
    c_ham = 0
    c_spam = 0
    for word in mail:
      if word in prior_prob_word_ham[i].keys():
        c_ham += np.log(prior_prob_ham[i]) + np.log(prior_prob_word_ham[i][word])
        c_spam += np.log(prior_prob_spam[i]) + np.log(prior_prob_word_spam[i][word])
    if((cat ==0 and c_ham > c_spam) or (cat == 1 and c_spam >= c_ham)):
      correct[i] += 1
      if(cat ==0):
        tn[i] += 1
      else:
        tp[i] += 1
    else:
      wrong[i] += 1
      if(cat ==0):
        fp[i] += 1
      else:
        fn[i] += 1

  print("\nFor Test Dataset",i,":-")
  print("Accuracy:",correct[i]/(wrong[i]+correct[i]))
  print("F1:",(2 * (tp[i]/(tp[i]+fp[i])) * (tp[i]/(tp[i]+fn[i]))) / ((tp[i]/(tp[i]+fp[i])) + (tp[i]/(tp[i]+fn[i]))))
  print("Precision:",tp[i]/(tp[i]+fp[i]))
  print("Recall:",tp[i]/(tp[i]+fn[i]))


Accuracy for training Dataset 0 : 0.7733333333333333
Accuracy for training Dataset 1 : 0.7688984881209503
Accuracy for training Dataset 2 : 0.9757009345794393

For Test Dataset 0 :-
Accuracy: 0.6842105263157895
F1: 0.06493506493506493
Precision: 1.0
Recall: 0.03355704697986577

For Test Dataset 1 :-
Accuracy: 0.7322175732217573
F1: 0.030303030303030307
Precision: 1.0
Recall: 0.015384615384615385

For Test Dataset 2 :-
Accuracy: 0.9134438305709024
F1: 0.9433051869722556
Precision: 0.8926940639269406
Recall: 1.0


# MCAP Logistic Regression

---

Making 1-D feature vector. Normalizing Bag of Words representation, so that the variables do not overflow

In [None]:
from sklearn.preprocessing import normalize

feature_vector_bow_train_datasets = [[],[],[]]
feature_vector_bernoulli_train_datasets = [[],[],[]]
train_data_set_op = [[],[],[]]

for i, dataset in enumerate(train_data_word_count_n_output):
  vocab_ordered = sorted(vocab[i])
  dataset_bow_ip = []
  dataset_bernoulli_ip = []
  dataset_op = []
  for data_dict, cat in dataset:
    feature_vector_bow = []
    feature_vector_bernoulli = []
    for word in vocab_ordered:
      feature_vector_bow.append(data_dict[word])
      feature_vector_bernoulli.append(1 if data_dict[word] != 0 else 0)
    dataset_bow_ip.append(normalize(np.array([feature_vector_bow])))
    dataset_bernoulli_ip.append([feature_vector_bernoulli])
    dataset_op.append(cat)
  
  feature_vector_bow_train_datasets[i] = dataset_bow_ip
  feature_vector_bernoulli_train_datasets[i] = dataset_bernoulli_ip
  train_data_set_op[i] = dataset_op


feature_vector_bow_test_datasets = [[],[],[]]
feature_vector_bernoulli_test_datasets = [[],[],[]]
test_data_set_op = [[],[],[]]

for i, dataset in enumerate(test_data_word_count_n_output):
  vocab_ordered = sorted(vocab[i])
  dataset_bow_ip = []
  dataset_bernoulli_ip = []
  dataset_op = []
  for data_dict, cat in dataset:
    feature_vector_bow = []
    feature_vector_bernoulli = []
    for word in vocab_ordered:
      feature_vector_bow.append(data_dict[word])
      feature_vector_bernoulli.append(1 if data_dict[word] != 0 else 0)
    dataset_bow_ip.append(normalize(np.array([feature_vector_bow])))
    dataset_bernoulli_ip.append([feature_vector_bernoulli])
    dataset_op.append(cat)
  
  feature_vector_bow_test_datasets[i] = dataset_bow_ip
  feature_vector_bernoulli_test_datasets[i] = dataset_bernoulli_ip
  test_data_set_op[i] = dataset_op


In [None]:
(len(feature_vector_bow_train_datasets[2]), len(train_data_set_op[2]))

(535, 535)

Making converting the data into Numpy arrays for ease of calculations.

In [None]:
np_feature_vector_bow_train_datasets = [[],[],[]]
np_feature_vector_bernoulli_train_datasets = [[],[],[]]
np_train_data_set_op = [[],[],[]]
np_feature_vector_bow_test_datasets = [[],[],[]]
np_feature_vector_bernoulli_test_datasets = [[],[],[]]
np_test_data_set_op = [[],[],[]]

for i in range(3):
  np_feature_vector_bow_train_datasets[i] = np.row_stack(feature_vector_bow_train_datasets[i])
  np_feature_vector_bernoulli_train_datasets[i] = np.row_stack(feature_vector_bernoulli_train_datasets[i])
  np_train_data_set_op[i] = np.array(train_data_set_op[i])
  np_feature_vector_bow_test_datasets[i] = np.row_stack(feature_vector_bow_test_datasets[i])
  np_feature_vector_bernoulli_test_datasets[i] = np.row_stack(feature_vector_bernoulli_test_datasets[i])
  np_test_data_set_op[i] = np.array(test_data_set_op[i])

print(np_feature_vector_bow_train_datasets[0].shape, np_train_data_set_op[0].shape)

(450, 9973) (450,)


Initialize the weights to 0 value vector. Number of weights would be number of features + 1, where the extra one is the bias. Here a model would be nothing but the weight values. Creating separate objects to represent Bag of words and Bernoulli models separately.

In [None]:
bow_model = [{"weights":np.array([], dtype = np.float128), "X":np.array([]), "y":np.array([])},
          {"weights":np.array([], dtype = np.float128), "X":np.array([]), "y":np.array([])},
          {"weights":np.array([], dtype = np.float128), "X":np.array([]), "y":np.array([])}]

for i in range(3):
  bow_model[i]["weights"] = np.ones(len(vocab[i])+1, dtype = np.float128) 
  bow_model[i]["X"] = np_feature_vector_bow_train_datasets[i]
  bow_model[i]["y"] = np_train_data_set_op[i]

In [None]:
ber_model = [{"weights":np.array([], dtype = np.float128), "X":np.array([]), "y":np.array([])},
          {"weights":np.array([], dtype = np.float128), "X":np.array([]), "y":np.array([])},
          {"weights":np.array([], dtype = np.float128), "X":np.array([]), "y":np.array([])}]

for i in range(3):
  ber_model[i]["weights"] = np.ones(len(vocab[i])+1, dtype = np.float128) 
  ber_model[i]["X"] = np_feature_vector_bernoulli_train_datasets[i]
  ber_model[i]["y"] = np_train_data_set_op[i]

In [None]:
def fit(model, delta,lamb):
  Pwk = np.zeros(len(model["X"]), dtype = np.float128)
  for i, Xk in enumerate(model["X"]):
    Xk = np.append([1], Xk)
    sum = np.dot(model["weights"], Xk)
    t1 = sum 
    t2 = np.log(1+np.exp(sum))
    Pwk[i] = np.exp(t1 - t2)
  
  arg1 = np.subtract(model["y"], Pwk)

  for i, w in enumerate(model["weights"]):
    if i != 0:
      dlw = np.dot(model["X"][:, i-1],np.transpose(arg1))
    else:
      dlw = np.sum(arg1)
    old = model["weights"][i]
    regularisation_term = np.array([delta],dtype = np.float128)*np.array([lamb],dtype = np.float128)*np.array([old],dtype = np.float128)*np.array([old],dtype = np.float128)
    model["weights"][i] += np.array([delta*dlw],dtype = np.float128) - regularisation_term

  return model

In [None]:
def fit_for_lasso(model, X, y, delta, lamb):
  Pwk = np.zeros(len(X), dtype = np.float128)
  for i, Xk in enumerate(X):
    Xk = np.append([1], Xk)
    sum = np.dot(model["weights"], Xk)
    t1 = sum 
    t2 = np.log(1+np.exp(sum))
    Pwk[i] = np.exp(t1 - t2)
  
  arg1 = np.subtract(y, Pwk)

  for i, w in enumerate(model["weights"]):
    if i != 0:
      dlw = np.dot(X[:, i-1],np.transpose(arg1))
    else:
      dlw = np.sum(arg1)

    old = model["weights"][i]
    regularisation_term = np.array([delta],dtype = np.float128)*np.array([lamb],dtype = np.float128)*np.array([old],dtype = np.float128)*np.array([old],dtype = np.float128)
    model["weights"][i] += delta*dlw - regularisation_term

  return model

In [None]:
def predict(model, X, y):
  correct, wrong = 0,0
  for i, Xk in enumerate(X):
    Xk = np.append([1], Xk)
    sum = np.dot(model["weights"], Xk)
    t1 = sum 
    t2 = np.log(1+np.exp(sum))
    p = np.exp(t1 - t2)

    if(p > 0.5 and y[i] == 1) or (p <= 0.5 and y[i] == 0): 
      correct += 1
    else:
      wrong += 1
  return correct/(correct+wrong)

In [None]:
from sklearn.model_selection import train_test_split
import copy
for i in range(3):
  X_train, X_test, y_train, y_test = train_test_split(bow_model[i]["X"], bow_model[i]["y"], test_size=0.30, random_state=1)
  bow_model_lasso =  bow_model[i]  
  lambda_n_acc_diff = []
  for l in range(0,11):
    bow_model_lasso = copy.deepcopy(bow_model[i])
    for iter in range(75):
      if iter%5 == 0:
        print(iter, end = ' ')
      bow_model_lasso = fit_for_lasso(bow_model_lasso, X_train, y_train, 0.01,l/10)
    print('')
    acc_tr = predict(bow_model_lasso, X_train, y_train)
    acc_ts = predict(bow_model_lasso, X_test, y_test)
    print("\n",acc_tr, acc_ts, abs(acc_tr-acc_ts), l/10)
    lambda_n_acc_diff.append([abs(acc_tr-acc_ts), l/10])
  print("Best lambda for model",i,":", min(lambda_n_acc_diff))


0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 

 0.8571428571428571 0.8444444444444444 0.012698412698412653 0.0
0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 

 0.8539682539682539 0.8518518518518519 0.002116402116402072 0.1
0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 

 0.8539682539682539 0.8592592592592593 0.005291005291005346 0.2
0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 

 0.8507936507936508 0.8592592592592593 0.00846560846560851 0.3
0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 

 0.8380952380952381 0.8592592592592593 0.021164021164021163 0.4
0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 

 0.8412698412698413 0.8592592592592593 0.017989417989418 0.5
0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 

 0.8253968253968254 0.8592592592592593 0.03386243386243393 0.6
0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 

 0.7936507936507936 0.8518518518518519 0.05820105820105825 0.7
0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 

 0.7968253968253968 0.8518518518518519 0.05502645502645509 0.8
0 5 10 15 20 25 30 35 40 4

In [None]:
lambda_vals = [0.1, 0.5, 0.1]
for i, model in enumerate(bow_model):
  print("--------Model "+str(i)+"---------")
  for iter in range(1, 201):
    bow_model[i] = fit(model, 0.01,lambda_vals[i])

    if(iter%50 == 0):
      acc1 = predict(model, model["X"], model["y"])
      acc2 = predict(model, np_feature_vector_bow_test_datasets[i], np_test_data_set_op[i])
      print("Iterations Done:", iter, end = '\t')
      print("Train Accuracy Dataset", i, ":", acc1, end = '\t')
      print("Test Accuracy Dataset", i, ":", acc2)
  

--------Model 0---------
Iterations Done: 50	Train Accuracy Dataset 0 : 0.84	Test Accuracy Dataset 0 : 0.7390350877192983
Iterations Done: 100	Train Accuracy Dataset 0 : 0.9066666666666666	Test Accuracy Dataset 0 : 0.7697368421052632
Iterations Done: 150	Train Accuracy Dataset 0 : 0.9311111111111111	Test Accuracy Dataset 0 : 0.8070175438596491
Iterations Done: 200	Train Accuracy Dataset 0 : 0.94	Test Accuracy Dataset 0 : 0.8157894736842105
--------Model 1---------
Iterations Done: 50	Train Accuracy Dataset 1 : 0.8358531317494601	Test Accuracy Dataset 1 : 0.7615062761506276
Iterations Done: 100	Train Accuracy Dataset 1 : 0.8855291576673866	Test Accuracy Dataset 1 : 0.7866108786610879
Iterations Done: 150	Train Accuracy Dataset 1 : 0.9222462203023758	Test Accuracy Dataset 1 : 0.8075313807531381
Iterations Done: 200	Train Accuracy Dataset 1 : 0.9265658747300216	Test Accuracy Dataset 1 : 0.8368200836820083
--------Model 2---------
Iterations Done: 50	Train Accuracy Dataset 2 : 0.8542056074

In [None]:
from sklearn.model_selection import train_test_split
import copy
for i in range(3):
  X_train, X_test, y_train, y_test = train_test_split(ber_model[i]["X"], ber_model[i]["y"], test_size=0.30, random_state=1)
  ber_model_lasso =  ber_model[i]  
  lambda_n_acc_diff = []
  for l in range(0,10):
    ber_model_lasso = copy.deepcopy(ber_model[i])
    for iter in range(75):
      if iter%5 == 0:
        print(iter, end = ' ')
      ber_model_lasso = fit_for_lasso(ber_model_lasso, X_train, y_train, 0.01,l/100)
    print('')
    acc_tr = predict(ber_model_lasso, X_train, y_train)
    acc_ts = predict(ber_model_lasso, X_test, y_test)
    print("\n",acc_tr, acc_ts, abs(acc_tr-acc_ts), l/100)
    lambda_n_acc_diff.append([abs(acc_tr-acc_ts), l/100])
  print("Best lambda for model",i,":", min(lambda_n_acc_diff))


0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 

 0.9968253968253968 0.8740740740740741 0.12275132275132272 0.0
0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 

 0.9968253968253968 0.8740740740740741 0.12275132275132272 0.01
0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 

 0.9968253968253968 0.8740740740740741 0.12275132275132272 0.02
0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 

 0.9968253968253968 0.8740740740740741 0.12275132275132272 0.03
0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 

 0.9968253968253968 0.8814814814814815 0.1153439153439153 0.04
0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 

 0.9968253968253968 0.8888888888888888 0.107936507936508 0.05
0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 

 0.9968253968253968 0.8888888888888888 0.107936507936508 0.06
0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 

 0.9968253968253968 0.8962962962962963 0.10052910052910058 0.07
0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 

 0.9968253968253968 0.8962962962962963 0.10052910052910058 0.08
0 5 10 15 20 25 30 35 40 

In [None]:
lambda_vals = [0.07, 0.0, 0.07]
for i, model in enumerate(ber_model):
  print("--------Model "+str(i)+"---------")
  for iter in range(1, 201):
    ber_model[i] = fit(model, 0.01,lambda_vals[i])

    if(iter%50 == 0):
      acc1 = predict(model, model["X"], model["y"])
      acc2 = predict(model, np_feature_vector_bernoulli_test_datasets[i], np_test_data_set_op[i])
      print("Iterations Done:", iter, end = '\t')
      print("Train Accuracy Dataset", i, ":", acc1, end = '\t')
      print("Test Accuracy Dataset", i, ":", acc2)
  

--------Model 0---------
Iterations Done: 50	Train Accuracy Dataset 0 : 0.9866666666666667	Test Accuracy Dataset 0 : 0.8267543859649122
Iterations Done: 100	Train Accuracy Dataset 0 : 1.0	Test Accuracy Dataset 0 : 0.8530701754385965
Iterations Done: 150	Train Accuracy Dataset 0 : 1.0	Test Accuracy Dataset 0 : 0.8508771929824561
Iterations Done: 200	Train Accuracy Dataset 0 : 1.0	Test Accuracy Dataset 0 : 0.8464912280701754
--------Model 1---------
Iterations Done: 50	Train Accuracy Dataset 1 : 0.978401727861771	Test Accuracy Dataset 1 : 0.8368200836820083
Iterations Done: 100	Train Accuracy Dataset 1 : 1.0	Test Accuracy Dataset 1 : 0.8640167364016736
Iterations Done: 150	Train Accuracy Dataset 1 : 1.0	Test Accuracy Dataset 1 : 0.8702928870292888
Iterations Done: 200	Train Accuracy Dataset 1 : 1.0	Test Accuracy Dataset 1 : 0.8723849372384938
--------Model 2---------
Iterations Done: 50	Train Accuracy Dataset 2 : 0.994392523364486	Test Accuracy Dataset 2 : 0.861878453038674
Iterations Do

In [None]:
def predict_analysis(model, X, y):
  correct, wrong = 0,0
  tp,fp,tn,fn = 0,0,0,0
  for i, Xk in enumerate(X):
    Xk = np.append([1], Xk)
    sum = np.dot(model["weights"], Xk)
    t1 = sum 
    t2 = np.log(1+np.exp(sum))
    p = np.exp(t1 - t2)

    
    if(p > 0.5 and y[i] == 1) or (p <= 0.5 and y[i] == 0): 
      correct += 1
      if(y[i] == 0):
        tn += 1
      else:
        tp += 1
    else:
      wrong += 1
      if(y[i] ==0):
        fp += 1
      else:
        fn += 1
  return [correct,wrong,tp,fp,tn,fn]

In [None]:
for i in range(3):
  print("\nFor Bag of Words Test Dataset",i,":-")
  correct,wrong,tp,fp,tn,fn = predict_analysis(bow_model[i], np_feature_vector_bow_test_datasets[i], np_test_data_set_op[i])
  print("Accuracy:",correct/(wrong+correct))
  print("F1:",(2 * (tp/(tp+fp)) * (tp/(tp+fn))) / ((tp/(tp+fp)) + (tp/(tp+fn))))
  print("Precision:",tp/(tp+fp))
  print("Recall:",tp/(tp+fn))


For Bag of Words Test Dataset 0 :-
Accuracy: 0.8157894736842105
F1: 0.6557377049180327
Precision: 0.8421052631578947
Recall: 0.5369127516778524

For Bag of Words Test Dataset 1 :-
Accuracy: 0.8368200836820083
F1: 0.6285714285714286
Precision: 0.825
Recall: 0.5076923076923077

For Bag of Words Test Dataset 2 :-
Accuracy: 0.8858195211786372
F1: 0.9221105527638191
Precision: 0.9061728395061729
Recall: 0.9386189258312021


In [None]:
for i in range(3):
  print("\nFor Bernoulli Test Dataset",i,":-")
  correct,wrong,tp,fp,tn,fn = predict_analysis(ber_model[i], np_feature_vector_bernoulli_test_datasets[i], np_test_data_set_op[i])
  print("Accuracy:",correct/(wrong+correct))
  print("F1:",(2 * (tp/(tp+fp)) * (tp/(tp+fn))) / ((tp/(tp+fp)) + (tp/(tp+fn))))
  print("Precision:",tp/(tp+fp))
  print("Recall:",tp/(tp+fn))


For Bernoulli Test Dataset 0 :-
Accuracy: 0.8464912280701754
F1: 0.7107438016528926
Precision: 0.9247311827956989
Recall: 0.5771812080536913

For Bernoulli Test Dataset 1 :-
Accuracy: 0.8723849372384938
F1: 0.7239819004524888
Precision: 0.8791208791208791
Recall: 0.6153846153846154

For Bernoulli Test Dataset 2 :-
Accuracy: 0.861878453038674
F1: 0.8979591836734694
Precision: 0.9593023255813954
Recall: 0.8439897698209718


# SGD Classifier


---



In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_fscore_support
model = []
clfs = []
for i in range(3):
  print("For Bag of Words Test Dataset",i,":")
  model.append(SGDClassifier(max_iter=100))
  clfs.append(GridSearchCV(model[i], param_grid = {'penalty' : ['l1','l2']},scoring='accuracy', cv=10))
  clfs[i].fit(np_feature_vector_bow_train_datasets[i],np_train_data_set_op[i])
  print("Tuned Hyperparameters :", clfs[i].best_params_)
  y_pred = clfs[i].predict(np_feature_vector_bow_test_datasets[i])
  print("Accuracy:",clfs[i].score(np_feature_vector_bow_test_datasets[i], np_test_data_set_op[i]))
  print(precision_recall_fscore_support(np_test_data_set_op[i],y_pred,average='macro'))

For Bag of Words Test Dataset 0 :
Tuned Hyperparameters : {'penalty': 'l1'}
Accuracy: 0.9473684210526315
(0.9331853682028518, 0.952276851102901, 0.9415384615384614, None)
For Bag of Words Test Dataset 1 :
Tuned Hyperparameters : {'penalty': 'l2'}
Accuracy: 0.9288702928870293
(0.9101900972590629, 0.9101900972590629, 0.9101900972590629, None)
For Bag of Words Test Dataset 2 :
Tuned Hyperparameters : {'penalty': 'l1'}
Accuracy: 0.9779005524861878
(0.9805242272347536, 0.9645477184008615, 0.972134793020869, None)


In [None]:
model = []
clfs = []
for i in range(3):
  print("For Bernoulli Test Dataset",i,":")
  model.append(SGDClassifier(max_iter=100))
  clfs.append(GridSearchCV(model[i], param_grid = {'penalty' : ['l1','l2']},scoring='accuracy', cv=10))
  clfs[i].fit(np_feature_vector_bernoulli_train_datasets[i],np_train_data_set_op[i])
  print("Tuned Hyperparameters :", clfs[i].best_params_)
  y_pred = clfs[i].predict(np_feature_vector_bernoulli_test_datasets[i])
  print("Accuracy:",clfs[i].score(np_feature_vector_bernoulli_test_datasets[i], np_test_data_set_op[i]))
  print(precision_recall_fscore_support(np_test_data_set_op[i],y_pred,average='macro'))

For Bernoulli Test Dataset 0 :
Tuned Hyperparameters : {'penalty': 'l2'}
Accuracy: 0.9407894736842105
(0.9320915032679739, 0.9335745359945784, 0.9328266598285748, None)
For Bernoulli Test Dataset 1 :
Tuned Hyperparameters : {'penalty': 'l2'}
Accuracy: 0.9581589958158996
(0.9557469147172591, 0.9375331564986737, 0.9461129148629148, None)
For Bernoulli Test Dataset 2 :
Tuned Hyperparameters : {'penalty': 'l1'}
Accuracy: 0.9631675874769797
(0.9577830995552514, 0.9502961367613407, 0.9539440203562339, None)
