# In [8]:
import glob
import os
from time import time
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn import metrics,svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score


# Display names for the 23 category labels, "1" through "23", in label order.
target_names = [str(number) for number in range(1, 24)]

def get_files_in_dir(directory):
    """Return the paths matching the glob pattern ``directory``, sorted.

    Despite its name, ``directory`` is handed straight to ``glob.glob``, so
    it is a glob pattern (for example ``"some_dir/*"``), not a bare directory.

    ``glob`` returns matches in arbitrary, platform-dependent order. The
    result is sorted so that anything order-sensitive downstream (such as a
    seeded train/test split) sees the same sequence on every run.

    Args:
        directory: glob pattern selecting the files to list.

    Returns:
        A new list of matching path strings in ascending order; empty when
        nothing matches.
    """
    return sorted(glob.glob(directory))


def get_text_file_data(file):
    """Read the text file at path ``file`` and return its lines as a list.

    The file is opened in text mode and decoded as latin-1. Each list element
    is one line, with its line ending (if any) still attached. An empty file
    yields an empty list.
    """
    with open(file, 'r', encoding="latin-1") as handle:
        return handle.readlines()


def micro_macro_average(test_labels, test_pred):
    """Print micro- and macro-averaged precision and recall.

    Args:
        test_labels: true class labels.
        test_pred: predicted class labels, in the same order as
            ``test_labels``.

    Returns:
        None. The four scores are written to stdout, each formatted to two
        decimal places.
    """
    print("Precision_micro-average:   %0.2f" % precision_score(test_labels, test_pred, average='micro'))
    print("Precision_macro-average:   %0.2f" % precision_score(test_labels, test_pred, average='macro'))
    print("Recall_micro-average:   %0.2f" % recall_score(test_labels, test_pred, average='micro'))
    print("Recall_macro-average:   %0.2f" % recall_score(test_labels, test_pred, average='macro'))



def test_training_data():
    """Load the corpus and return vectorized train/test splits.

    Reads every file matched by ``ohsumed-all/C01/*`` through
    ``ohsumed-all/C23/*``, joins each file's lines into one document and
    labels it with its category number (1-23). The documents are split with
    35% held out for testing (``random_state=10``), converted to TF-IDF
    vectors, and reduced to the top 5% of features as scored by
    ``f_classif``. The vectorizer and the selector are both fitted on the
    training split only and then applied to the test split.

    Returns:
        Tuple ``(features_train, features_test, labels_train, labels_test)``.
    """
    documents = []
    categories = []
    for category in range(1, 24):
        # Category folders are named C01 ... C23.
        pattern = os.path.join('ohsumed-all', 'C%.2d' % category, '*')
        for path in get_files_in_dir(pattern):
            documents.append("".join(get_text_file_data(path)))
            categories.append(category)

    split = train_test_split(documents, categories, test_size=0.35, random_state=10)
    train_docs, test_docs, labels_train, labels_test = split

    # TF-IDF weighting (labelled "ntc" by the original author).
    tfidf = TfidfVectorizer(use_idf=True, smooth_idf=False, max_df=0.5, stop_words='english')
    train_matrix = tfidf.fit_transform(train_docs)
    test_matrix = tfidf.transform(test_docs)

    # Feature selection: keep the best 5% of features, chosen on training data.
    chooser = SelectPercentile(f_classif, percentile=5)
    chooser.fit(train_matrix, labels_train)
    train_selected = chooser.transform(train_matrix)
    test_selected = chooser.transform(test_matrix)
    return train_selected, test_selected, labels_train, labels_test


# MultinomialNB
def multinomial_nb():
    """Train and evaluate a multinomial naive Bayes classifier.

    Gets the feature matrices and labels from ``test_training_data()``, fits
    ``MultinomialNB`` on the training split and predicts the test split.
    Prints the wall-clock fit and prediction times, the accuracy, the
    per-class classification report and the micro/macro averaged precision
    and recall. Returns None.
    """
    features_train, features_test, labels_train, labels_test = test_training_data()

    t0 = time()
    classifier = MultinomialNB()
    classifier.fit(features_train, labels_train)
    print("train time: %0.3fs" % (time() - t0))

    t0 = time()
    test_pred = classifier.predict(features_test)
    print("test time:  %0.3fs" % (time() - t0))

    print("accuracy:   %0.3f" % metrics.accuracy_score(labels_test, test_pred))
    print(metrics.classification_report(labels_test, test_pred,
                                        target_names=target_names))

    # micro_macro_average prints its own results and returns None, so it is
    # called directly; wrapping it in print() emitted a stray "None" line.
    micro_macro_average(labels_test, test_pred)

# BernoulliNB
def bernoulli_nb():
    """Train and evaluate a Bernoulli naive Bayes classifier.

    Gets the feature matrices and labels from ``test_training_data()``, fits
    ``BernoulliNB`` on the training split and predicts the test split.
    Prints the wall-clock fit and prediction times, the accuracy, the
    per-class classification report and the micro/macro averaged precision
    and recall. Returns None.
    """
    features_train, features_test, labels_train, labels_test = test_training_data()

    t0 = time()
    classifier = BernoulliNB()
    classifier.fit(features_train, labels_train)
    print("train time: %0.3fs" % (time() - t0))

    t0 = time()
    test_pred = classifier.predict(features_test)
    print("test time:  %0.3fs" % (time() - t0))

    print("accuracy:   %0.3f" % metrics.accuracy_score(labels_test, test_pred))
    print(metrics.classification_report(labels_test, test_pred,
                                        target_names=target_names))

    # micro_macro_average prints its own results and returns None, so it is
    # called directly; wrapping it in print() emitted a stray "None" line.
    micro_macro_average(labels_test, test_pred)

#Knn with k=1
def knn1_classifier():
    """Train and evaluate a k-nearest-neighbours classifier with k=1.

    Gets the feature matrices and labels from ``test_training_data()``, fits
    ``KNeighborsClassifier(n_neighbors=1)`` on the training split and
    predicts the test split. Prints the wall-clock fit and prediction times,
    the accuracy, the per-class classification report and the micro/macro
    averaged precision and recall. Returns None.
    """
    features_train, features_test, labels_train, labels_test = test_training_data()

    t0 = time()
    knn = KNeighborsClassifier(n_neighbors=1)
    knn.fit(features_train, labels_train)
    print("train time: %0.3fs" % (time() - t0))

    t0 = time()
    test_pred = knn.predict(features_test)
    print("test time:  %0.3fs" % (time() - t0))

    print("accuracy:   %0.3f" % metrics.accuracy_score(labels_test, test_pred))
    print(metrics.classification_report(labels_test, test_pred,
                                        target_names=target_names))

    # micro_macro_average prints its own results and returns None, so it is
    # called directly; wrapping it in print() emitted a stray "None" line.
    micro_macro_average(labels_test, test_pred)


#Knn with k=3
def knn3_classifier():
    """Train and evaluate a k-nearest-neighbours classifier with k=3.

    Gets the feature matrices and labels from ``test_training_data()``, fits
    ``KNeighborsClassifier(n_neighbors=3)`` on the training split and
    predicts the test split. Prints the wall-clock fit and prediction times,
    the accuracy, the per-class classification report and the micro/macro
    averaged precision and recall. Returns None.
    """
    features_train, features_test, labels_train, labels_test = test_training_data()

    t0 = time()
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(features_train, labels_train)
    print("train time: %0.3fs" % (time() - t0))

    t0 = time()
    test_pred = knn.predict(features_test)
    print("test time:  %0.3fs" % (time() - t0))

    print("accuracy:   %0.3f" % metrics.accuracy_score(labels_test, test_pred))
    print(metrics.classification_report(labels_test, test_pred,
                                        target_names=target_names))

    # micro_macro_average prints its own results and returns None, so it is
    # called directly; wrapping it in print() emitted a stray "None" line.
    micro_macro_average(labels_test, test_pred)


#knn with k=5
def knn5_classifier():
    """Train and evaluate a k-nearest-neighbours classifier with k=5.

    Gets the feature matrices and labels from ``test_training_data()``, fits
    ``KNeighborsClassifier(n_neighbors=5)`` on the training split and
    predicts the test split. Prints the wall-clock fit and prediction times,
    the accuracy, the per-class classification report and the micro/macro
    averaged precision and recall. Returns None.
    """
    features_train, features_test, labels_train, labels_test = test_training_data()

    t0 = time()
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(features_train, labels_train)
    print("train time: %0.3fs" % (time() - t0))

    t0 = time()
    test_pred = knn.predict(features_test)
    print("test time:  %0.3fs" % (time() - t0))

    print("accuracy:   %0.3f" % metrics.accuracy_score(labels_test, test_pred))
    print(metrics.classification_report(labels_test, test_pred,
                                        target_names=target_names))

    # micro_macro_average prints its own results and returns None, so it is
    # called directly; wrapping it in print() emitted a stray "None" line.
    micro_macro_average(labels_test, test_pred)


#SVM with linear kernel
def linear_svm():
    """Train and evaluate a support vector classifier with a linear kernel.

    Gets the feature matrices and labels from ``test_training_data()``, fits
    ``svm.SVC(kernel='linear', C=1)`` on the training split and predicts the
    test split. Prints the wall-clock fit and prediction times, the accuracy,
    the per-class classification report and the micro/macro averaged
    precision and recall. Returns None.
    """
    features_train, features_test, labels_train, labels_test = test_training_data()

    t0 = time()
    svc = svm.SVC(kernel='linear', C=1)
    svc.fit(features_train, labels_train)
    print("train time: %0.3fs" % (time() - t0))

    t0 = time()
    test_pred = svc.predict(features_test)
    print("test time:  %0.3fs" % (time() - t0))

    print("accuracy:   %0.3f" % metrics.accuracy_score(labels_test, test_pred))
    print(metrics.classification_report(labels_test, test_pred,
                                        target_names=target_names))

    # micro_macro_average prints its own results and returns None, so it is
    # called directly; wrapping it in print() emitted a stray "None" line.
    micro_macro_average(labels_test, test_pred)


# SVM with gaussian kernel
def rbf_svm():
    """Train and evaluate a support vector classifier with an RBF kernel.

    Gets the feature matrices and labels from ``test_training_data()``, fits
    ``svm.SVC(kernel='rbf', C=0.4)`` on the training split and predicts the
    test split. Prints the wall-clock fit and prediction times, the accuracy,
    the per-class classification report and the micro/macro averaged
    precision and recall. Returns None.
    """
    features_train, features_test, labels_train, labels_test = test_training_data()

    t0 = time()
    svc = svm.SVC(kernel='rbf', C=0.4)
    svc.fit(features_train, labels_train)
    print("train time: %0.3fs" % (time() - t0))

    t0 = time()
    test_pred = svc.predict(features_test)
    print("test time:  %0.3fs" % (time() - t0))

    print("accuracy:   %0.3f" % metrics.accuracy_score(labels_test, test_pred))
    print(metrics.classification_report(labels_test, test_pred,
                                        target_names=target_names))

    # micro_macro_average prints its own results and returns None, so it is
    # called directly; wrapping it in print() emitted a stray "None" line.
    micro_macro_average(labels_test, test_pred)



# Experiment driver: each function below loads the corpus via
# test_training_data(), trains one classifier and prints its timings and
# metrics. Only the uncommented call runs; uncomment others to run them too.

multinomial_nb()
# bernoulli_nb()
# knn1_classifier()
# knn3_classifier()
# knn5_classifier()
# linear_svm()
# rbf_svm()

samira   (0, 42787)	0.0756866790121593
  (0, 17656)	0.052107322261785244
  (0, 34626)	0.06296860692073819
  (0, 50256)	0.06508748560359351
  (0, 14842)	0.06232012004548747
  (0, 45022)	0.04576779456588032
  (0, 33470)	0.0614172967952831
  (0, 17392)	0.04773445123264238
  (0, 48685)	0.07562464385443352
  (0, 21394)	0.0640757987289428
  (0, 55698)	0.059454384807373285
  (0, 50646)	0.05362094721928437
  (0, 39027)	0.0351863503077323
  (0, 15964)	0.08019600540204529
  (0, 6819)	0.0361411763425358
  (0, 2834)	0.0701814508257512
  (0, 2597)	0.06790274670175973
  (0, 5150)	0.09232321746469292
  (0, 52945)	0.04207227627750221
  (0, 50202)	0.06719055677234913
  (0, 12418)	0.051005351330109035
  (0, 54929)	0.05796784822637559
  (0, 51726)	0.04115540088634516
  (0, 5715)	0.07189743497339073
  (0, 7554)	0.06770808453903296
  :	:
  (0, 26518)	0.10941718893359764
  (0, 39035)	0.06682284633846308
  (0, 10082)	0.11773020586257094
  (0, 3010)	0.13645449233016066
  (0, 3651)	0.08192974695696721
  (0, 48

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
