# Part of Speech Tagger:

### By: Sabah Ibrahim

#### We will implement the tagger using two machine learning models that we are familiar with: a Naïve Bayes classifier and a Decision Tree classifier. We will then assess the models using precision, recall, accuracy, and F-measure.

In [5]:
!pip install sklearn

Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Collecting scikit-learn
  Downloading scikit_learn-0.24.1-cp39-cp39-macosx_10_13_x86_64.whl (7.3 MB)
[K     |████████████████████████████████| 7.3 MB 8.4 MB/s eta 0:00:01
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.1.0-py3-none-any.whl (12 kB)
Collecting joblib>=0.11
  Downloading joblib-1.0.1-py3-none-any.whl (303 kB)
[K     |████████████████████████████████| 303 kB 8.3 MB/s eta 0:00:01
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25ldone
[?25h  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=1316 sha256=7a47303d881d0c15b4f477b56fac80f5aa1621448409dc3f8720f6807a8d93b9
  Stored in directory: /Users/sabahibrahim/Library/Caches/pip/wheels/e4/7b/98/b6466d71b8d738a0c547008b9eb39bf8676d1ff6ca4b22af1c
Successfully built sklearn
Installing collected packages: threadpoolctl, joblib, scikit-learn, sklearn
Successfully installed jobli

In [13]:
# Import statements
import numpy as np
import pandas as pd
import scipy
import warnings
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
# Silence every warning for the rest of the session: the 'ignore' action is
# installed with no category or module restriction, so it applies to all warnings.
warnings.filterwarnings('ignore')

In [14]:
# Define a function to take as input training and testing vectors and labels
# Allow this to be extensible to let multiple classifiers be used here
def buildClassifiers(clf, X_train, X_test, y_train, y_test):
    """Fit ``clf`` on the training split and score its predictions on the test split.

    Parameters
    ----------
    clf : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, X_test : array-like
        Feature matrices for the training and test splits.
    y_train, y_test : array-like
        Target labels for the training and test splits.

    Returns
    -------
    tuple
        ``(f1, precision, recall, accuracy)``. The first three are macro-averaged
        and computed with ``zero_division=0``.
    """
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)

    # The three macro-averaged metrics share the same keyword arguments, so
    # evaluate them in one pass, in the order they are returned.
    f1, precision, recall = (
        scorer(y_test, predicted, average="macro", zero_division=0)
        for scorer in (f1_score, precision_score, recall_score)
    )

    return f1, precision, recall, accuracy_score(y_test, predicted)

In [15]:
def labelCoder():
    """Load the POS dataset and integer-encode its feature values.

    A single ``LabelEncoder`` is fitted on the flattened feature values, so a
    given value receives the same integer code in every column.

    Returns
    -------
    X : numpy.ndarray
        Encoded features with the same 2-D shape as the feature columns of the
        CSV (every column except ``class``).
    labels : numpy.ndarray
        Category codes of the ``class`` column, one per row.
    """
    df = pd.read_csv('../DATA/pos-eng-5000.data.csv')
    labels = np.asarray(df['class'].astype("category").cat.codes.tolist())
    X_vals = df.drop(columns=['class']).values

    flat_vals = X_vals.ravel()
    le = preprocessing.LabelEncoder()
    le.fit(flat_vals)
    X = le.transform(flat_vals)

    # Restore the original layout from the data itself. A hard-coded row count
    # would raise, or silently misalign rows with `labels`, for any dataset
    # that does not contain exactly that many rows.
    X = X.reshape(X_vals.shape)
    return X, labels

def oneHotcoder():
    """Load the POS dataset and one-hot encode its feature columns.

    Returns
    -------
    X : sparse matrix
        Result of ``OneHotEncoder(handle_unknown='ignore').fit_transform`` on
        every column except ``class``.
    labels : numpy.ndarray
        Category codes of the ``class`` column, one per row.
    """
    frame = pd.read_csv('../DATA/pos-eng-5000.data.csv')

    # Targets: the 'class' column turned into integer category codes.
    class_codes = frame['class'].astype("category").cat.codes.tolist()

    # Features: all remaining columns, expanded into indicator columns.
    feature_values = frame.drop(columns=['class']).values
    encoder = preprocessing.OneHotEncoder(handle_unknown='ignore')

    return encoder.fit_transform(feature_values), np.asarray(class_codes)

In [18]:
# Preview the one-hot encoded feature matrix. It is built here explicitly so
# this cell runs on a fresh kernel instead of relying on an `X` that is only
# assigned by a cell further down the notebook.
X_onehot, _onehot_labels = oneHotcoder()
print(X_onehot)

  (0, 524)	1.0
  (0, 2372)	1.0
  (0, 4303)	1.0
  (0, 6275)	1.0
  (0, 8463)	1.0
  (0, 11208)	1.0
  (0, 12238)	1.0
  (1, 524)	1.0
  (1, 2372)	1.0
  (1, 4243)	1.0
  (1, 6514)	1.0
  (1, 9338)	1.0
  (1, 10408)	1.0
  (1, 11921)	1.0
  (2, 524)	1.0
  (2, 2314)	1.0
  (2, 4483)	1.0
  (2, 7389)	1.0
  (2, 8507)	1.0
  (2, 10077)	1.0
  (2, 12029)	1.0
  (3, 469)	1.0
  (3, 2542)	1.0
  (3, 5358)	1.0
  (3, 6558)	1.0
  :	:
  (4996, 6336)	1.0
  (4996, 8799)	1.0
  (4996, 10077)	1.0
  (4996, 12029)	1.0
  (4997, 1558)	1.0
  (4997, 2960)	1.0
  (4997, 4305)	1.0
  (4997, 6850)	1.0
  (4997, 8154)	1.0
  (4997, 10192)	1.0
  (4997, 12029)	1.0
  (4998, 1092)	1.0
  (4998, 2374)	1.0
  (4998, 4819)	1.0
  (4998, 6180)	1.0
  (4998, 8283)	1.0
  (4998, 10192)	1.0
  (4998, 12029)	1.0
  (4999, 524)	1.0
  (4999, 2372)	1.0
  (4999, 4303)	1.0
  (4999, 6057)	1.0
  (4999, 8648)	1.0
  (4999, 11264)	1.0
  (4999, 13246)	1.0


In [16]:
# Construct the classifiers at hand prior to folding the data through them
def _crossValidate(clf, X, labels, n_splits=5):
    """Run unshuffled K-fold cross-validation and return the mean scores.

    Returns ``(f1, precision, recall, accuracy)``, each averaged over the folds.
    """
    f1_scores, precision_scores, recall_scores, accuracy_scores = [], [], [], []
    for train_index, test_index in KFold(n_splits=n_splits).split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = labels[train_index], labels[test_index]

        # The one-hot features arrive as a sparse matrix; densify one fold at a
        # time before handing them to the classifier. Dense arrays pass through.
        if hasattr(X_train, "toarray"):
            X_train, X_test = X_train.toarray(), X_test.toarray()

        f1, precision, recall, accuracy = buildClassifiers(clf, X_train, X_test, y_train, y_test)
        f1_scores.append(f1)
        precision_scores.append(precision)
        recall_scores.append(recall)
        accuracy_scores.append(accuracy)

    return (np.mean(f1_scores), np.mean(precision_scores),
            np.mean(recall_scores), np.mean(accuracy_scores))


def _reportEncoding(title, names, classifiers, X, labels):
    """Print the 5-fold average metrics of every classifier for one encoding."""
    print(title)
    for name, clf in zip(names, classifiers):
        print('\tNow classifying', name)

        # Fold the data 5 times
        f1, precision, recall, accuracy = _crossValidate(clf, X, labels, n_splits=5)

        print("\t\tAverage F1 for {}:\t\t\t".format(name), f1)
        print("\t\tAverage Precision for {}:\t\t".format(name), precision)
        # The longer 'Decision_Tree' name needs one tab fewer to keep columns aligned.
        recall_label = ("\t\tAverage Recall for {}:\t\t" if name == 'Decision_Tree'
                        else "\t\tAverage Recall for {}:\t\t\t")
        print(recall_label.format(name), recall)
        print("\t\tAverage Accuracy for {}:\t\t".format(name), accuracy)


names = ['Naive_Bayes', 'Decision_Tree']

X, labels = labelCoder()
# Fresh estimators per encoding so no fitted state carries over between runs.
classifiers = [GaussianNB(),
               DecisionTreeClassifier(random_state=0)]
_reportEncoding("Using Label Encoder to encode data", names, classifiers, X, labels)

print('\n')
X, labels = oneHotcoder()
classifiers = [GaussianNB(),
               DecisionTreeClassifier(random_state=0)]
_reportEncoding("Using One Hot Encoder to encode data", names, classifiers, X, labels)

Using Label Encoder to encode data
	Now classifying Naive_Bayes
		Average F1 for Naive_Bayes:			 0.1971408187484808
		Average Precision for Naive_Bayes:		 0.2093715178546595
		Average Recall for Naive_Bayes:			 0.22279109863401877
		Average Accuracy for Naive_Bayes:		 0.3302
	Now classifying Decision_Tree
		Average F1 for Decision_Tree:			 0.5290256210488556
		Average Precision for Decision_Tree:		 0.5412105896978388
		Average Recall for Decision_Tree:		 0.5529607536411726
		Average Accuracy for Decision_Tree:		 0.6068


Using One Hot Encoder to encode data
	Now classifying Naive_Bayes
		Average F1 for Naive_Bayes:			 0.3319224059618128
		Average Precision for Naive_Bayes:		 0.35597903480660686
		Average Recall for Naive_Bayes:			 0.35709881540400923
		Average Accuracy for Naive_Bayes:		 0.479
	Now classifying Decision_Tree
		Average F1 for Decision_Tree:			 0.5701112958599887
		Average Precision for Decision_Tree:		 0.6320625345485358
		Average Recall for Decision_Tree:		 0.5497331072