# Logistic Regression for POS tagging (with hyperparameter tuning)

In [16]:
# Loading Packages
import os, random, datetime
import numpy as np
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV

# automatic module reloading
%load_ext autoreload
%autoreload 2

# For reproducibility
np.random.seed(3) 
random.seed(3)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# add parent directory to path for imports to work
import sys
sys.path.append('..')

# src imports
from src.utils import get_root_dir
from src.parser import format_data, embeddings_init
from src.data_helpers import vectorize, preprocess_unlabelled_test_data

In [3]:
# dataset helper functions 
def dataset_init():
    """Locate the train/test dataset files and verify that all of them exist.

    The files are looked up in the ``dataset`` directory under the path
    returned by ``get_root_dir()``.

    Returns:
        tuple: ``(train_data_path, test_data_path, labelled_test_data_path)``.

    Raises:
        FileNotFoundError: if any of the three files does not exist. The
            message names every missing path so the problem can be fixed
            without re-deriving the paths by hand.
    """
    dataset_path = os.path.join(get_root_dir(), 'dataset')
    train_data_path = os.path.join(dataset_path, "train.txt")
    test_data_path = os.path.join(dataset_path, "test.txt")
    labelled_test_data_path = os.path.join(dataset_path, "test_labelled.txt")

    required_paths = (train_data_path, test_data_path, labelled_test_data_path)
    missing_paths = [path for path in required_paths if not os.path.exists(path)]
    if missing_paths:
        raise FileNotFoundError(
            "Check dataset paths! Missing: {}".format(", ".join(missing_paths)))
    return train_data_path, test_data_path, labelled_test_data_path

In [4]:
# training/testing helper functions
def test_set_predictions(model, preprocessed_test_data, test_sentences):
    arg_max_dict = []
    for sentence in preprocessed_test_data:
        predict_x = model.predict(sentence)
        # predict_x = np.argmax(predict_x, axis=0)
        arg_max_dict.append(predict_x)

    predicted_data = []
    for index in range(len(test_sentences)):
        predicted_sen = list(zip(test_sentences[index], arg_max_dict[index]))
        predicted_data.append(predicted_sen)

    return predicted_data, arg_max_dict

def compare_with_test_set(predicted_data, correct_set):
    """Compute token-level tagging accuracy as a percentage.

    Sentences, and the tokens inside them, are compared pairwise by position.
    Each token is expected to be indexable with the tag at index 1
    (e.g. ``(word, tag)``).

    Args:
        predicted_data: iterable of predicted sentences.
        correct_set: iterable of reference sentences, in the same order.

    Returns:
        float: percentage (0-100) of compared tokens whose tags match, or
        ``0.0`` when there are no tokens to compare (instead of raising
        ``ZeroDivisionError``).
    """
    total = 0
    correct = 0
    # zip() stops at the shorter input, so only positions present on both
    # sides are counted.
    for predicted_sentence, correct_sentence in zip(predicted_data, correct_set):
        for predicted_word, correct_word in zip(predicted_sentence, correct_sentence):
            total += 1
            if predicted_word[1] == correct_word[1]:
                correct += 1

    if total == 0:
        # Nothing was compared: report zero accuracy rather than dividing by zero.
        return 0.0
    return (correct / total) * 100

### Embeddings

In [12]:
# Initialize Embeddings
# The embeddings are loaded from the same 'dataset' directory that holds the
# train/test files. The load is timed; the recorded run below took ~477 s.
embeddings_path = os.path.join(get_root_dir(), 'dataset')
t_ini = datetime.datetime.now()
print('Initializing embeddings')
embeddings = embeddings_init(str(embeddings_path))
# NOTE(review): this prints the absolute local path into the saved output.
print(str(embeddings_path))
print('Initialiation completed')
t_fin = datetime.datetime.now()
print('Embeddings loaded in {} seconds'.format((t_fin - t_ini).total_seconds()))

# Global constants for feature engineering
# Size of the second axis of the embedding matrix (presumably the embedding
# dimensionality -- confirm against embeddings_init).
dim = embeddings.vectors.shape[1]
pad = np.zeros(dim)  # Pad vector
# Drawn from NumPy's global RNG, so the value depends on the seed set in the
# first cell and on how many random draws happened before this line.
oov = np.random.uniform(-0.25, 0.25, dim)  # Out-of-vocabulary vector

Initializing embeddings
inside embeddings_init
inside embeddings_init
D:\Salika\Masters\PennStateMSCSE\Coursework\Spring-2023\CSE582\P1\CSE-582\dataset
Initialiation completed
Embeddings loaded in 476.595276 seconds


### Training 

In [24]:
def hyper_tuning(X_train, y_train, scores, estimator, parameters, cv, model_filename='lr-model-ht1.pkl'):
    """Grid-search ``parameters`` for ``estimator`` once per scoring metric.

    For every metric in ``scores`` a ``GridSearchCV`` is fitted and its best
    parameters and per-candidate mean/std test scores are printed. The search
    fitted for the FIRST metric is the one pickled to ``model_filename`` and
    returned.

    Previously the function returned from inside the loop, so any metric after
    the first was silently never tuned. All metrics are now evaluated, while
    the saved file and the return value stay the same as before for any input.

    Args:
        X_train: training features passed to ``GridSearchCV.fit``.
        y_train: training labels passed to ``GridSearchCV.fit``.
        scores: iterable of scoring metrics (e.g. ``['accuracy']``); each one
            is forwarded as the ``scoring`` argument of ``GridSearchCV``.
        estimator: estimator to tune.
        parameters: parameter grid forwarded to ``GridSearchCV``.
        cv: cross-validation strategy forwarded to ``GridSearchCV``.
        model_filename: path the first metric's fitted search is pickled to.

    Returns:
        The fitted ``GridSearchCV`` for the first metric, or ``None`` when
        ``scores`` is empty (nothing is fitted or saved in that case).
    """
    print("# Estimator:",estimator)

    primary_clf = None
    for position, score in enumerate(scores):
        print("# Tuning hyper-parameters for %s" % score)

        # Pass the metric through unchanged instead of formatting it to str.
        clf = GridSearchCV(estimator, parameters, cv=cv, scoring=score)

        print("Initializing training")
        clf.fit(X_train, y_train)
        print("training complete")

        print("Best parameters set found on development set:")
        print(clf.best_params_)

        print("Grid scores on development set:")
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            # Reported as mean (+/- two standard deviations) per candidate.
            print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

        if position == 0:
            # Only the first metric's search is persisted and returned, which
            # keeps the output file and return value backward compatible.
            print('Saving model...')
            with open(model_filename, 'wb') as file:
                pickle.dump(clf, file)
            primary_clf = clf

    return primary_clf
        

In [25]:
def init_training_with_cross_validation(X_train, y_train, filename):
    """Tune a liblinear logistic regression with 5-fold stratified CV.

    Builds the estimator, the fold splitter and the ``C`` grid, delegates the
    search (and the pickling of the result to ``filename``) to
    ``hyper_tuning``, and prints the elapsed wall-clock time.

    Args:
        X_train: training features.
        y_train: training labels.
        filename: path handed to ``hyper_tuning`` as ``model_filename``.

    Returns:
        None. The fitted search is only available through the pickle file.
    """
    started_at = datetime.datetime.now()
    print('Training...')

    # StratifiedKFold splits so that each fold keeps the label distribution of
    # the full training set; no shuffling is requested here.
    folds = StratifiedKFold(n_splits=5)
    base_estimator = LogisticRegression(solver='liblinear', multi_class='auto', random_state=2)

    # Candidate values for C; further metrics such as 'f1_macro', 'precision'
    # or 'recall' can be appended to the metric list.
    param_grid = [{'C': [0.1, 1, 2, 3]}]
    scoring_metrics = ['accuracy']

    hyper_tuning(
        X_train,
        y_train,
        scores=scoring_metrics,
        estimator=base_estimator,
        parameters=param_grid,
        cv=folds,
        model_filename=filename,
    )

    elapsed_seconds = (datetime.datetime.now() - started_at).total_seconds()
    print('Training completed in {} seconds'.format(elapsed_seconds))

In [14]:
# Initialize Datasets
# Resolve the three dataset file paths (raises FileNotFoundError if any is missing).
train_path, test_path, labelled_test_path  = dataset_init()

# Preprocessing on training dataset
train_sentences = format_data(train_path)
# Sanity-check the corpus size: number of sentences and total number of tokens.
print("Tagged sentences in train set: ", len(train_sentences))
print("Tagged words in train set:", len([item for sublist in train_sentences for item in sublist]))
# Turn the tagged sentences into a feature matrix and a label vector, timing the step.
t_ini = datetime.datetime.now()
print('Initializing vectorization...')
# window=1 presumably adds one neighbouring token on each side of the target
# word -- TODO confirm in src.data_helpers.vectorize. The recorded run below
# reports a train shape of (211727, 900).
X_train, y_train = vectorize(embeddings, oov, train_sentences, window=1)
print('Completed vectorization...')
t_fin = datetime.datetime.now()
print('Vectorization completed in {} seconds'.format((t_fin - t_ini).total_seconds()))

Tagged sentences in train set:  8936
Tagged words in train set: 211727
Initializing vectorization...
Embeddings window method
Vectorizing Dataset...
Vectorizing train...
Dataset vectorized.
Train shape: (211727, 900)
Completed vectorization...
Vectorization completed in 21.642678 seconds


In [27]:
# Run the cross-validated grid search on the vectorized training data; the
# fitted search object is written to 'lr-model-ht1.pkl' for the evaluation cells.
init_training_with_cross_validation(X_train, y_train, filename='lr-model-ht1.pkl')

# Estimator: LogisticRegression(random_state=2, solver='liblinear')
# Tuning hyper-parameters for accuracy
Initializing training
training complete
Best parameters set found on development set:
{'C': 3}
Grid scores on development set:
0.936 (+/-0.002) for {'C': 0.1}
0.943 (+/-0.003) for {'C': 1}
0.943 (+/-0.003) for {'C': 2}
0.943 (+/-0.003) for {'C': 3}
Saving model...


### Evaluation

In [28]:
def load_model(model_filename):
    """Read a pickled object from ``model_filename`` and return it.

    Args:
        model_filename: path to a file written with ``pickle.dump``.

    Returns:
        The deserialized object.
    """
    # SECURITY: unpickling can execute arbitrary code -- only load files that
    # this project produced itself.
    with open(model_filename, 'rb') as model_file:
        return pickle.load(model_file)

In [29]:
# Load Model
# Reload the grid-search object that the training cell pickled to disk.
clf = load_model(model_filename='lr-model-ht1.pkl')
# NOTE(review): this rebinds y_train (defined in the vectorization cell) to
# integer-encoded labels, and `le` is not used again in the visible cells.
# The recorded predictions are string tags, so the model appears to have been
# trained on the un-encoded labels -- confirm whether this encoding is needed.
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(y_train)

# Gold-standard tagged test sentences, used only for scoring.
correct_test_sen = format_data(labelled_test_path)
# Second argument presumably marks the file as unlabelled -- TODO confirm in src.parser.
test_sentences = format_data(test_path, False)


# Build the per-sentence model inputs from the unlabelled test sentences.
preprocessed_test_data = preprocess_unlabelled_test_data(embeddings, oov, test_sentences)
# compute predictions for the test data
predicted_data, arg_max_dict = test_set_predictions(clf, preprocessed_test_data, test_sentences)

In [30]:
# Results
# Token-level accuracy (in percent) of the predictions against the labelled test set.
print(compare_with_test_set(predicted_data, correct_test_sen))  # Accuracy = 94
# Show the first two predicted sentences as (token, tag) pairs.
print(predicted_data[:2])

# Persist all predictions as the str() of the nested list (a Python-literal
# style dump, not a structured format).
with open('output-ht.txt', 'w') as f:
    f.write(str(predicted_data))

94.45300462249615
[[('rockwell', 'NNP'), ('international', 'NNP'), ('corp.', 'NNP'), ("'s", 'POS'), ('tulsa', 'NNS'), ('unit', 'NNP'), ('said', 'VBD'), ('it', 'PRP'), ('signed', 'VBD'), ('a', 'DT'), ('tentative', 'JJ'), ('agreement', 'NN'), ('extending', 'VBG'), ('its', 'PRP$'), ('contract', 'NN'), ('with', 'IN'), ('boeing', 'NNP'), ('co.', 'NNP'), ('to', 'TO'), ('provide', 'VB'), ('structural', 'JJ'), ('parts', 'NNS'), ('for', 'IN'), ('boeing', 'NNP'), ("'s", 'POS'), ('747', 'NN'), ('jetliners', 'NNS'), ('.', '.')], [('rockwell', 'NNP'), ('said', 'VBD'), ('the', 'DT'), ('agreement', 'NN'), ('calls', 'VBZ'), ('for', 'IN'), ('it', 'PRP'), ('to', 'TO'), ('supply', 'VB'), ('200', 'CD'), ('additional', 'JJ'), ('so-called', 'JJ'), ('shipsets', '``'), ('for', 'IN'), ('the', 'DT'), ('planes', 'NNS'), ('.', '.')]]
