<a href="https://colab.research.google.com/github/sabih411/POS-Tagging/blob/main/POS_TAGGING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Loading Libraries

In [None]:
import numpy as np
import random
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from gensim.models import FastText
import pickle
from sklearn.metrics import accuracy_score

# Data Preprocessing

## Reading Train Data

In [None]:
!gzip -d /content/train.txt.gz

In [None]:
# Parse the training file: each non-blank line is space-separated with the
# word in the first field and its tag in the second; a blank line ends a
# sentence.
DATA = []
word_pos_pair = []
with open("train.txt", "r") as f_open:
    for raw_line in f_open:
        stripped = raw_line.strip()
        if not stripped:
            # Sentence boundary: store what has been collected so far, if any.
            if word_pos_pair:
                DATA.append(word_pos_pair)
                word_pos_pair = []
            continue
        fields = stripped.split(" ")
        word_pos_pair.append((fields[0], fields[1]))

# The file may end without a trailing blank line; keep the last sentence.
if word_pos_pair:
    DATA.append(word_pos_pair)

# Shuffle whole sentences in place, then keep the first 80% for training and
# the remainder for evaluation.
random.shuffle(DATA)
split_index = int(0.8 * len(DATA))
Data_train, Data_test = DATA[:split_index], DATA[split_index:]
print("total data sentences", len(DATA))

###################################
#FAST CHECK MODEL DATA (VECT_TRAIN/VECT_TEST)
##################################
# Word-only view of every sentence (tags dropped), one list of words per
# sentence.
vect_train = [[pair[0] for pair in sentence] for sentence in Data_train]
vect_test = [[pair[0] for pair in sentence] for sentence in Data_test]

total data sentences 8936


## Customized feature Engineering

In [None]:
def get_feature(token, token_index, sent):
    """Build the hand-crafted feature dict for one token in its sentence.

    Args:
        token: The word being described (expected to be non-empty, since its
            first and last characters are read).
        token_index: Position of ``token`` inside ``sent``.
        sent: The sentence as a sequence of items whose element ``[0]`` is the
            word (e.g. ``(word, tag)`` pairs).

    Returns:
        A dict mapping feature names to string or boolean values. Context
        features that fall outside the sentence are the empty string.
    """
    # Compare the previous *word* (not the whole (word, tag) item) against the
    # articles, case-insensitively, so every capitalisation is covered.
    articles = {'the', 'a', 'an'}
    last_index = len(sent) - 1
    prev_token = '' if token_index == 0 else sent[token_index - 1][0]

    token_feature = {
        'token'             : token,
        'is_first'          : token_index == 0,
        'is_last'           : token_index == last_index,

        'is_capitalized'    : token[0].upper() == token[0],
        'is_all_capitalized': token.upper() == token,
        'is_capitals_inside': token[1:].lower() != token[1:],
        'is_numeric'        : token.isdigit(),

        'prefix-1'          : token[0],
        'prefix-2'          : '' if len(token) < 2 else token[:2],
        'suffix-1'          : token[-1],
        'suffix-2'          : '' if len(token) < 2 else token[-2:],

        'prev-token'        : prev_token,
        '2-prev-token'      : '' if token_index <= 1 else sent[token_index - 2][0],

        # Kept as '' (not False) for the first token, matching the other
        # out-of-sentence context features.
        'is prev-token the-an-a' : '' if token_index == 0 else prev_token.lower() in articles,
        'next-token'        : '' if token_index == last_index else sent[token_index + 1][0],
        '2-next-token'      : '' if token_index >= len(sent) - 2 else sent[token_index + 2][0],
    }
    return token_feature

def get_feature_word(token, token_index, sent):
    """Return context-free, character-level features for a single token.

    Only ``token`` is inspected; ``token_index`` and ``sent`` are accepted so
    the signature matches ``get_feature`` but are not used.

    Returns:
        A dict with the token length, its character shape (lowercase letter ->
        'x', uppercase letter -> 'X', digit -> 'd', anything else unchanged),
        whether it contains a hyphen, and its vowel and consonant counts.
    """
    def _shape_of(ch):
        # Map one character to its shape symbol.
        if ch.isalpha():
            if ch.islower():
                return 'x'
            if ch.isupper():
                return 'X'
        if ch.isdigit():
            return 'd'
        return ch

    shape = ''.join(_shape_of(ch) for ch in token)
    # Summing booleans yields plain int counts.
    vowel_total = sum(ch.lower() in 'aeiou' for ch in token)
    consonant_total = sum(
        ch.isalpha() and ch.lower() not in 'aeiou' for ch in token
    )

    return {
        'token_length': len(token),
        'word_shape': shape,
        'has_hyphen': '-' in token,
        'num_vowels': vowel_total,
        'num_consonants': consonant_total,
    }

def form_data(all_sentences):
    """Flatten sentences into parallel per-token feature and label lists.

    Args:
        all_sentences: Iterable of sentences, each a sequence of items whose
            element ``[0]`` is the word and element ``[1]`` is its label.

    Returns:
        ``(features, pos_labels)``: one ``get_feature`` dict and one label per
        token, in sentence order.
    """
    features = []
    pos_labels = []
    for sent in all_sentences:
        features.extend(
            get_feature(pair[0], position, sent)
            for position, pair in enumerate(sent)
        )
        pos_labels.extend(pair[1] for pair in sent)
    return features, pos_labels


def form_data_just_words(all_sentences):
    """Flatten sentences into per-token word-only features and labels.

    Args:
        all_sentences: Iterable of sentences, each a sequence of items whose
            element ``[0]`` is the word and element ``[1]`` is its label.

    Returns:
        ``(features, pos_labels)``: one ``get_feature_word`` dict and one label
        per token, in sentence order.
    """
    features = []
    pos_labels = []
    for sent in all_sentences:
        features.extend(
            get_feature_word(pair[0], position, sent)
            for position, pair in enumerate(sent)
        )
        pos_labels.extend(pair[1] for pair in sent)
    return features, pos_labels

def form_data_vect(all_sentences, model=None,
                   model_path="/content/drive/MyDrive/POS_TAGS_WEIGHT/word2vec.model"):
    """Flatten sentences into per-token embedding vectors and labels.

    Args:
        all_sentences: Iterable of sentences, each a sequence of items whose
            element ``[0]`` is the word and element ``[1]`` is its label.
        model: Optional already-loaded embedding model exposing ``.wv[word]``.
            Pass it to avoid reloading the model from disk on every call.
        model_path: Where to load the FastText model from when ``model`` is
            not supplied. Defaults to the path this function always used.

    Returns:
        ``(features, pos_labels)``: one ``model.wv[word]`` lookup result and
        one label per token, in sentence order.
    """
    if model is None:
        model = FastText.load(model_path)

    word_vectors = model.wv
    features = []
    pos_labels = []
    for sent in all_sentences:
        for token_pair in sent:
            features.append(word_vectors[token_pair[0]])
            pos_labels.append(token_pair[1])

    return features, pos_labels

#UNCOMMENT TO TRAIN GENSIM VECTORIZER
# model =  FastText(vect_train, vector_size=50, window=3, min_count=1, sg=1)
# model.save("/content/drive/MyDrive/POS_TAGS_WEIGHT/word2vec.model")


## Loading Models

In [None]:
##################
#MAKING THE DATA FRAME
##################
# Each form_data* helper returns a (features, labels) pair for the training
# sentences.
big_f = form_data(Data_train)
small_f = form_data_just_words(Data_train)
vect_f = form_data_vect(Data_train)

# Fit each DictVectorizer on its training feature dicts and encode them in the
# same step; the fitted vectorizers are reused later to encode unseen data.
vectorizer_big_f = DictVectorizer()
vectorized_features_big_f = vectorizer_big_f.fit_transform(big_f[0])

vectorizer_small_f = DictVectorizer()
vectorized_features_small_f = vectorizer_small_f.fit_transform(small_f[0])

##############################
#INITIALIZIING MODELS
#############################
# Three classifiers for the full feature set ...
model_lr = LogisticRegression(max_iter=100)
model_svm = SVC(max_iter=300, probability=True)
model_nb = MultinomialNB()

# ... one for the word-only feature set ...
model_nb_small = MultinomialNB()

# ... and one for the embedding vectors.
model_lr_vect = LogisticRegression(max_iter=100)

#Training

In [None]:
##################################
#TRAINING
##################################
# Fit every classifier on the feature representation it was created for,
# in the order model1 .. model5.
for _estimator, _train_X, _train_y in (
    (model_lr, vectorized_features_big_f, big_f[1]),             #model1
    (model_svm, vectorized_features_big_f, big_f[1]),            #model2
    (model_nb, vectorized_features_big_f, big_f[1]),             #model3
    (model_nb_small, vectorized_features_small_f, small_f[1]),   #model4
    (model_lr_vect, vect_f[0], vect_f[1]),                       #model5
):
    _estimator.fit(_train_X, _train_y)

## Saving Models

In [None]:
#####################################################
#SAVING MODELS
#####################################################
# Pickle each trained classifier into the current working directory. The
# `with` block guarantees every file is flushed and closed, even if pickling
# fails part-way through.
# NOTE(review): the "LOAD MODELS" cell reads these files from
# /content/drive/MyDrive/POS_TAGS_WEIGHT/ - confirm they are copied there.
for filename, _trained_model in (
    ('model_svm.pickle', model_svm),
    ('model_lr.pickle', model_lr),
    ('model_nb.pickle', model_nb),
    ('model_nb_small.pickle', model_nb_small),
    ('model_lr_vect.pickle', model_lr_vect),
):
    with open(filename, 'wb') as _model_file:
        pickle.dump(_trained_model, _model_file)
################################################################

# Models Evaluation

In [None]:
#####################################
# PREDICTIONS
####################################

# Build (features, labels) pairs for the held-out sentences.
big_f_test = form_data(Data_test)
small_f_test = form_data_just_words(Data_test)
vect_f_test = form_data_vect(Data_test)

# Encode each feature set once with the vectorizer fitted on the training
# data, instead of re-transforming it for every predict/predict_proba call.
_big_X_test = vectorizer_big_f.transform(big_f_test[0])
_small_X_test = vectorizer_small_f.transform(small_f_test[0])

# For every model: hard labels, class probabilities, and accuracy against the
# gold labels of the matching feature set.
predicted_labels_model1 = model_lr.predict(_big_X_test)
predicted_probs_model1 = model_lr.predict_proba(_big_X_test)
acc_score_model1 = accuracy_score(big_f_test[1], predicted_labels_model1)

predicted_labels_model2 = model_svm.predict(_big_X_test)
predicted_probs_model2 = model_svm.predict_proba(_big_X_test)
acc_score_model2 = accuracy_score(big_f_test[1], predicted_labels_model2)

predicted_labels_model3 = model_nb.predict(_big_X_test)
predicted_probs_model3 = model_nb.predict_proba(_big_X_test)
acc_score_model3 = accuracy_score(big_f_test[1], predicted_labels_model3)

predicted_labels_model4 = model_nb_small.predict(_small_X_test)
predicted_probs_model4 = model_nb_small.predict_proba(_small_X_test)
acc_score_model4 = accuracy_score(small_f_test[1], predicted_labels_model4)

predicted_labels_model5 = model_lr_vect.predict(vect_f_test[0])
predicted_probs_model5 = model_lr_vect.predict_proba(vect_f_test[0])
acc_score_model5 = accuracy_score(vect_f_test[1], predicted_labels_model5)

In [None]:
# Show the held-out accuracy of models 1-5 on one line, space-separated.
_accuracy_by_model = (
    acc_score_model1,
    acc_score_model2,
    acc_score_model3,
    acc_score_model4,
    acc_score_model5,
)
print(*_accuracy_by_model)

0.9663499976484974 0.9321591496966561 0.9118186521187038 0.5312514696891314 0.7792409349574378


## Majority Voting Method

In [None]:
#######################################################
#COMBINING ALL MODELS (MAJORITY VOTING)
#######################################################
import numpy as np

final_predictions = []
for predictions in zip(predicted_labels_model1, predicted_labels_model2,
                       predicted_labels_model3, predicted_labels_model4,
                       predicted_labels_model5):
    # np.unique returns the distinct labels in sorted order with their counts;
    # argmax takes the first maximum, so a tie goes to the label that sorts
    # first.
    _labels, _votes = np.unique(np.array(predictions), return_counts=True)
    final_predictions.append(_labels[np.argmax(_votes)])

# Stored under its own name so model 5's individual accuracy
# (acc_score_model5) is not overwritten by the ensemble score.
acc_score_majority_vote = accuracy_score(vect_f_test[1], final_predictions)
print(acc_score_majority_vote)

0.9349104077505526


## Best Probabilistic Difference Method

In [None]:
#######################################################
#COMBINING ALL MODELS (BEST PROBABILISTIC DIFFERENCE)
#######################################################

# For each token, trust the model whose top class probability leads its
# runner-up by the widest margin, and take that model's predicted label.
_label_sets = (predicted_labels_model1, predicted_labels_model2,
               predicted_labels_model3, predicted_labels_model4,
               predicted_labels_model5)
_prob_sets = (predicted_probs_model1, predicted_probs_model2,
              predicted_probs_model3, predicted_probs_model4,
              predicted_probs_model5)

# Sort each row of class probabilities once; the margin is largest minus
# second largest (non-negative because the row is sorted ascending).
_margins = []
for _probs in _prob_sets:
    _ranked = np.sort(np.asarray(_probs), axis=1)
    _margins.append(_ranked[:, -1] - _ranked[:, -2])
model1, model2, model3, model4, model5 = _margins

print(len(model1), len(model2), len(model3), len(model4), len(model5))

# Row m of the stacked array holds model m's margins; argmax over axis 0 picks
# the winning model per token, and ties go to the lowest-numbered model.
_winning_model = np.argmax(np.vstack(_margins), axis=0)
final_predictions = [
    _label_sets[_model_idx][_row]
    for _row, _model_idx in enumerate(_winning_model)
]

# Stored under its own name so model 5's individual accuracy
# (acc_score_model5) is not overwritten by the ensemble score.
acc_score_prob_diff = accuracy_score(vect_f_test[1], final_predictions)
print(acc_score_prob_diff)

0.9459859850444434


#TESTING NEW DATA

## Reading Test Data

In [None]:
# Read the unlabeled file: one token per non-blank line, with a blank line
# marking the end of a sentence.
test_sentences = []
sentence = []
with open("/content/unlabeled_test_test.txt", "r") as f_open:
    for raw_line in f_open:
        word = raw_line.strip()
        if word:
            sentence.append(word)
        elif sentence:
            # Blank line after at least one token: close the sentence.
            test_sentences.append(sentence)
            sentence = []

# Keep the final sentence when the file does not end with a blank line.
if sentence:
    test_sentences.append(sentence)

In [None]:
# Pair every token with the placeholder tag 'NNP' so the unlabeled sentences
# have the same (word, tag) item layout as the training data.
test_sentences_modified = [
    [(word, 'NNP') for word in sentence]
    for sentence in test_sentences
]

## Loading saved Models

In [None]:
#######################################
#LOAD MODELS
#######################################
# Restore the five trained classifiers from their pickle files. No fresh
# estimators are constructed first: they would be replaced immediately by the
# unpickled objects.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickles that you created yourself.
_weights_dir = "/content/drive/MyDrive/POS_TAGS_WEIGHT"


def _load_pickled_model(path):
    """Unpickle and return the object stored at ``path``, closing the file."""
    with open(path, "rb") as model_file:
        return pickle.load(model_file)


model_lr = _load_pickled_model(_weights_dir + "/model_lr.pickle")
model_svm = _load_pickled_model(_weights_dir + "/model_svm.pickle")
model_nb = _load_pickled_model(_weights_dir + "/model_nb.pickle")
model_nb_small = _load_pickled_model(_weights_dir + "/model_nb_small.pickle")
model_lr_vect = _load_pickled_model(_weights_dir + "/model_lr_vect.pickle")

## Final Evaluation

In [None]:
##################################################
#Final Evaluation
#NOTE: run the training cells before this one. The DictVectorizers
#(vectorizer_big_f, vectorizer_small_f) are fitted in the section
#"Loading Models" and are not saved to disk with the models.
##################################################

# Build (features, labels) pairs for the new sentences; the labels here are
# only the 'NNP' placeholders and are not used for scoring.
big_f_test = form_data(test_sentences_modified)
small_f_test = form_data_just_words(test_sentences_modified)
vect_f_test = form_data_vect(test_sentences_modified)

# Encode each feature set once with the vectorizer fitted on the training
# data, instead of re-transforming it for every predict/predict_proba call.
_big_X_new = vectorizer_big_f.transform(big_f_test[0])
_small_X_new = vectorizer_small_f.transform(small_f_test[0])

predicted_labels_model1 = model_lr.predict(_big_X_new)
predicted_probs_model1 = model_lr.predict_proba(_big_X_new)

predicted_labels_model2 = model_svm.predict(_big_X_new)
predicted_probs_model2 = model_svm.predict_proba(_big_X_new)

predicted_labels_model3 = model_nb.predict(_big_X_new)
predicted_probs_model3 = model_nb.predict_proba(_big_X_new)

predicted_labels_model4 = model_nb_small.predict(_small_X_new)
predicted_probs_model4 = model_nb_small.predict_proba(_small_X_new)

predicted_labels_model5 = model_lr_vect.predict(vect_f_test[0])
predicted_probs_model5 = model_lr_vect.predict_proba(vect_f_test[0])

In [None]:
#######################################################
#COMBINING ALL MODELS (BEST PROBABILISTIC DIFFERENCE)
#######################################################

# For each token, trust the model whose top class probability leads its
# runner-up by the widest margin, and take that model's predicted label.
_label_sets = (predicted_labels_model1, predicted_labels_model2,
               predicted_labels_model3, predicted_labels_model4,
               predicted_labels_model5)
_prob_sets = (predicted_probs_model1, predicted_probs_model2,
              predicted_probs_model3, predicted_probs_model4,
              predicted_probs_model5)

# Sort each row of class probabilities once; the margin is largest minus
# second largest (non-negative because the row is sorted ascending).
_margins = []
for _probs in _prob_sets:
    _ranked = np.sort(np.asarray(_probs), axis=1)
    _margins.append(_ranked[:, -1] - _ranked[:, -2])
model1, model2, model3, model4, model5 = _margins

#print(len(model1),len(model2),len(model3),len(model4),len(model5))

# Row m of the stacked array holds model m's margins; argmax over axis 0 picks
# the winning model per token, and ties go to the lowest-numbered model.
_winning_model = np.argmax(np.vstack(_margins), axis=0)
final_predictions = [
    _label_sets[_model_idx][_row]
    for _row, _model_idx in enumerate(_winning_model)
]



## Saving Output File

In [None]:
####################################################
#MAKING OUTPUT FILE
####################################################
# Write one "token tag" line per token and a blank line after each sentence.
# final_predictions is flat (one entry per token, in sentence order), so a
# single iterator is advanced across all sentences. The `with` block closes
# the file even if a write fails.
_predicted_tags = iter(final_predictions)
with open("make.txt", "w") as f:
    for _sentence in test_sentences:
        for _word in _sentence:
            # _word is the same string stored under big_f_test[0][...]['token'].
            f.write(_word + " " + next(_predicted_tags) + "\n")
        f.write("\n")
