In [4]:
import numpy as np
from sklearn import preprocessing
import string
from tabulate import tabulate

LABEL_TYPES = {'company', 'vbd', 'person', 'cc', 'date', 'in', 'amount', 'duration'}

In [5]:
def preprocess_text(text):
    cleaned_text = ''.join([word for word in text if word not in string.punctuation])
    tokens = cleaned_text.split(" ")
    return tokens

In [6]:
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

In [7]:
def generate_feature_functions(label_types):
    feature_functions = [
        lambda position, label1, label2: int(position == 0),
    ]

    for label1 in label_types:
        for label2 in label_types:
            feature_functions.append(lambda position, l1=label1, l2=label2: int(text_train_tags.get(text_train[max(0, position - 1)], '') == l1 and text_train_tags.get(text_train[position], '') == l2))

    return feature_functions

In [8]:
text_train = "Google hired XYZ in 2005, for $100,000 per annum."
text_test = "Apple hired ABC in 1995, for $10 per hour."

text_train = preprocess_text(text_train)
text_test = preprocess_text(text_test)


In [9]:
text_train_tags = {
    'Google': 'company',
    'hired': 'vbd',
    'XYZ': 'person',
    'in': 'cc',
    '2005': 'date',
    'for': 'in',
    '100000': 'amount',
    'per': 'in',
    'annum': 'duration'
}


feature_func = generate_feature_functions(LABEL_TYPES)

In [10]:
weights = np.random.rand(len(feature_func))
np.set_printoptions(suppress=True)
epochs = 5
learning_rate = 0.05

for epoch in range(epochs):
    for token_pos in range(len(text_train)):
        if text_train[token_pos] in text_train_tags:
            features = [f(token_pos, text_train_tags.get(text_train[max(0, token_pos - 1)], ''), text_train_tags.get(text_train[token_pos], '')) for f in feature_func]
            weights += learning_rate * np.multiply(weights, features)

    print(f"Epoch {epoch} Updated weights: ",softmax(weights))

weights = preprocessing.normalize([weights])

Epoch 0 Updated weights:  [0.01657684 0.00768371 0.02303112 0.0091025  0.02000166 0.008214
 0.01742058 0.01599503 0.00966755 0.02145727 0.01273574 0.0147577
 0.0276122  0.01566099 0.00727098 0.01324132 0.02278519 0.01477745
 0.02291874 0.00902008 0.00952149 0.02070504 0.00971179 0.00700216
 0.01653353 0.00766292 0.02481472 0.00783717 0.01338783 0.02162266
 0.00699731 0.02203066 0.01086969 0.02088893 0.01231717 0.01420193
 0.01066792 0.01668393 0.01999609 0.01798903 0.01609942 0.01304213
 0.02189315 0.00914234 0.00687407 0.01320614 0.0242446  0.0264546
 0.00685645 0.00827969 0.01243399 0.01933687 0.00677793 0.01952646
 0.02851719 0.01260005 0.00739855 0.00928296 0.01692183 0.02958445
 0.0101412  0.02863214 0.01021034 0.0215486  0.01962223]
Epoch 1 Updated weights:  [0.01024527 0.00490536 0.0269313  0.00638017 0.02163912 0.00544044
 0.01746445 0.01529802 0.007005   0.02413061 0.01074273 0.01350179
 0.03568453 0.01480526 0.0045027  0.01141151 0.02648649 0.01352982
 0.02672771 0.00629078 0

In [11]:
tags = list(LABEL_TYPES)
pred_dict = {}
prev = 'company'

for token_pos in range(1, len(text_test)):
    probabilities = []
    for tag in tags:
        features = [f(token_pos, prev, tag) for f in feature_func]
        weighted_features = np.multiply(weights, features)
        probability = sum(weighted_features[0])
        probabilities.append(probability)

    probabilities = softmax(probabilities)
    prev = tags[np.argmax(probabilities)]
    pred_dict[text_test[token_pos]] = [round(prob, 3) for prob in probabilities]

In [12]:
table_data = []
for token, probs in pred_dict.items():
    table_data.append([token] + probs )

In [13]:
headers = ["Token"] + tags
table = tabulate(table_data, headers=headers, tablefmt="plain")
print(table)

Token      person    date     in    duration    amount     cc    vbd    company
hired       0.001   0.001  0.001       0.001     0.001  0.001  0.993      0.001
ABC         0.993   0.001  0.001       0.001     0.001  0.001  0.001      0.001
in          0.001   0.001  0.001       0.001     0.001  0.993  0.001      0.001
1995        0.001   0.993  0.001       0.001     0.001  0.001  0.001      0.001
for         0.001   0.001  0.993       0.001     0.001  0.001  0.001      0.001
10          0.001   0.001  0.001       0.001     0.993  0.001  0.001      0.001
per         0.001   0.001  0.993       0.001     0.001  0.001  0.001      0.001
hour        0.001   0.001  0.001       0.993     0.001  0.001  0.001      0.001


In [35]:
import numpy as np
from sklearn import preprocessing
import string
from tabulate import tabulate

LABEL_TYPES = {'company', 'vbd', 'person', 'cc', 'date', 'in', 'amount', 'duration'}

In [36]:
text_train="Google hired XYZ in 2005, for $100,000 per annum."
text_train=''.join([word for word in text_train if word not in string.punctuation]).split(" ")

text_train_tags = {
    'Google' : 'company',
    'hired'  : 'vbd',
    'XYZ'    : 'person',
    'in'     : 'cc',
    '2005'   : 'date',
    'for'    : 'in',
    '100000' : 'amount',
    'per'    : 'in',
    'annum'  : 'duration'
}
text_test = 'Apple hired ABC in 1995, for $10 per hour.'
text_test=''.join([word for word in text_test if word not in string.punctuation]).split(" ")

print(f"Train data : {text_train}\nTest data : {text_test}")

Train data : ['Google', 'hired', 'XYZ', 'in', '2005', 'for', '100000', 'per', 'annum']
Test data : ['Apple', 'hired', 'ABC', 'in', '1995', 'for', '10', 'per', 'hour']


In [37]:
import numpy as np
def softmax(x):
    e_x =np.exp(x-np.max(x))
    return np.round(e_x / e_x.sum(axis=0),3)


feature_func =[
    lambda position , label1, label2 : 1 if position == 0 else 0,
    lambda position , label1, label2 : 1 if label1   == 'company' and label2 == 'vbd'      else 0,
    lambda position , label1, label2 : 1 if label1   == 'vbd'     and label2 == 'person'   else 0,
    lambda position , label1, label2 : 1 if label1   == 'person'  and label2 == 'cc'       else 0,
    lambda position , label1, label2 : 1 if label1   == 'cc'      and label2 == 'date'     else 0,
    lambda position , label1, label2 : 1 if label1   == 'date'    and label2 == 'in'       else 0,
    lambda position , label1, label2 : 1 if label1   == 'in'      and label2 == 'amount'   else 0,
    lambda position , label1, label2 : 1 if label1   == 'amount'  and label2 == 'in'       else 0,
    lambda position , label1, label2 : 1 if label1   == 'in'      and label2 == ' '        else 0
]

In [38]:
# Calculate transition scores using feature functions and weights
def calculate_transition_scores(position, prev_label, current_label, feature_functions, weights):
    features = [f(position, prev_label, current_label) for f in feature_functions]
    return np.dot(weights, features)

In [39]:
# Initialize weights with random values
weights = np.random.rand(len(feature_func))

In [40]:
# Hyperparameters
epochs = 5
learning_rate = 0.05

for epoch in range(epochs):
    for token_pos in range(len(text_train)):
        if text_train[token_pos] in text_train_tags:
            for current_label in LABEL_TYPES:
                transition_scores = [
                    calculate_transition_scores(token_pos, text_train_tags.get(text_train[max(0, token_pos - 1)], ''), current_label, feature_func, weights)
                    for _ in LABEL_TYPES
                ]
                softmax_scores = softmax(transition_scores)
                for i, label in enumerate(LABEL_TYPES):
                    features = [f(token_pos, text_train_tags.get(text_train[max(0, token_pos - 1)], ''), current_label) for f in feature_func]
                    weights += learning_rate * (int(current_label == label) - softmax_scores[i]) * np.array(features)

In [41]:
# Normalize weights
normalized_weights = preprocessing.normalize([weights])

In [42]:
# Inference on test data
tags = list(LABEL_TYPES)
predictions = {}
prev = 'company'

for token_pos in range(1, len(text_test)):
    probabilities = []
    for tag in tags:
        transition_scores = [
            calculate_transition_scores(token_pos, prev, tag, feature_func, normalized_weights)
            for i in tags
        ]
        probabilities.append(softmax(transition_scores)[tags.index(tag)])
    
#     prev = tags[np.argmax(probabilities)]
#     predictions[text_test[token_pos]] = [round(float(prob), 3) for prob in probabilities]  # Convert to float before rounding
    
    max_prob_index = np.argmax(probabilities)
    max_prob_tag = tags[max_prob_index]
    max_prob_value = probabilities[max_prob_index]
    
    prev = max_prob_tag
    formatted_max_prob = f"{max_prob_tag} => {float(max_prob_value):.3f}"  # Convert to float before formatting
    predictions[text_test[token_pos]] = [round(float(prob), 3) for prob in probabilities] + [formatted_max_prob]

In [51]:
import numpy as np
from sklearn import preprocessing
import string
from tabulate import tabulate

LABEL_TYPES = {'company', 'vbd', 'person', 'cc', 'date', 'in', 'amount', 'duration'}

def clean_text(text):
    cleaned_text = ''.join([word for word in text if word not in string.punctuation])
    tokens = cleaned_text.split(" ")
    return tokens

text_train = "Google hired XYZ in 2005, for $100,000 per annum."
text_test = "Apple hired ABC in 1995, for $10 per hour."

text_train = clean_text(text_train)
text_test = clean_text(text_test)

text_train_tags = {
    'Google': 'company',
    'hired': 'vbd',
    'XYZ': 'person',
    'in': 'cc',
    '2005': 'date',
    'for': 'in',
    '100000': 'amount',
    'per': 'in',
    'annum': 'duration'
}

def softmax(x):
    e_x = np.exp(x - np.max(x))
    return np.round(e_x / e_x.sum(axis=0), 3)

def generate_feature_functions(label_types):
    feature_functions = [
        lambda position, label1, label2: 1 if position == 0 else 0,
    ]

    for label1 in label_types:
        for label2 in label_types:
            feature_functions.append(
                lambda position, l1=label1, l2=label2: int(text_train_tags.get(text_train[max(0, position - 1)], '') == l1 and text_train_tags.get(text_train[position], '') == l2)
            )

    return feature_functions

feature_func = generate_feature_functions(LABEL_TYPES)

weights = np.random.rand(len(feature_func))
np.set_printoptions(suppress=True)
epochs = 5
learning_rate = 0.05

for epoch in range(epochs):
    for token_pos in range(len(text_train)):
        if text_train[token_pos] in text_train_tags:
            features = [f(token_pos, text_train_tags.get(text_train[max(0, token_pos - 1)], ''), text_train_tags.get(text_train[token_pos], '')) for f in feature_func]
            weights += learning_rate * np.multiply(weights, features)

    print(f"Epoch {epoch} Updated weights: ", softmax(weights))

normalized_weights = preprocessing.normalize([weights])

tag_labels = list(LABEL_TYPES)
predictions = {}
previous_tag = 'company'

for token_pos in range(1, len(text_test)):
    probabilities = []
    for tag in tag_labels:
        features = [f(token_pos, previous_tag, tag) for f in feature_func]
        weighted_features = np.multiply(normalized_weights, features)
        probability = sum(weighted_features[0])
        probabilities.append(probability)

    probabilities = softmax(probabilities)
    previous_tag = tag_labels[np.argmax(probabilities)]
    predictions[text_test[token_pos]] = [round(prob, 3) for prob in probabilities]

table_data = []
for token, probs in predictions.items():
    table_data.append([token] + probs)

headers = ["Token"] + tag_labels
table = tabulate(table_data, headers=headers, tablefmt="plain")
print(table)


Epoch 0 Updated weights:  [0.009 0.014 0.019 0.025 0.009 0.011 0.026 0.016 0.026 0.019 0.024 0.014
 0.027 0.026 0.02  0.009 0.009 0.019 0.022 0.007 0.011 0.007 0.01  0.009
 0.029 0.014 0.013 0.021 0.009 0.013 0.012 0.011 0.017 0.013 0.013 0.011
 0.007 0.008 0.022 0.008 0.027 0.012 0.011 0.014 0.027 0.018 0.017 0.02
 0.02  0.02  0.007 0.019 0.015 0.013 0.017 0.023 0.008 0.011 0.015 0.017
 0.008 0.019 0.008 0.008 0.017]
Epoch 1 Updated weights:  [0.005 0.012 0.02  0.031 0.006 0.009 0.033 0.015 0.032 0.02  0.029 0.012
 0.035 0.032 0.022 0.006 0.006 0.021 0.025 0.004 0.008 0.005 0.007 0.006
 0.038 0.013 0.012 0.023 0.006 0.011 0.01  0.009 0.017 0.011 0.011 0.008
 0.004 0.005 0.025 0.005 0.034 0.009 0.009 0.013 0.034 0.019 0.018 0.021
 0.022 0.021 0.004 0.02  0.014 0.011 0.017 0.027 0.005 0.009 0.013 0.017
 0.005 0.02  0.006 0.006 0.018]
Epoch 2 Updated weights:  [0.002 0.009 0.02  0.04  0.003 0.005 0.044 0.012 0.042 0.021 0.035 0.009
 0.047 0.041 0.023 0.003 0.003 0.021 0.028 0.002 0.005 0

In [45]:
import numpy as np
from sklearn import preprocessing
import string
from tabulate import tabulate

LABEL_TYPES = {'company', 'vbd', 'person', 'cc', 'date', 'in', 'amount', 'duration'}

text_train = "Google hired XYZ in 2005, for $100,000 per annum."
text_train = ''.join([word for word in text_train if word not in string.punctuation]).split(" ")

text_train_tags = {
    'Google': 'company',
    'hired': 'vbd',
    'XYZ': 'person',
    'in': 'cc',
    '2005': 'date',
    'for': 'in',
    '100000': 'amount',
    'per': 'in',
    'annum': 'duration'
}

text_test = 'Apple hired ABC in 1995, for $10 per hour.'
text_test = ''.join([word for word in text_test if word not in string.punctuation]).split(" ")

print(f"Train data : {text_train}\nTest data : {text_test}")

def softmax(x):
    e_x = np.exp(x - np.max(x))
    return np.round(e_x / e_x.sum(axis=0), 3)

def generate_feature_functions(label_types):
    feature_functions = [
        lambda position, label1, label2: 1 if position == 0 else 0,
    ]

    for label1 in label_types:
        for label2 in label_types:
            feature_functions.append(
                lambda position, l1=label1, l2=label2: 1 if text_train_tags.get(text_train[max(0, position - 1)], '') == l1 and text_train_tags.get(text_train[position], '') == l2 else 0
            )

    return feature_functions

feature_func = generate_feature_functions(LABEL_TYPES)

weights = np.random.rand(len(feature_func))
np.set_printoptions(suppress=True)
epochs = 5
learning_rate = 0.05

for epoch in range(epochs):
    for token_pos in range(len(text_train)):
        if text_train[token_pos] in text_train_tags:
            features = [f(token_pos, text_train_tags.get(text_train[max(0, token_pos - 1)], ''), text_train_tags.get(text_train[token_pos], '')) for f in feature_func]
            weights += learning_rate * np.multiply(weights, features)

    print(f"Epoch {epoch} Updated weights: ", softmax(weights))

weights = preprocessing.normalize([weights])

tags = list(LABEL_TYPES)
pred_dict = {}
prev = 'company'

for token_pos in range(1, len(text_test)):
    probabilities = []
    for tag in tags:
        features = [f(token_pos, prev, tag) for f in feature_func]
        weighted_features = np.multiply(weights, features)
        probability = sum(weighted_features[0])
        probabilities.append(probability)

    probabilities = softmax(probabilities)
    prev = tags[np.argmax(probabilities)]
    pred_dict[text_test[token_pos]] = [round(prob, 3) for prob in probabilities]

table_data = []
for token, probs in pred_dict.items():
    table_data.append([token] + probs)

headers = ["Token"] + tags
table = tabulate(table_data, headers=headers, tablefmt="plain")
print(table)


Train data : ['Google', 'hired', 'XYZ', 'in', '2005', 'for', '100000', 'per', 'annum']
Test data : ['Apple', 'hired', 'ABC', 'in', '1995', 'for', '10', 'per', 'hour']
Epoch 0 Updated weights:  [0.015 0.012 0.026 0.017 0.018 0.007 0.008 0.015 0.02  0.011 0.012 0.018
 0.007 0.022 0.017 0.016 0.02  0.008 0.027 0.009 0.018 0.016 0.021 0.018
 0.016 0.015 0.008 0.027 0.018 0.015 0.011 0.008 0.01  0.014 0.016 0.027
 0.011 0.018 0.02  0.008 0.009 0.014 0.008 0.022 0.015 0.022 0.012 0.018
 0.014 0.018 0.016 0.016 0.012 0.016 0.026 0.011 0.015 0.007 0.008 0.013
 0.009 0.022 0.025 0.01  0.022]
Epoch 1 Updated weights:  [0.009 0.01  0.032 0.017 0.018 0.004 0.005 0.014 0.021 0.009 0.01  0.018
 0.004 0.025 0.017 0.015 0.023 0.006 0.036 0.007 0.018 0.016 0.024 0.019
 0.016 0.015 0.005 0.035 0.019 0.014 0.009 0.006 0.007 0.013 0.016 0.035
 0.009 0.019 0.022 0.005 0.006 0.012 0.006 0.025 0.014 0.026 0.011 0.019
 0.013 0.019 0.015 0.015 0.01  0.015 0.032 0.008 0.014 0.005 0.006 0.011
 0.007 0.026 0.031 

## Final Code

In [52]:
import numpy as np
from sklearn import preprocessing
import string
from tabulate import tabulate

LABEL_TYPES = {'company', 'vbd', 'person', 'cc', 'date', 'in', 'amount', 'duration'}

text_train = "Google hired XYZ in 2005, for $100,000 per annum."
text_train = ''.join([word for word in text_train if word not in string.punctuation]).split(" ")

text_train_tags = {
    'Google': 'company',
    'hired': 'vbd',
    'XYZ': 'person',
    'in': 'cc',
    '2005': 'date',
    'for': 'in',
    '100000': 'amount',
    'per': 'in',
    'annum': 'duration'
}

text_test = 'Apple hired ABC in 1995, for $10 per hour.'
text_test = ''.join([word for word in text_test if word not in string.punctuation]).split(" ")

print(f"Train data : {text_train}\nTest data : {text_test}")

def softmax(x):
    e_x = np.exp(x - np.max(x))
    return np.round(e_x / e_x.sum(axis=0), 3)

def generate_feature_functions(label_types):
    feature_functions = [
        lambda position, label1, label2: 1 if position == 0 else 0,
    ]

    for label1 in label_types:
        for label2 in label_types:
            feature_functions.append(
                lambda position, l1=label1, l2=label2: 1 if text_train_tags.get(text_train[max(0, position - 1)], '') == l1 and text_train_tags.get(text_train[position], '') == l2 else 0
            )

    return feature_functions

feature_func = generate_feature_functions(LABEL_TYPES)

weights = np.random.rand(len(feature_func))
np.set_printoptions(suppress=True)
epochs = 5
learning_rate = 0.05

for epoch in range(epochs):
    for token_pos in range(len(text_train)):
        if text_train[token_pos] in text_train_tags:
            features = [f(token_pos, text_train_tags.get(text_train[max(0, token_pos - 1)], ''), text_train_tags.get(text_train[token_pos], '')) for f in feature_func]
            weights += learning_rate * np.multiply(weights, features)

    print(f"Epoch {epoch} Updated weights: ", softmax(weights))

weights = preprocessing.normalize([weights])

tags = list(LABEL_TYPES)
pred_dict = {}
prev = 'company'

for token_pos in range(1, len(text_test)):
    probabilities = []
    for tag in tags:
        features = [f(token_pos, prev, tag) for f in feature_func]
        weighted_features = np.multiply(weights, features)
        probability = sum(weighted_features[0])
        probabilities.append(probability)

    probabilities = softmax(probabilities)
    prev = tags[np.argmax(probabilities)]
    max_prob_label = tags[np.argmax(probabilities)]
    max_prob_value = max(probabilities)
    max_prob_tag = f"{max_prob_label} => {max_prob_value}"
    pred_dict[text_test[token_pos]] = [round(prob, 3) for prob in probabilities] + [max_prob_tag]
    print(f"Token: {text_test[token_pos]}, Max Probability Label: {max_prob_label}, Max Probability: {max(probabilities)}")

table_data = []
for token, probs in pred_dict.items():
    table_data.append([token] + probs)

headers = ["Token"] + tags + ["Max Prob Tag"]
table = tabulate(table_data, headers=headers, tablefmt="plain")
print(table)


Train data : ['Google', 'hired', 'XYZ', 'in', '2005', 'for', '100000', 'per', 'annum']
Test data : ['Apple', 'hired', 'ABC', 'in', '1995', 'for', '10', 'per', 'hour']
Epoch 0 Updated weights:  [0.013 0.022 0.019 0.009 0.021 0.029 0.017 0.028 0.01  0.013 0.008 0.019
 0.023 0.019 0.012 0.008 0.018 0.015 0.009 0.012 0.014 0.007 0.012 0.018
 0.012 0.013 0.01  0.01  0.016 0.015 0.017 0.009 0.028 0.01  0.013 0.006
 0.015 0.008 0.022 0.026 0.013 0.013 0.022 0.019 0.007 0.015 0.021 0.021
 0.022 0.026 0.019 0.012 0.017 0.021 0.02  0.009 0.016 0.019 0.024 0.008
 0.01  0.009 0.011 0.013 0.007]
Epoch 1 Updated weights:  [0.008 0.025 0.02  0.007 0.023 0.038 0.016 0.038 0.008 0.011 0.005 0.02
 0.027 0.019 0.01  0.005 0.018 0.014 0.007 0.009 0.012 0.004 0.01  0.019
 0.01  0.012 0.007 0.008 0.016 0.013 0.016 0.007 0.038 0.007 0.012 0.004
 0.014 0.005 0.025 0.034 0.011 0.011 0.026 0.021 0.004 0.014 0.024 0.023
 0.025 0.032 0.021 0.01  0.016 0.024 0.022 0.006 0.016 0.02  0.029 0.005
 0.008 0.007 0.009 0