In [1]:
from collections import defaultdict
import math

In [2]:
# Step 1: 
def read_data(file_path):
    sentences = []
    with open(file_path, 'r', encoding='utf-8') as file:
        sentence = []
        for line in file:
            line = line.strip()
            if line and not line.startswith("#"):
                columns = line.split('\t')
                if len(columns) >= 2:
                    word, pos = columns[1], columns[3]
                    sentence.append((word, pos))
            elif sentence:
                sentences.append(sentence)
                sentence = []
    return sentences

In [3]:
train_data = read_data('C:/Users/rim-1/Downloads/train.conllu')
train_data 

[[('Les', 'DET'),
  ('commotions', 'NFP'),
  ('cérébrales', 'ADJFP'),
  ('sont', 'AUX'),
  ('devenu', 'VPPMS'),
  ('si', 'ADV'),
  ('courantes', 'ADJFP'),
  ('dans', 'PREP'),
  ('ce', 'PDEMMS'),
  ('sport', 'NMS'),
  ("qu'", 'COSUB'),
  ('on', 'PINDMS'),
  ('les', 'PPOBJMS'),
  ('considére', 'VERB'),
  ('presque', 'ADV'),
  ('comme', 'PREP'),
  ('la', 'DETFS'),
  ('routine', 'NFS'),
  ('.', 'YPFOR')],
 [("L'", 'DET'),
  ('œuvre', 'NFS'),
  ('est', 'AUX'),
  ('située', 'VPPFS'),
  ('dans', 'PREP'),
  ('la', 'DETFS'),
  ('galerie', 'NFS'),
  ('des', '_'),
  ('de', 'PREP'),
  ('les', 'DET'),
  ('batailles', 'NFP'),
  (',', 'PUNCT'),
  ('dans', 'PREP'),
  ('le', 'DETMS'),
  ('château', 'NMS'),
  ('de', 'PREP'),
  ('Versailles', 'PROPN'),
  ('.', 'YPFOR')],
 [('Le', 'DETMS'),
  ('comportement', 'NMS'),
  ('de', 'PREP'),
  ('la', 'DETFS'),
  ('Turquie', 'PROPN'),
  ('vis-à-vis', 'ADV'),
  ('du', '_'),
  ('de', 'PREP'),
  ('le', 'DETMS'),
  ('problème', 'NMS'),
  ('palestinien', 'ADJMS'),
  (

In [5]:
dev_data = read_data('C:/Users/rim-1/Downloads/dev.conllu')
test_data = read_data('C:/Users/rim-1/Downloads/test.conllu')

In [6]:
#step 2 : 
def train_hmm(train_data, k=0.1):
    transition_probs = defaultdict(lambda: defaultdict(lambda: k))
    emission_probs = defaultdict(lambda: defaultdict(lambda: k))
    tag_counts = defaultdict(int)

    for sentence in train_data:
        previous_tag = None
        for word, tag in sentence:
            if previous_tag is not None:
                transition_probs[previous_tag][tag] += 1
            emission_probs[tag][word] += 1
            tag_counts[tag] += 1
            previous_tag = tag

    total_tags = sum(tag_counts.values())

    for tag, transitions in transition_probs.items():
        total_transitions = sum(transitions.values())
        for next_tag in transitions:
            transition_probs[tag][next_tag] = (transition_probs[tag][next_tag] + k) / (total_transitions + k * total_tags)

    for tag, emissions in emission_probs.items():
        total_emissions = sum(emissions.values())
        for word in emissions:
            emission_probs[tag][word] = (emission_probs[tag][word] + k) / (total_emissions + k * total_tags)

    for tag in tag_counts:
        tag_counts[tag] /= total_tags

    return transition_probs, emission_probs, tag_counts


In [9]:
transition_probs, emission_probs, tag_counts = train_hmm(train_data, k=0.1)

# Print tag counts
print("Tag counts:")
for tag, count in tag_counts.items():
    print(f"{tag}: {count}")


Tag counts:
DET: 0.06129013665469097
NFP: 0.01795805669838534
ADJFP: 0.007330883301449984
AUX: 0.03130789435403967
VPPMS: 0.01526009403072329
ADV: 0.036113726125226085
PREP: 0.1550820778978397
PDEMMS: 0.004696046922044523
NMS: 0.0720902211890248
COSUB: 0.007199141482479711
PINDMS: 0.0023411619079508933
PPOBJMS: 0.0034280319144556455
VERB: 0.051338140079978264
DETFS: 0.026707909174994306
NFS: 0.06202020590148456
YPFOR: 0.03666813961339265
VPPFS: 0.005996997384375969
_: 0.02658714584093822
PUNCT: 0.07041600223961092
DETMS: 0.037480547497042674
PROPN: 0.054145887596782204
ADJMS: 0.020085138150509538
PPER3FS: 0.0024152666811216718
ADJFS: 0.01944289678302946
COCO: 0.024627486283755412
NMP: 0.026334640687911866
PREL: 0.007256778528279204
PPER1S: 0.0013393751595311089
ADJMP: 0.007959401562787327
VPPMP: 0.0033731394898846986
DINTMS: 0.010278606500909842
PPER3MS: 0.009603429678687193
PPER3MP: 0.0012735042500459724
PREF: 0.005634707382207719
ADJ: 0.0026128794095770813
DINTFS: 0.00871691702186639

In [10]:
# transition probability from 'DET' to 'NFP'
print("Transition probability from 'DET' to 'NFP':", transition_probs['DET']['NFP'])

# transition probability from 'NFP' to 'ADJFP'
print("Transition probability from 'NFP' to 'ADJFP':", transition_probs['NFP']['ADJFP'])


Transition probability from 'DET' to 'NFP': 0.07082232991206344
Transition probability from 'NFP' to 'ADJFP': 0.03379323344138541


In [12]:
#step 3:
def viterbi(obs, states, transition_probs, emission_probs, tag_counts):
    V = [{}]
    for st in states:
        V[0][st] = {"prob": tag_counts[st] * emission_probs[st].get(obs[0], 1e-10), "prev": None}
    for t in range(1, len(obs)):
        V.append({})
        for st in states:
            max_tr_prob = max(V[t-1][prev_st]["prob"] * transition_probs[prev_st].get(st, 1e-10) for prev_st in states)
            for prev_st in states:
                if V[t-1][prev_st]["prob"] * transition_probs[prev_st].get(st, 1e-10) == max_tr_prob:
                    max_prob = max_tr_prob * emission_probs[st].get(obs[t], 1e-10)
                    V[t][st] = {"prob": max_prob, "prev": prev_st}
                    break
    opt = []
    max_prob = max(value["prob"] for value in V[-1].values())
    previous = None
    for st, data in V[-1].items():
        if data["prob"] == max_prob:
            opt.append(st)
            previous = st
            break
    for t in range(len(V) - 2, -1, -1):
        opt.insert(0, V[t + 1][previous]["prev"])
        previous = V[t + 1][previous]["prev"]
    return opt


In [26]:
# Defining states and trying the viterbi algo on a sample :

states = transition_probs.keys()

sample_sentence = ["Le", "chat", "noir", "mange", "."]
tag_sequence = viterbi(sample_sentence, states, transition_probs, emission_probs, tag_counts)

print("Sample sentence:", sample_sentence)
print("Predicted POS tags:", tag_sequence)


Sample sentence: ['Le', 'chat', 'noir', 'mange', '.']
Predicted POS tags: ['DETMS', 'NMS', 'ADJMS', 'VERB', 'YPFOR']


In [27]:
#step 4 :
def predict_pos_tags(dev_data, transition_probs, emission_probs, tag_counts):
    predicted_tags = []
    for sentence in dev_data:
        words = [word for word, _ in sentence]
        predicted_tags.extend(viterbi(words, states, transition_probs, emission_probs, tag_counts))
    return predicted_tags

#trying it in the dev set:
# Training the HMM model
transition_probs, emission_probs, tag_counts = train_hmm(train_data, k=0.1)
states = transition_probs.keys()
predicted_tags_dev = predict_pos_tags(dev_data, transition_probs, emission_probs, tag_counts)
true_tags_dev = [tag for sentence in dev_data for _, tag in sentence]


In [28]:
#evaluation
from sklearn.metrics import accuracy_score, classification_report

def evaluate_performance(true_tags, predicted_tags):
    accuracy = accuracy_score(true_tags, predicted_tags)
    report = classification_report(true_tags, predicted_tags)
    return accuracy, report

In [29]:
# Évaluer les performances
accuracy_dev, report_dev = evaluate_performance(true_tags_dev, predicted_tags_dev)
print("Accuracy on dev set:", accuracy_dev)
print("Classification report on dev set:\n", report_dev)

  _warn_prf(average, modifier, msg_start, len(result))


Accuracy on dev set: 0.906512576478586
Classification report on dev set:
               precision    recall  f1-score   support

         ADJ       0.96      0.54      0.70       101
       ADJFP       0.88      0.72      0.79       294
       ADJFS       0.86      0.81      0.84       738
       ADJMP       0.82      0.67      0.74       292
       ADJMS       0.85      0.80      0.82       776
         ADV       0.89      0.96      0.92      1307
         AUX       0.94      0.94      0.94      1124
        CHIF       0.97      0.86      0.91       953
        COCO       0.99      0.99      0.99       884
       COSUB       0.86      0.74      0.79       256
         DET       0.93      0.98      0.95      2230
       DETFS       0.99      1.00      0.99      1007
       DETMS       0.95      1.00      0.97      1426
      DINTFS       0.98      1.00      0.99       306
      DINTMS       0.99      0.99      0.99       387
        INTJ       1.00      0.40      0.57         5
      M

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
# trying on the test set
predicted_tags_test = predict_pos_tags(test_data, transition_probs, emission_probs, tag_counts)
true_tags_test = [tag for sentence in test_data for _, tag in sentence]


In [31]:
# Evaluating performance on test set
accuracy_test, report_test = evaluate_performance(true_tags_test, predicted_tags_test)
print("Accuracy on test set:", accuracy_test)
print("Classification report on test set:\n", report_test)

Accuracy on test set: 0.9105651582831618
Classification report on test set:
               precision    recall  f1-score   support

         ADJ       1.00      0.64      0.78        22
       ADJFP       0.85      0.62      0.72        76
       ADJFS       0.88      0.79      0.83       182
       ADJMP       0.79      0.68      0.73        82
       ADJMS       0.87      0.79      0.83       244
         ADV       0.90      0.97      0.93       504
         AUX       0.90      0.91      0.91       355
        CHIF       0.95      0.86      0.90       222
        COCO       1.00      1.00      1.00       245
       COSUB       0.89      0.74      0.81       128
         DET       0.94      0.98      0.96       645
       DETFS       1.00      1.00      1.00       240
       DETMS       0.96      1.00      0.98       362
      DINTFS       0.95      1.00      0.98        61
      DINTMS       0.99      1.00      1.00       122
        INTJ       0.67      0.33      0.44         6
    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
#trying to actually see if it works by using a phrase from the test set knowing its real pos tags and seeing whether my model predicts them right 
#for that i changed my predict funciton to return a list of predicted tags for each word:
def predict_pos_tagss(sentence, transition_probs, emission_probs, tag_counts):
    predicted_tags = []
    for word, _ in sentence:
        predicted_tag = viterbi([word], states, transition_probs, emission_probs, tag_counts)[0]
        predicted_tags.append(predicted_tag)
    return predicted_tags

In [34]:
# Parse the provided sentence into a list of (word, POS) tuples
test_sentence = [("Je", "PPER1S"), ("sens", "VERB"), ("qu'", "COSUB"), ("entre", "PREP"), ("ça", "PDEMMS"),
                 ("et", "COCO"), ("les", "DET"), ("films", "NMP"), ("de", "PREP"), ("médecins", "NMP"),
                 ("et", "COCO"), ("scientifiques", "NMP"), ("fous", "ADJMP"), ("que", "PREL"), ("nous", "PREFP"),
                 ("avons", "AUX"), ("déjà", "ADV"), ("vus", "VPPMP"), (",", "PUNCT"), ("nous", "PREFP"),
                 ("pourrions", "VERB"), ("emprunter", "VERB"), ("un", "DINTMS"), ("autre", "ADJMS"),
                 ("chemin", "NMS"), ("pour", "PREP"), ("l'", "DET"), ("origine", "NFS"), (".", "YPFOR")]

# Predict POS tags for the test sentence
predicted_tags_test = predict_pos_tags([test_sentence], transition_probs, emission_probs, tag_counts)[0]

# Display the predicted tags
print("Predicted tags for the test sentence:", predicted_tags_test)

# Evaluate the performance of your model (if gold-standard annotations are available in the test sentence)
accuracy_test, report_test = evaluate_performance([tag for _, tag in test_sentence], predicted_tags_test)
print("Accuracy on test sentence:", accuracy_test)
print("Classification report on test sentence:\n", report_test)

predicted_tags_test = predict_pos_tagss(test_sentence, transition_probs, emission_probs, tag_counts)
print("Predicted tags for the test sentence:", predicted_tags_test)


Predicted tags for the test sentence: ['PPER1S', 'NMS', 'COSUB', 'PREP', 'PDEMMS', 'COCO', 'DET', 'NMP', 'PREP', 'NMP', 'COCO', 'NMP', 'NMP', 'COSUB', 'PREFP', 'AUX', 'ADV', 'VPPMP', 'PUNCT', 'PREFP', 'PREP', 'PREP', 'DINTMS', 'ADJMS', 'NMS', 'PREP', 'DET', 'NFS', 'YPFOR']


In [35]:
# Evaluating
accuracy_test, report_test = evaluate_performance([tag for _, tag in test_sentence], predicted_tags_test)
print("Accuracy on test sentence:", accuracy_test)
print("Classification report on test sentence:\n", report_test)

Accuracy on test sentence: 0.8275862068965517
Classification report on test sentence:
               precision    recall  f1-score   support

       ADJMP       0.00      0.00      0.00         1
       ADJMS       1.00      1.00      1.00         1
         ADV       1.00      1.00      1.00         1
         AUX       1.00      1.00      1.00         1
        COCO       1.00      1.00      1.00         2
       COSUB       0.50      1.00      0.67         1
         DET       1.00      1.00      1.00         2
      DINTMS       1.00      1.00      1.00         1
         NFS       1.00      1.00      1.00         1
         NMP       0.75      1.00      0.86         3
         NMS       0.50      1.00      0.67         1
      PDEMMS       1.00      1.00      1.00         1
      PPER1S       1.00      1.00      1.00         1
       PREFP       1.00      1.00      1.00         2
        PREL       0.00      0.00      0.00         1
        PREP       0.60      1.00      0.75     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
