<a href="https://colab.research.google.com/github/rachanabn20/CoLI-Dravidian_2025/blob/main/CRF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install sklearn_crfsuite

Collecting sklearn_crfsuite
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn_crfsuite)
  Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m63.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn_crfsuite
Successfully installed python-crfsuite-0.9.11 sklearn_crfsuite-0.5.0


In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import re
import sklearn_crfsuite
import string
from sklearn_crfsuite import metrics

In [17]:
def preprocess_text(text):
    text = str(text)
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

def build_vocab(sentences):
    word_counts = Counter()
    for sentence in sentences:
        for word in sentence.split():
            word_counts[word] += 1
    vocab = {word: idx+2 for idx, word in enumerate(word_counts)}
    vocab['<PAD>'] = 0
    vocab['<UNK>'] = 1
    return vocab

In [18]:
def create_sentences_from_df(df, has_tags=True):
    sentences, current = [], []
    for index, row in df.iterrows():
        word = str(row['Word'])
        tag = str(row['Tag']) if has_tags else None
        if 'ID' in df.columns and row['ID'] == 1.0 and index != 0:
            if current: sentences.append(current)
            current = []
        current.append((word, tag))
    if current:
        sentences.append(current)
    return sentences


In [19]:
def word2features(sent, i):
    word = sent[i][0]
    features = {
        'bias': 1.0,
        'word': word,
        'word.lower()': word.lower(),
        'word[:3]': word[:3],
        'word[-3:]': word[-3:],
        'word.isdigit()': word.isdigit(),
        'len(word)': len(word),
        'ispunct': word in string.punctuation
    }

    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word': word1,
            '-1:word[:3]': word1[:3],
            '-1:word[-3:]': word1[-3:],
            '-1:word.isdigit()': word1.isdigit()
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word': word1,
            '+1:word[:3]': word1[:3],
            '+1:word[-3:]': word1[-3:],
            '+1:word.isdigit()': word1.isdigit()
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent): return [word2features(sent, i) for i in range(len(sent))]
def sent2labels(sent): return [label for token, label in sent]


In [20]:
# Example file triplets (train, val, test)
file_triples = [
    ('kan_train.csv', 'kan_val.csv', 'kan_test.csv'),
    ('mal_train(1).csv', 'mal_val(1).csv', 'mal_test.csv'),
    ('tl_train.csv', 'tl_val.csv', 'tl_test.csv'),
    ('tm_train.csv', 'tm_val.csv', 'tm_test.csv'),
    ('tulu_train.csv', 'tulu_val.csv', 'tulu_test.csv')
]

dataframes = []
for train_file, val_file, test_file in file_triples:
    train_df = pd.read_csv(f'/content/drive/MyDrive/Train_and_development_data/Train and development data/{train_file}')
    val_df = pd.read_csv(f'/content/drive/MyDrive/Train_and_development_data/Train and development data/{val_file}')
    test_df = pd.read_csv(f'/content/drive/MyDrive/Test_Data_without_labels/Test Data without labels/{test_file}')
    dataframes.append((train_df, val_df, test_df, train_file, val_file, test_file))


In [22]:
for train_df, val_df, test_df, train_file, val_file, test_file in dataframes:
    print(f"\n Processing {train_file}, {val_file}, {test_file}")

    # Prepare structured sequences
    train_sents = create_sentences_from_df(train_df, has_tags=True)
    val_sents = create_sentences_from_df(val_df, has_tags=True)
    test_sents = create_sentences_from_df(test_df, has_tags=False)

    # Features & Labels
    X_train = [sent2features(s) for s in train_sents]
    y_train = [sent2labels(s) for s in train_sents]

    X_val = [sent2features(s) for s in val_sents]
    y_val = [sent2labels(s) for s in val_sents]

    X_test = [sent2features(s) for s in test_sents]

    # CRF Model
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1, c2=0.1,
        max_iterations=100,
        all_possible_transitions=True
    )

    print("Training...")
    crf.fit(X_train, y_train)

    # Evaluate on validation
    val_pred = crf.predict(X_val)
    print(f"\nValidation Report for {val_file}")
    print(metrics.flat_classification_report(y_val, val_pred, digits=3))

    # Predict on test
    test_preds = crf.predict(X_test)
    test_labels_flat = [tag for seq in test_preds for tag in seq]

    # Save predictions
    if len(test_labels_flat) == len(test_df):
        language = train_file.split('_')[0]
        test_df['Tag'] = test_labels_flat
        output_path = f'predictions_{language}.csv'
        test_df[['Word', 'Tag']].to_csv(output_path, index=False)
        print(f"Test predictions saved to: {output_path}")
    else:
        print(f"Prediction mismatch in length for {test_file}")


 Processing kan_train.csv, kan_val.csv, kan_test.csv
Training...

Validation Report for kan_val.csv
              precision    recall  f1-score   support

          en      0.935     0.971     0.953       922
          kn      0.889     0.925     0.907       546
    location      1.000     1.000     1.000         2
       mixed      0.976     0.705     0.818       176
        name      0.923     0.480     0.632        50
       other      0.645     0.851     0.734        47
         sym      0.996     1.000     0.998       273

    accuracy                          0.924      2016
   macro avg      0.909     0.847     0.863      2016
weighted avg      0.928     0.924     0.922      2016

Test predictions saved to: predictions_kan_crf_val.csv

 Processing mal_train(1).csv, mal_val(1).csv, mal_test.csv
Training...

Validation Report for mal_val(1).csv
              precision    recall  f1-score   support

     ENGLISH      0.908     0.941     0.924       407
   MALAYALAM      0.907     

In [23]:
from sklearn.metrics import classification_report
import pandas as pd

labeled_test_files = {
    'kan': "/content/drive/MyDrive/CoLI-Dravidian/test_ref_knn.csv",
    'mal': "/content/drive/MyDrive/CoLI-Dravidian/test_ref_mal.csv",
    'tl': "/content/drive/MyDrive/CoLI-Dravidian/test_ref_tl.csv",
    'tm': "/content/drive/MyDrive/CoLI-Dravidian/test_ref_tm.csv",
    'tulu': "/content/drive/MyDrive/CoLI-Dravidian/test_ref_tulu.csv"
}

for language, labeled_test_full_path in labeled_test_files.items():
    prediction_file = f'predictions_{language}_crf_val.csv'

    try:
        # Load predictions
        predictions_df = pd.read_csv(prediction_file)

        # Load labeled test data
        labeled_test_df = pd.read_csv(labeled_test_full_path)

        # Ensure the lengths match before comparison
        if len(predictions_df) == len(labeled_test_df):
            print(f"Classification Report for {language}:")
            print(classification_report(labeled_test_df['Tag'], predictions_df['Tag']))
        else:
            print(f"Length mismatch for {language}: Predictions have {len(predictions_df)} rows, Labeled test has {len(labeled_test_df)} rows.")

    except FileNotFoundError:
        print(f"Could not find prediction file {prediction_file} or labeled test file for {language}. Please check file paths and names.")
    except Exception as e:
        print(f"An error occurred while processing files for {language}: {e}")

Classification Report for kan:
              precision    recall  f1-score   support

          en       0.97      0.98      0.97      1204
          kn       0.88      0.94      0.91       289
    location       0.80      0.80      0.80         5
       mixed       0.97      0.77      0.86       141
        name       0.91      0.76      0.83        55
       other       0.85      0.85      0.85       117
         sym       1.00      1.00      1.00       264

    accuracy                           0.95      2075
   macro avg       0.91      0.87      0.89      2075
weighted avg       0.95      0.95      0.95      2075

Classification Report for mal:
              precision    recall  f1-score   support

     ENGLISH       0.90      0.85      0.88       380
   MALAYALAM       0.87      0.96      0.91       938
       MIXED       0.69      0.28      0.40        88
        NAME       0.74      0.70      0.72       158
      NUMBER       0.98      0.86      0.91        50
       OTHER    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report for tm:
              precision    recall  f1-score   support

    Location       0.00      0.00      0.00         9
       Other       0.14      0.12      0.13        16
          en       0.89      0.91      0.90       534
        name       0.73      0.68      0.70       139
         sym       1.00      0.91      0.95       230
          tm       0.89      0.93      0.91       986
        tmen       0.79      0.72      0.75       152

    accuracy                           0.88      2066
   macro avg       0.63      0.61      0.62      2066
weighted avg       0.87      0.88      0.88      2066



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report for tulu:
              precision    recall  f1-score   support

     English       0.94      0.90      0.92       813
     Kannada       0.81      0.69      0.74       372
    Location       0.97      0.62      0.76        56
       Mixed       0.86      0.37      0.52        65
        Name       0.85      0.75      0.80       133
       Other       0.61      0.61      0.61        59
        Tulu       0.84      0.94      0.89      1330
         sym       1.00      1.00      1.00       455

    accuracy                           0.88      3283
   macro avg       0.86      0.74      0.78      3283
weighted avg       0.88      0.88      0.88      3283

