UPDATED

In [None]:

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

!pip install sklearn_crfsuite

import pandas as pd
import os
import numpy as np
import string
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics
from sklearn.model_selection import train_test_split

# Define function to read and format data from CSV files
def read_and_format_csv(file_path):
    """Load a token-per-row CSV and group its rows into tagged sentences.

    The CSV must provide ``Word`` and ``Tag`` columns. A row whose word is
    exactly ``'.'`` closes the current sentence and is itself dropped; rows
    whose word or tag is missing or empty are skipped.

    Both columns are read verbatim as text. pandas' default NA parsing is
    switched off because it would otherwise turn ordinary tokens such as
    ``nan``, ``NA``, ``null`` or ``None`` into missing values, which would
    silently delete them from the corpus. Reading as text also keeps
    numeric-looking tokens (e.g. ``007``) exactly as written.

    Args:
        file_path: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        A list of sentences; each sentence is a non-empty list of
        ``(word, tag)`` string tuples in file order.

    Raises:
        KeyError: If the ``Word`` or ``Tag`` column is absent.
    """
    data = pd.read_csv(file_path, dtype=str, keep_default_na=False)

    sents = []
    sent = []

    # Walk the two columns directly: ``iterrows`` builds a Series per row,
    # which is slow and may coerce values to a common dtype.
    for word, tag in zip(data['Word'], data['Tag']):
        # Missing checks come first so the string comparisons below never
        # see an NA scalar.
        if pd.isna(word) or pd.isna(tag) or word == '' or tag == '':
            continue

        if word == '.':
            # Sentence terminator: flush the sentence, never emit empty ones.
            if sent:
                sents.append(sent)
                sent = []
        else:
            sent.append((str(word), str(tag)))

    # The file may end without a closing '.'.
    if sent:
        sents.append(sent)

    return sents


# Feature extraction functions
# Set of ASCII punctuation characters, built once for per-character lookups.
_PUNCTUATION_CHARS = frozenset(string.punctuation)


def _is_punctuation(token):
    """Return True when *token* is non-empty and every character is ASCII punctuation."""
    return bool(token) and all(ch in _PUNCTUATION_CHARS for ch in token)


def _context_features(prefix, token):
    """Build the neighbour-token features, with every key prefixed by ``prefix:``."""
    return {
        f'{prefix}:word': token,
        f'{prefix}:len(word)': len(token),
        f'{prefix}:word[:3]': token[:3],
        f'{prefix}:word[:2]': token[:2],
        f'{prefix}:word[-3:]': token[-3:],
        f'{prefix}:word[-2:]': token[-2:],
        f'{prefix}:word.isdigit()': token.isdigit(),
        f'{prefix}:word.ispunctuation': _is_punctuation(token),
    }


def word2features(sent, i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    Features cover the token itself (surface form, length, prefixes and
    suffixes up to four characters, digit and punctuation flags) plus the
    same kind of features, up to three characters, for the neighbours at
    offsets -1, -2, +1 and +2 when they exist. ``BOS`` / ``EOS`` mark the
    first / last token of the sentence.

    The punctuation flags are True only when the whole token consists of
    punctuation characters. A plain ``token in string.punctuation`` would be
    a substring test and give inconsistent answers (``"()"`` True,
    ``"!!"`` False, ``""`` True).

    Args:
        sent: Sequence of ``(word, tag)`` pairs; only the word is read.
        i: Index of the token to describe.

    Returns:
        A dict mapping feature names to str, int, float or bool values.
    """
    word = sent[i][0]
    features = {
        'bias': 1.0,
        'word': word,
        'len(word)': len(word),
        'word[:4]': word[:4],
        'word[:3]': word[:3],
        'word[:2]': word[:2],
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[-4:]': word[-4:],
        'word.ispunctuation': _is_punctuation(word),
        'word.isdigit()': word.isdigit(),
    }

    # Left context.
    if i > 0:
        features.update(_context_features('-1', sent[i - 1][0]))
    else:
        features['BOS'] = True
    if i > 1:
        features.update(_context_features('-2', sent[i - 2][0]))

    # Right context.
    if i < len(sent) - 1:
        features.update(_context_features('+1', sent[i + 1][0]))
    else:
        features['EOS'] = True
    if i < len(sent) - 2:
        features.update(_context_features('+2', sent[i + 2][0]))

    return features

def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order."""
    return [word2features(sent, position) for position, _ in enumerate(sent)]

def sent2labels(sent):
    """Return the tag (second element) of every token entry in ``sent``."""
    labels = []
    for entry in sent:
        labels.append(entry[1])
    return labels


# Updated datasets with correct file paths
# All corpora sit in one Drive folder; its name really ends with a space.
_DATASET_DIR = "/content/drive/MyDrive/DATASET/all language correct format dataset /"

# language -> {"train": path, "validation": path}
datasets = {
    "Tamil": {
        "train": _DATASET_DIR + "correct_tamil_dataset.csv",
        "validation": _DATASET_DIR + "correct_tamil_validation",
    },
    "Malayalam": {
        "train": _DATASET_DIR + "Final_mal_train(80%)  (1).csv",
        "validation": _DATASET_DIR + "Final_mal_dev(20%) (1).csv",
    },
    "Tulu": {
        "train": _DATASET_DIR + "correct_tulu_train_set",
        "validation": _DATASET_DIR + "correct_tulu_validation_set",
    },
    "Kannada": {
        "train": _DATASET_DIR + "correct_kannada_train",
        "validation": _DATASET_DIR + "correct_kannada_validation",
    },
}

# Train and evaluate one CRF tagger per language, independently of the others.
for language, paths in datasets.items():
    print(f"\nProcessing {language} dataset...")

    # Skip the language outright when either of its files is missing.
    if not (os.path.exists(paths['train']) and os.path.exists(paths['validation'])):
        print(f"File paths for {language} dataset are incorrect or files do not exist.")
        continue

    try:
        # Load both splits as lists of [(word, tag), ...] sentences.
        train_sents = read_and_format_csv(paths['train'])
        test_sents = read_and_format_csv(paths['validation'])

        # An empty split cannot be trained or scored on.
        if not train_sents or not test_sents:
            print(f"Failed to load data for {language}. Check the data format.")
            continue

        # Per-sentence feature dicts and label sequences.
        X_train = [sent2features(s) for s in train_sents]
        y_train = [sent2labels(s) for s in train_sents]
        X_test = [sent2features(s) for s in test_sents]
        y_test = [sent2labels(s) for s in test_sents]

        # L-BFGS training; c1 / c2 are the L1 / L2 regularisation
        # coefficients, and all_possible_transitions also creates transition
        # features for label pairs never seen in the training data.
        crf = CRF(
            algorithm='lbfgs',
            c1=0.1,
            c2=0.1,
            max_iterations=100,
            all_possible_transitions=True
        )
        crf.fit(X_train, y_train)

        # Predict and evaluate on the validation split.
        predictions = crf.predict(X_test)

        # zero_division=0 keeps the score of a label that is never predicted
        # at 0.0 (the default outcome) without emitting a warning for it.
        f1_weighted = metrics.flat_f1_score(
            y_test, predictions, average="weighted", zero_division=0
        )
        print(f'F1 score on the test set for {language} = {f1_weighted:.4f}')
        accuracy = metrics.flat_accuracy_score(y_test, predictions)
        print(f'Accuracy on the test set for {language} = {accuracy:.4f}')

    except Exception as e:
        # Deliberately broad: a failure in one language is reported and the
        # loop moves on to the remaining languages.
        print(f"An error occurred while processing the {language} dataset: {e}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Processing Tamil dataset...
F1 score on the test set for Tamil = 0.8952
Accuracy on the test set for Tamil = 0.8962

Processing Malayalam dataset...
F1 score on the test set for Malayalam = 0.8628
Accuracy on the test set for Malayalam = 0.8647

Processing Tulu dataset...
F1 score on the test set for Tulu = 0.8670
Accuracy on the test set for Tulu = 0.8704

Processing Kannada dataset...
F1 score on the test set for Kannada = 0.9426
Accuracy on the test set for Kannada = 0.9436


In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

!pip install sklearn_crfsuite

import pandas as pd
import os
import numpy as np
import string
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics
from sklearn.model_selection import train_test_split

# Define function to read and format data from CSV files
def read_and_format_csv(file_path):
    """Load a token-per-row CSV and group its rows into tagged sentences.

    The CSV must provide ``Word`` and ``Tag`` columns. A row whose word is
    exactly ``'.'`` closes the current sentence and is itself dropped; rows
    whose word or tag is missing or empty are skipped.

    Both columns are read verbatim as text. pandas' default NA parsing is
    switched off because it would otherwise turn ordinary tokens such as
    ``nan``, ``NA``, ``null`` or ``None`` into missing values, which would
    silently delete them from the corpus. Reading as text also keeps
    numeric-looking tokens (e.g. ``007``) exactly as written.

    Args:
        file_path: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        A list of sentences; each sentence is a non-empty list of
        ``(word, tag)`` string tuples in file order.

    Raises:
        KeyError: If the ``Word`` or ``Tag`` column is absent.
    """
    data = pd.read_csv(file_path, dtype=str, keep_default_na=False)

    sents = []
    sent = []

    # Walk the two columns directly: ``iterrows`` builds a Series per row,
    # which is slow and may coerce values to a common dtype.
    for word, tag in zip(data['Word'], data['Tag']):
        # Missing checks come first so the string comparisons below never
        # see an NA scalar.
        if pd.isna(word) or pd.isna(tag) or word == '' or tag == '':
            continue

        if word == '.':
            # Sentence terminator: flush the sentence, never emit empty ones.
            if sent:
                sents.append(sent)
                sent = []
        else:
            sent.append((str(word), str(tag)))

    # The file may end without a closing '.'.
    if sent:
        sents.append(sent)

    return sents

# Feature extraction functions
# Set of ASCII punctuation characters, built once for per-character lookups.
_PUNCTUATION_CHARS = frozenset(string.punctuation)


def _is_punctuation(token):
    """Return True when *token* is non-empty and every character is ASCII punctuation."""
    return bool(token) and all(ch in _PUNCTUATION_CHARS for ch in token)


def _context_features(prefix, token):
    """Build the neighbour-token features, with every key prefixed by ``prefix:``."""
    return {
        f'{prefix}:word': token,
        f'{prefix}:len(word)': len(token),
        f'{prefix}:word[:3]': token[:3],
        f'{prefix}:word[:2]': token[:2],
        f'{prefix}:word[-3:]': token[-3:],
        f'{prefix}:word[-2:]': token[-2:],
        f'{prefix}:word.isdigit()': token.isdigit(),
        f'{prefix}:word.ispunctuation': _is_punctuation(token),
    }


def word2features(sent, i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    Features cover the token itself (surface form, length, prefixes and
    suffixes up to four characters, digit and punctuation flags) plus the
    same kind of features, up to three characters, for the neighbours at
    offsets -1, -2, +1 and +2 when they exist. ``BOS`` / ``EOS`` mark the
    first / last token of the sentence.

    The punctuation flags are True only when the whole token consists of
    punctuation characters. A plain ``token in string.punctuation`` would be
    a substring test and give inconsistent answers (``"()"`` True,
    ``"!!"`` False, ``""`` True).

    Args:
        sent: Sequence of ``(word, tag)`` pairs; only the word is read.
        i: Index of the token to describe.

    Returns:
        A dict mapping feature names to str, int, float or bool values.
    """
    word = sent[i][0]
    features = {
        'bias': 1.0,
        'word': word,
        'len(word)': len(word),
        'word[:4]': word[:4],
        'word[:3]': word[:3],
        'word[:2]': word[:2],
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[-4:]': word[-4:],
        'word.ispunctuation': _is_punctuation(word),
        'word.isdigit()': word.isdigit(),
    }

    # Left context.
    if i > 0:
        features.update(_context_features('-1', sent[i - 1][0]))
    else:
        features['BOS'] = True
    if i > 1:
        features.update(_context_features('-2', sent[i - 2][0]))

    # Right context.
    if i < len(sent) - 1:
        features.update(_context_features('+1', sent[i + 1][0]))
    else:
        features['EOS'] = True
    if i < len(sent) - 2:
        features.update(_context_features('+2', sent[i + 2][0]))

    return features

def sent2features(sent):
    """Map every position of ``sent`` to its feature dict, preserving order."""
    feature_rows = []
    for idx in range(len(sent)):
        feature_rows.append(word2features(sent, idx))
    return feature_rows

def sent2labels(sent):
    """Collect the tag stored at index 1 of each token entry in ``sent``."""
    return list(map(lambda entry: entry[1], sent))

# Updated datasets with correct file paths
# Shared Drive folder for every corpus; the folder name ends with a space.
_CORPUS_ROOT = "/content/drive/MyDrive/DATASET/all language correct format dataset /"

# (language, training file name, validation file name), in processing order.
_CORPUS_FILES = (
    ("Tamil", "correct_tamil_dataset.csv", "correct_tamil_validation"),
    ("Malayalam", "Final_mal_train(80%)  (1).csv", "Final_mal_dev(20%) (1).csv"),
    ("Tulu", "correct_tulu_train_set", "correct_tulu_validation_set"),
    ("Kannada", "correct_kannada_train", "correct_kannada_validation"),
)

# language -> {"train": path, "validation": path}
datasets = {
    corpus_name: {
        "train": _CORPUS_ROOT + train_file,
        "validation": _CORPUS_ROOT + validation_file,
    }
    for corpus_name, train_file, validation_file in _CORPUS_FILES
}

# Train, evaluate and report one CRF tagger per language.
for language, paths in datasets.items():
    print(f"\nProcessing {language} dataset...")

    # Skip the language outright when either of its files is missing.
    if not (os.path.exists(paths['train']) and os.path.exists(paths['validation'])):
        print(f"File paths for {language} dataset are incorrect or files do not exist.")
        continue

    try:
        # Load both splits as lists of [(word, tag), ...] sentences.
        train_sents = read_and_format_csv(paths['train'])
        test_sents = read_and_format_csv(paths['validation'])

        # An empty split cannot be trained or scored on.
        if not train_sents or not test_sents:
            print(f"Failed to load data for {language}. Check the data format.")
            continue

        # Per-sentence feature dicts and label sequences.
        X_train = [sent2features(s) for s in train_sents]
        y_train = [sent2labels(s) for s in train_sents]
        X_test = [sent2features(s) for s in test_sents]
        y_test = [sent2labels(s) for s in test_sents]

        # L-BFGS training; c1 / c2 are the L1 / L2 regularisation
        # coefficients, and all_possible_transitions also creates transition
        # features for label pairs never seen in the training data.
        crf = CRF(
            algorithm='lbfgs',
            c1=0.1,
            c2=0.1,
            max_iterations=100,
            all_possible_transitions=True
        )
        crf.fit(X_train, y_train)

        # Predict and evaluate on the validation split.
        predictions = crf.predict(X_test)

        # zero_division=0 keeps the score of a label that is never predicted
        # at 0.0 (the default outcome) without emitting one undefined-metric
        # warning per affected label and average.
        f1_weighted = metrics.flat_f1_score(
            y_test, predictions, average="weighted", zero_division=0
        )
        print(f'F1 score on the test set for {language} = {f1_weighted:.4f}')
        accuracy = metrics.flat_accuracy_score(y_test, predictions)
        print(f'Accuracy on the test set for {language} = {accuracy:.4f}')

        # Per-label precision / recall / F1 table.
        report = metrics.flat_classification_report(
            y_test, predictions, digits=3, zero_division=0
        )
        print(f"Classification report for {language}:\n{report}")

    except Exception as e:
        # Deliberately broad: a failure in one language is reported and the
        # loop moves on to the remaining languages.
        print(f"An error occurred while processing the {language} dataset: {e}")


Mounted at /content/drive
Collecting sklearn_crfsuite
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn_crfsuite)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn_crfsuite
Successfully installed python-crfsuite-0.9.10 sklearn_crfsuite-0.5.0

Processing Tamil dataset...
F1 score on the test set for Tamil = 0.8952
Accuracy on the test set for Tamil = 0.8962
Classification report for Tamil:
              precision    recall  f1-score   support

       Other      0.000     0.000     0.000         1
          en   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


F1 score on the test set for Malayalam = 0.8628
Accuracy on the test set for Malayalam = 0.8647
Classification report for Malayalam:
              precision    recall  f1-score   support

     ENGLISH      0.928     0.881     0.904      2229
   MALAYALAM      0.921     0.952     0.936      4371
       MIXED      0.752     0.437     0.553       375
        NAME      0.633     0.796     0.705       504
      NUMBER      0.989     0.887     0.935       203
       OTHER      0.519     0.563     0.540       641
       PLACE      0.917     0.349     0.506        63
         SYM      0.000     0.000     0.000         2

    accuracy                          0.865      8388
   macro avg      0.707     0.608     0.635      8388
weighted avg      0.869     0.865     0.863      8388


Processing Tulu dataset...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


F1 score on the test set for Tulu = 0.8670
Accuracy on the test set for Tulu = 0.8704
Classification report for Tulu:
              precision    recall  f1-score   support

     English      0.938     0.916     0.927       742
     Kannada      0.738     0.692     0.715       273
    Location      0.917     0.805     0.857        41
       Mixed      0.871     0.474     0.614        57
        Name      0.830     0.689     0.753       135
       Other      0.768     0.624     0.688        85
        Tulu      0.866     0.938     0.901      1251

    accuracy                          0.870      2584
   macro avg      0.847     0.734     0.779      2584
weighted avg      0.869     0.870     0.867      2584


Processing Kannada dataset...
F1 score on the test set for Kannada = 0.9426
Accuracy on the test set for Kannada = 0.9436
Classification report for Kannada:
              precision    recall  f1-score   support

          en      0.959     0.985     0.972      1109
          kn      