In [None]:
from datasets import load_dataset
dataset = load_dataset("eriktks/conll2003", trust_remote_code=True)

In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [16]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer

# Mapping of NER tags as per CoNLL-2003
# These tags are typically:
# 0: O
# 1: B-PER
# 2: I-PER
# 3: B-ORG
# 4: I-ORG
# 5: B-LOC
# 6: I-LOC
# 7: B-MISC
# 8: I-MISC
ner_tag_map = {
    0: 'O',
    1: 'B-PER',
    2: 'I-PER',
    3: 'B-ORG',
    4: 'I-ORG',
    5: 'B-LOC',
    6: 'I-LOC',
    7: 'B-MISC',
    8: 'I-MISC'
}

# Function to map numerical NER tags to BIO labels
def map_ner_tags(dataset_split):
    for example in dataset_split:
        example['ner_tags'] = [ner_tag_map[tag] for tag in example['ner_tags']]
    return dataset_split

# Apply mapping to all splits
dataset['train'] = map_ner_tags(dataset['train'])
dataset['validation'] = map_ner_tags(dataset['validation'])
dataset['test'] = map_ner_tags(dataset['test'])

# Function to extract features from a token
def extract_features(tokens, pos_tags, chunk_tags, idx):
    token = tokens[idx]
    pos = pos_tags[idx]
    chunk = chunk_tags[idx]
    
    features = {
        'token': token,
        'pos': pos,
        'chunk': chunk,
    }
    
    # Previous token features
    if idx > 0:
        features['prev_token'] = tokens[idx - 1]
        features['prev_pos'] = pos_tags[idx - 1]
    else:
        features['prev_token'] = 'BOS'  # Beginning of sentence
        features['prev_pos'] = 'BOS'
        
    # Next token features
    if idx < len(tokens) - 1:
        features['next_token'] = tokens[idx + 1]
        features['next_pos'] = pos_tags[idx + 1]
    else:
        features['next_token'] = 'EOS'  # End of sentence
        features['next_pos'] = 'EOS'
    
    return features

# Prepare the dataset for training
def prepare_data(dataset_split):
    X = []
    y = []
    for example in dataset_split:
        tokens = example['tokens']
        pos_tags = example['pos_tags']
        chunk_tags = example['chunk_tags']
        ner_tags = example['ner_tags']
        for idx in range(len(tokens)):
            X.append(extract_features(tokens, pos_tags, chunk_tags, idx))
            y.append(ner_tags[idx])
    return X, y


In [18]:

# Prepare training data
X_train, y_train = prepare_data(dataset['train'])
X_val, y_val = prepare_data(dataset['validation'])
X_test, y_test = prepare_data(dataset['test'])

# Encode the labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)
y_test_encoded = label_encoder.transform(y_test)

# Vectorize the features
vectorizer = DictVectorizer(sparse=True)

X_train_vectorized = vectorizer.fit_transform(X_train)
X_val_vectorized = vectorizer.transform(X_val)
X_test_vectorized = vectorizer.transform(X_test)

# Initialize the SVM classifier
# Using a linear kernel for efficiency; you can experiment with other kernels
classifier = svm.LinearSVC()

# Train the classifier
print("Training the SVM classifier...")
classifier.fit(X_train_vectorized, y_train_encoded)
print("Training completed.")

# Predict on the validation set
print("Predicting on the validation set...")
y_val_pred = classifier.predict(X_val_vectorized)

# Decode the labels
y_val_pred_labels = label_encoder.inverse_transform(y_val_pred)
y_val_true_labels = label_encoder.inverse_transform(y_val_encoded)

# Evaluate the model
print("Evaluation on Validation Set:")
print(classification_report(y_val_true_labels, y_val_pred_labels))


Training the SVM classifier...
Training completed.
Predicting on the validation set...
Evaluation on Validation Set:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     42759
           1       0.97      0.73      0.83      1842
           2       0.96      0.80      0.88      1307
           3       0.90      0.73      0.81      1341
           4       0.90      0.70      0.79       751
           5       0.93      0.81      0.87      1837
           6       0.91      0.79      0.84       257
           7       0.93      0.80      0.86       922
           8       0.88      0.66      0.75       346

    accuracy                           0.96     51362
   macro avg       0.93      0.78      0.85     51362
weighted avg       0.96      0.96      0.96     51362



In [19]:
# Predict on the test set
print("Predicting on the test set...")
y_test_pred = classifier.predict(X_test_vectorized)
y_test_pred_labels = label_encoder.inverse_transform(y_test_pred)
y_test_true_labels = label_encoder.inverse_transform(y_test_encoded)


# Evaluate the model
print("Evaluation on Test Set:")
print(classification_report(y_test_true_labels, y_test_pred_labels))


Predicting on the test set...
Evaluation on Test Set:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97     38323
           1       0.95      0.57      0.71      1617
           2       0.94      0.69      0.79      1156
           3       0.85      0.63      0.72      1661
           4       0.84      0.64      0.73       835
           5       0.89      0.76      0.82      1668
           6       0.80      0.67      0.73       257
           7       0.83      0.72      0.77       702
           8       0.68      0.64      0.66       216

    accuracy                           0.94     46435
   macro avg       0.86      0.70      0.77     46435
weighted avg       0.93      0.94      0.93     46435



In [55]:

# Corrected format_output function
def format_output(dataset_split, predictions):
    formatted_sentences = []
    current_index = 0  # Pointer to track the position in predictions
    
    for example in dataset_split:
        tokens = example['tokens']
        num_tokens = len(tokens)

        # Extract the predictions for the current sentence
        pred = predictions[current_index:current_index + num_tokens]
        current_index += num_tokens  # Move the pointer forward
        
        tagged_tokens = []
        for token, p in zip(tokens, pred):
            tag = ner_tag_map[p]
            if tag == 'O':
                tagged_tokens.append(token)
            else:
                # Split the tag into BIO and entity type
                try:
                    bio, entity = tag.split('-')
                    tagged_tokens.append(f"{token}_{bio}")
                except ValueError:
                    # Handle cases where the tag might not follow the expected format
                    tagged_tokens.append(token)
        
        formatted_sentence = ' '.join(tagged_tokens)
        formatted_sentences.append(formatted_sentence)
    
    return formatted_sentences

# Format the test set predictions using the corrected function
formatted_test_output = format_output(dataset['test'], y_test_pred_labels)

# Display some examples
for i in range(5):
    print(f"Input: {' '.join(dataset['test'][i]['tokens'])}")
    print(f"Output: {formatted_test_output[i]}")
    print()

Input: SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT .
Output: SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT .

Input: Nadim Ladki
Output: Nadim Ladki

Input: AL-AIN , United Arab Emirates 1996-12-06
Output: AL-AIN , United_B Arab_I Emirates_I 1996-12-06

Input: Japan began the defence of their Asian Cup title with a lucky 2-1 win against Syria in a Group C championship match on Friday .
Output: Japan_B began the defence of their Asian_B Cup_I title with a lucky 2-1 win against Syria_B in a Group C championship match on Friday .

Input: But China saw their luck desert them in the second match of the group , crashing to a surprise 2-0 defeat to newcomers Uzbekistan .
Output: But China_B saw their luck desert them in the second match of the group , crashing to a surprise 2-0 defeat to newcomers Uzbekistan .

