In [None]:
import pandas as pd
import spacy
import nltk
import random
import string
import pickle
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from spacy.training import Example
from fastapi import FastAPI, HTTPException, Depends
from pydantic import BaseModel
import uvicorn
import jwt

# Download the NLTK resources this script relies on. 'stopwords' backs the
# stopwords.words('english') call in preprocess_text; 'punkt' is presumably
# the tokenizer data needed by word_tokenize (TODO confirm for the installed
# NLTK version). Both downloads run every time this module/cell executes.
nltk.download('punkt')
nltk.download('stopwords')

# Create a blank English spaCy pipeline and keep it as the module-level `nlp`.
# train_ner_model adds the "ner" component to this object and trains it.
nlp = spacy.blank("en")

# Load dataset (assuming CoNLL-2003 format)
def load_data(file_path):
    """Read a token-per-line (CoNLL-style) file into per-sentence lists.

    Every non-blank line is split on whitespace; the first field is taken as
    the token and the last field as its tag. A blank line ends the current
    sentence. The last sentence is kept even when the file does not end with
    a blank line.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A list of ``(tokens, tags)`` tuples, one per sentence, where both
        elements are equally long lists of strings.

    Raises:
        OSError: If the file cannot be opened.
    """
    data = []
    sentence, entities = [], []
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            parts = line.split()
            if parts:
                sentence.append(parts[0])
                entities.append(parts[-1])
            elif sentence:
                # Blank line: close the sentence collected so far.
                data.append((sentence, entities))
                sentence, entities = [], []
    # Flush the trailing sentence; without this it is lost whenever the file
    # has no terminating blank line.
    if sentence:
        data.append((sentence, entities))
    # NOTE(review): marker lines such as "-DOCSTART- -X- -X- O" are not
    # special-cased and come back as ordinary one-token sentences.
    return data

# Preprocess text data
def preprocess_text(text):
    """Lowercase *text*, tokenize it, and drop stop words and punctuation.

    Tokens come from NLTK's ``word_tokenize``. A token is discarded when it is
    an English stop word or when it occurs as a substring of
    ``string.punctuation`` (so single punctuation characters are removed).

    Returns:
        The remaining tokens, in their original order, as a list of strings.
    """
    tokens = word_tokenize(text.lower())
    english_stops = set(stopwords.words('english'))
    kept = []
    for token in tokens:
        if token in english_stops:
            continue
        if token in string.punctuation:
            continue
        kept.append(token)
    return kept

# Convert dataset into Spacy format
def convert_to_spacy_format(data):
    """Turn ``(tokens, tags)`` sentences into spaCy-style training tuples.

    Each sentence is joined with single spaces, and every token whose tag is
    not ``'O'`` becomes one ``(start, end, label)`` character span in that
    joined text. Offsets are derived from the running token lengths rather
    than from a substring search, so a token is never matched inside an
    earlier token or at an earlier occurrence of the same word.

    Args:
        data: Iterable of ``(tokens, tags)`` pairs, e.g. the output of
            ``load_data``.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples.
        Tags are passed through unchanged as labels, so each tagged token
        yields its own span.
    """
    formatted_data = []
    for sentence, entities in data:
        text = " ".join(sentence)
        ents = []
        start = 0
        for word, label in zip(sentence, entities):
            end = start + len(word)
            if label != 'O':
                ents.append((start, end, label))
            # Step over the single space that " ".join put after this token.
            start = end + 1
        formatted_data.append((text, {"entities": ents}))
    return formatted_data

# Read the sentences from the file named "train" (path is relative to the
# working directory) and convert them to (text, {"entities": [...]}) tuples.
# preprocess_text is not applied anywhere in this pipeline.
data_path = "train"
data = load_data(data_path)
preprocessed_data = convert_to_spacy_format(data)

# Hold out 20% of the examples as test_data; random_state=42 fixes the seed
# passed to train_test_split so the split is the same on every run.
train_data, test_data = train_test_split(preprocessed_data, test_size=0.2, random_state=42)

# Train NER model
def train_ner_model(train_data, n_iter=10):
    """Train the NER component of the module-level ``nlp`` pipeline.

    An "ner" pipe is added to ``nlp`` if it is not present yet, every label
    found in *train_data* is registered on it, the pipeline is initialized,
    and then updated one example at a time for *n_iter* shuffled passes. The
    accumulated losses are printed after each pass.

    Args:
        train_data: Iterable of ``(text, annotations)`` pairs where
            ``annotations["entities"]`` holds ``(start, end, label)`` tuples.
            The caller's sequence is not reordered.
        n_iter: Number of passes over the training examples.

    Returns:
        The module-level ``nlp`` object, trained in place.
    """
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner")
    else:
        ner = nlp.get_pipe("ner")

    # Shuffle a private copy so the caller's list keeps its order.
    examples = list(train_data)

    for _, annotations in examples:
        for ent in annotations["entities"]:
            ner.add_label(ent[2])

    # `initialize` is the spaCy v3 name of the deprecated `begin_training`;
    # it returns the optimizer, which is passed to `update` explicitly.
    optimizer = nlp.initialize()
    for _ in range(n_iter):
        random.shuffle(examples)
        losses = {}
        for text, annotations in examples:
            example = Example.from_dict(nlp.make_doc(text), annotations)
            nlp.update([example], drop=0.5, sgd=optimizer, losses=losses)
        print(f"Losses: {losses}")

    return nlp

# Train on the training split with the default n_iter=10. train_ner_model
# returns the module-level `nlp`, so trained_nlp and nlp are the same object.
trained_nlp = train_ner_model(train_data)

# Serialize the whole trained pipeline object with pickle into
# "ner_model.pkl" in the working directory.
# NOTE(review): spaCy also offers nlp.to_disk()/spacy.load() for saving
# pipelines; whatever loads this file must use pickle.load, which should only
# be done on files from a trusted source — confirm this is the intended format.
model_path = "ner_model.pkl"
with open(model_path, "wb") as model_file:
    pickle.dump(trained_nlp, model_file)
print(f"Model saved to {model_path}")
