<a href="https://colab.research.google.com/github/rafiqulcse/Natural-Language-Processing-Project/blob/main/NER_with_CNN_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
#sentences = ["Apple is a tech company.", "New York is a big city.", "New York is a big city.", "New York is a big city.", "New York is a big city."]  # List of sentences
#labels = ["ORG", "LOC", "LOC", "LOC", "LOC"]

In [3]:
github_url = "https://raw.githubusercontent.com/rafiqulcse/Natural-Language-Processing-Project/main/Dataset/NER_Dataset.csv"

df = pd.read_csv(github_url)
df.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [4]:
sentences = []
labels = []

for i in range(len(df)):
  first_element = str(df.iloc[i][0])
  second_element = str(df.iloc[i][0])

  sentences.append(first_element)
  labels.append(second_element)

In [5]:
# Tokenization and preprocessing
tokenizer = tf.keras.layers.TextVectorization(output_sequence_length=50)  # You can adjust the sequence length
tokenizer.adapt(sentences)
sequences = tokenizer(sentences).numpy()

In [6]:
# Label encoding
label_set = set(labels)
label2id = {label: idx for idx, label in enumerate(label_set)}
id2label = {idx: label for label, idx in label2id.items()}
label_ids = [label2id[label] for label in labels]

In [7]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(sequences, label_ids, test_size=0.2, random_state=42)

In [8]:
# CNN-based NER model
model = Sequential([
    Embedding(input_dim=len(tokenizer.get_vocabulary()), output_dim=32, input_length=50),
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(2),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(label_set), activation='softmax')
])

In [9]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 32)            3712      
                                                                 
 conv1d (Conv1D)             (None, 46, 128)           20608     
                                                                 
 max_pooling1d (MaxPooling1  (None, 23, 128)           0         
 D)                                                              
                                                                 
 flatten (Flatten)           (None, 2944)              0         
                                                                 
 dense (Dense)               (None, 64)                188480    
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                        

In [10]:
import numpy as np

# Convert your lists to NumPy arrays
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)
batch_size = 32
epochs = 10

# Then, you can use model.fit as before
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [11]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test loss: {loss:.4f}, Test accuracy: {accuracy:.4f}")


Test loss: 0.9170, Test accuracy: 0.7990
