<a href="https://colab.research.google.com/github/rinshan-bot/natutal-language-processing/blob/main/Word_Representations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from keras.models import Sequential
from keras.layers import Embedding, Dense

# Suppose we have the following data for training NER
sentences = ["John lives in New York City", "Mary works at a company in London"]
labels = ["PERSON", "LOCATION", "LOCATION"]

# Suppose we have pre-trained word embeddings and Brown clusters
word_embeddings = {}  # Pre-trained word embeddings
brown_clusters = {}  # Pre-trained Brown clusters

# Define the maximum path prefix sizes for Brown clusters
path_prefix_sizes = [4, 6, 10, 20]

# Prepare input features
features = []
for sentence in sentences:
    sentence_features = []
    for word in sentence.split():
        # Get word embedding feature
        if word in word_embeddings:
            word_embedding_feature = word_embeddings[word]
        else:
            word_embedding_feature = np.zeros(300)  # Use zero vector for out-of-vocabulary words
        sentence_features.append(word_embedding_feature)

        # Get Brown cluster features
        for prefix_size in path_prefix_sizes:
            path_prefix = get_path_prefix(word, prefix_size)
            if path_prefix in brown_clusters:
                cluster_feature = brown_clusters[path_prefix]
            else:
                cluster_feature = np.zeros(12)  # Use zero vector for out-of-cluster words
            sentence_features.append(cluster_feature)

    features.append(sentence_features)

# Convert features to numpy arrays
features = np.array(features)

# One-hot encode labels
label_encoder = OneHotEncoder(sparse=False)
labels_encoded = label_encoder.fit_transform(np.array(labels).reshape(-1, 1))

# Define the neural network architecture
model = Sequential()
model.add(Embedding(input_dim=features.shape[1], output_dim=128, input_length=features.shape[1]))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=label_encoder.categories_[0].shape[0], activation='softmax'))

# Compile and train the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(features, labels_encoded, epochs=10, batch_size=32)

# Predict on new data
new_sentences = ["Peter is visiting Paris"]
new_features = []
for sentence in new_sentences:
    new_sentence_features = []
    for word in sentence.split():
        # Get word embedding feature
        if word in word_embeddings:
            word_embedding_feature = word_embeddings[word]
        else:
            word_embedding_feature = np.zeros(300)
        new_sentence_features.append(word_embedding_feature)

        # Get Brown cluster features
        for prefix_size in path_prefix_sizes:
            path_prefix = get_path_prefix(word, prefix_size)
            if path_prefix in brown_clusters:
                cluster_feature = brown_clusters[path_prefix]
            else:
                cluster_feature = np.zeros(12)
            new_sentence_features.append(cluster_feature)

    new_features.append(new_sentence_features)

new_features = np.array(new_features)
predictions = model.predict(new_features)

# Decode predicted labels
predicted_labels = label_encoder.inverse_transform(predictions.argmax(axis=1))
print(predicted_labels)


NameError: ignored