# Naive Bayes Model

### Import the data and prepare it for training

In [13]:
import sys
sys.path.append('../backend')
from backend import assemblage
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction import DictVectorizer

# Load the training split. The result is iterated by the training cell as
# (sentence, labels) pairs, where each sentence is a whitespace-separated string.
data = assemblage.load_data("../data/train_cleaned.txt")

### Train the Naive Bayes model

In [14]:
# Flatten the corpus into one (feature dict, label) example per labelled token.
# `data` is walked exactly once, so it may be any iterable of (sentence, labels).
examples = [
    (assemblage.extract_features_for_DT_NB(tokens, idx), label)
    for tokens, labels in ((sentence.split(), labels) for sentence, labels in data)
    for idx, label in enumerate(labels)
]
X = [feature_dict for feature_dict, _ in examples]  # Feature dictionaries
y = [label for _, label in examples]                # Labels

# DictVectorizer turns the feature dictionaries into a sparse matrix that
# scikit-learn estimators accept; it is fitted here on the training data only.
vectorizer = DictVectorizer(sparse=True)
X_vectorized = vectorizer.fit_transform(X)

# Train the Multinomial Naive Bayes model (fit() returns the estimator itself).
clf = MultinomialNB().fit(X_vectorized, y)

print("Model trained successfully.")

Model trained successfully.


### Evaluate the model

In [15]:
from sklearn.metrics import recall_score, precision_score , f1_score

# Imported locally so this cell stays self-contained; the other metric
# functions are imported at the top of the cell.
from sklearn.metrics import accuracy_score


def _build_examples(dataset):
    """Flatten (sentence, labels) pairs into per-token feature dicts and labels.

    Args:
        dataset: Iterable of (sentence, labels) pairs; each sentence is split
            on whitespace and one example is produced per entry in ``labels``.

    Returns:
        A ``(features, labels)`` tuple of two parallel lists.
    """
    features, gold = [], []
    for sentence, labels in dataset:
        tokens = sentence.split()
        for idx, label in enumerate(labels):
            features.append(assemblage.extract_features_for_DT_NB(tokens, idx))
            gold.append(label)
    return features, gold


def _evaluate_split(model, features_vectorized, gold):
    """Predict once for a split and derive every reported metric from it.

    Args:
        model: Fitted classifier exposing ``predict``.
        features_vectorized: Vectorized features for the split.
        gold: True labels for the split.

    Returns:
        ``(predictions, accuracy, recall, precision, f1)``. Recall, precision
        and F1 are macro-averaged with ``zero_division=0``.
    """
    predictions = model.predict(features_vectorized)
    macro = {"average": "macro", "zero_division": 0}
    return (
        predictions,
        # Same value as model.score(...), but reuses the predictions above
        # instead of running the classifier a second time.
        accuracy_score(gold, predictions),
        recall_score(gold, predictions, **macro),
        precision_score(gold, predictions, **macro),
        f1_score(gold, predictions, **macro),
    )


data_test = assemblage.load_data("../data/test_cleaned.txt")
data_val = assemblage.load_data("../data/val_cleaned.txt")

X_test, y_test = _build_examples(data_test)
X_val, y_val = _build_examples(data_val)

# Reuse the vectorizer fitted on the training data (transform only, no refit).
X_test_vectorized = vectorizer.transform(X_test)
X_val_vectorized = vectorizer.transform(X_val)

# Evaluate the model on every split.
(train_predictions, train_accuracy, train_recall,
 train_precision, train_f1) = _evaluate_split(clf, X_vectorized, y)
(test_predictions, test_accuracy, test_recall,
 test_precision, test_f1) = _evaluate_split(clf, X_test_vectorized, y_test)
(val_predictions, val_accuracy, val_recall,
 val_precision, val_f1) = _evaluate_split(clf, X_val_vectorized, y_val)

reports = [
    ("Train", train_accuracy, train_recall, train_precision, train_f1),
    ("Test", test_accuracy, test_recall, test_precision, test_f1),
    ("Validation", val_accuracy, val_recall, val_precision, val_f1),
]
for position, (split_name, accuracy, recall, precision, f1) in enumerate(reports):
    if position:
        # Two blank lines between consecutive split reports.
        print("\n")
    print(f"{split_name} Accuracy:", accuracy)
    print(f"{split_name} Recall:", recall)
    print(f"{split_name} Precision:", precision)
    print(f"{split_name} F1:", f1)

Train Accuracy: 0.8129811405684939
Train Recall: 0.5032359495309404
Train Precision: 0.5503921096271913
Train F1: 0.4762517205468807


Test Accuracy: 0.8102440096724555
Test Recall: 0.4808387415465718
Test Precision: 0.47164107148267054
Test F1: 0.4567870870568938


Validation Accuracy: 0.8141777430284076
Validation Recall: 0.48862108141621924
Validation Precision: 0.4930946666029553
Validation F1: 0.4681748860237034


### Save the model

In [16]:
import joblib
# Persist the fitted classifier and the vectorizer it was trained with; both
# are needed to run inference later.
for artifact, destination in (
    (clf, "../model/naive_bayes_ner_model.joblib"),
    (vectorizer, "../model/naive_bayes_vectorizer.joblib"),
):
    joblib.dump(artifact, destination)

print("Model and vectorizer saved successfully.")

# Load the model and vectorizer back from disk as a round-trip check.
# NOTE: joblib files are pickle-based, so only load artifacts you trust.
clf_loaded, vectorizer_loaded = (
    joblib.load(source)
    for source in (
        "../model/naive_bayes_ner_model.joblib",
        "../model/naive_bayes_vectorizer.joblib",
    )
)

# Test the model on a new sentence
def predict_entities(sentence, clf, vectorizer):
    """Tag every whitespace-separated token of ``sentence`` with a predicted label.

    Args:
        sentence: Raw text; it is split on whitespace into tokens.
        clf: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``; it should be the
            one the classifier was trained with.

    Returns:
        A list of ``(token, label)`` tuples in token order. An empty or
        whitespace-only sentence yields an empty list.
    """
    tokens = sentence.split()
    if not tokens:
        # Nothing to tag. scikit-learn rejects zero-sample input with a
        # ValueError, so short-circuit instead of calling transform/predict.
        return []
    features = [
        assemblage.extract_features_for_DT_NB(tokens, idx)
        for idx in range(len(tokens))
    ]
    features_vectorized = vectorizer.transform(features)
    predictions = clf.predict(features_vectorized)
    return list(zip(tokens, predictions))

# Smoke-test the reloaded artifacts on a short Arabic sentence.
test_sentence = "مرحبا انا اسمي قصي وانا اعيش في الاردن"
predictions = predict_entities(
    sentence=test_sentence,
    clf=clf_loaded,
    vectorizer=vectorizer_loaded,
)

# Show one "token: label" line per token, under a header line.
print("Predicted Entities:")
for token, label in predictions:
    print(f"{token}: {label}")

Model and vectorizer saved successfully.
Predicted Entities:
مرحبا: O
انا: O
اسمي: O
قصي: B-PER
وانا: O
اعيش: O
في: O
الاردن: B-LOC
