In [None]:
#NB

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import numpy as np
from sklearn.naive_bayes import GaussianNB
import pickle
import nltk


nltk.download('punkt')


try:
    df = pd.read_csv('train_final.csv')
except FileNotFoundError:
    raise FileNotFoundError("The file 'train_final.csv' was not found.")


assert 'text' in df.columns, "'text' column is missing in the CSV"
assert 'label' in df.columns, "'label' column is missing in the CSV"


df['text'].fillna('', inplace=True)


X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)


print("Training set class distribution:")
print(y_train.value_counts())

print("Test set class distribution:")
print(y_test.value_counts())


train_sentences = [word_tokenize(str(sentence)) for sentence in X_train]


word2vec_model = Word2Vec(train_sentences, vector_size=100, window=5, min_count=1, workers=4)


def document_vector(model, doc):
    doc = [word for word in word_tokenize(str(doc)) if word in model.wv]
    if len(doc) == 0:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[doc], axis=0)


X_train_vectors = np.array([document_vector(word2vec_model, doc) for doc in X_train])
X_test_vectors = np.array([document_vector(word2vec_model, doc) for doc in X_test])


num_zero_vectors = np.sum([np.all(vec == 0) for vec in X_train_vectors])
print(f"Number of zero vectors in training data: {num_zero_vectors}")

print(f"Mean of training vectors: {np.mean(X_train_vectors)}")
print(f"Standard deviation of training vectors: {np.std(X_train_vectors)}")


nb = GaussianNB()
nb.fit(X_train_vectors, y_train)


y_pred = nb.predict(X_test_vectors)


accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)


try:
    word2vec_model.save("word2vec_modeltrialnb.model")
    with open("classifier.pkl", "wb") as f:
        pickle.dump(nb, f)
except Exception as e:
    print(f"Error saving models: {e}")


try:
    word2vec_model = Word2Vec.load("word2vec_modeltrialnb.model")
    with open("classifier.pkl", "rb") as f:
        nb = pickle.load(f)
except Exception as e:
    print(f"Error loading models: {e}")


new_data = ["Yo! my doc asked me to take some medicines for fever",
            "I hate him"]
new_data_vectors = np.array([document_vector(word2vec_model, doc) for doc in new_data])
new_predictions = nb.predict(new_data_vectors)

print("Predictions:", new_predictions)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\green\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
C:\Users\green\AppData\Local\Temp\ipykernel_17720\76659239.py:27: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['text'].fillna('', inplace=True)
Training set class distribution:
label
1    2340
0     719
2     557
Name: count, dtype: int64
Test set class distribution:
label
1        574
0        193
2        137
label      1
Name: count, dtype: int64
Number of zero vectors in training data: 1
Mean of training vectors: -0.00650674002119469
Standard deviation of training vectors: 0.4803362452375843
Accuracy: 0.33480662983425413
Classification Report:
              precision    recall  f1-score   support

           0       0.25      0.81      0.38       193
           1       0.77      0.18      0.29       574
           2       0.32      0.33      0.32       137
       label       0.00      0.00      0.00         1
...
   macro avg       0.33      0.33      0.25       905
weighted avg       0.59      0.33      0.31       905

Predictions: ['0' '0']
Output is truncated. View as a scrollable element or open in a text editor. Adjust cell output settings...
c:\Users\green\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_classification.py:1509: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
c:\Users\green\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_classification.py:1509: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
c:\Users\green\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_classification.py:1509: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

In [None]:
from gensim.models import Word2Vec
import pickle


word2vec_model = Word2Vec.load("word2vec_modeltrialdt.model")


with open("classifier.pkl", "rb") as f:
    clf = pickle.load(f)


In [None]:

def document_vector(model, doc):
    doc = [word for word in word_tokenize(str(doc)) if word in model.wv]
    if len(doc) == 0:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[doc], axis=0)


new_data = [
    "Yo! my doc asked me to take some medicines for fever",
    "i hate him for real"
    
]


new_data_vectors = np.array([document_vector(word2vec_model, doc) for doc in new_data])


new_predictions = clf.predict(new_data_vectors)

print("Predictions:", new_predictions)


Predictions: ['2' '2']

In [None]:
try:
    word2vec_model = Word2Vec.load("word2vec_modeltrialnb.model")
    with open("classifier.pkl", "rb") as f:
        nb = pickle.load(f)
except Exception as e:
    print(f"Error loading models: {e}")


new_data = ["i hate her",
            "You almost gave me a heart attack"]
new_data_vectors = np.array([document_vector(word2vec_model, doc) for doc in new_data])
new_predictions = nb.predict(new_data_vectors)

print("Predictions:", new_predictions)

Predictions: ['0' '2']