In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from tqdm import tqdm
from snowballstemmer import NepaliStemmer

In [None]:
# Load the dataset
# The rest of this cell reads the 'text' and 'class' columns from this frame.
df = pd.read_csv('dataset/output/merged_data.csv')
# NOTE(review): a notebook only renders the value of a cell's last expression, so this
# preview is presumably discarded here -- move it to its own cell or wrap it in display().
df.sample(5)

# Function to load stopwords and punctuation words from a file
def load_stopwords(file_path):
    """Read a UTF-8 text file and return its lines as a list of stop words.

    Each line is stripped of surrounding whitespace. Blank lines are kept
    and show up as empty strings in the result.

    Args:
        file_path: Path of the stop-word file, one entry per line.

    Returns:
        A list with one stripped string per line of the file.
    """
    with open(file_path, encoding='utf-8') as handle:
        return [line.strip() for line in handle]

def load_punctuation(file_path):
    """Read a UTF-8 text file and return its lines as a list of punctuation tokens.

    Surrounding whitespace (including the trailing newline) is removed from
    every line. Blank lines are kept as empty strings.

    Args:
        file_path: Path of the punctuation file, one entry per line.

    Returns:
        A list with one stripped string per line of the file.
    """
    with open(file_path, encoding='utf-8') as source:
        stripped_lines = map(str.strip, source)
        return list(stripped_lines)

# Preprocess the text
def preprocess_text(data, stop_words, punctuation_words, stemmer, noise_chars):
    """Clean, filter and stem a collection of Nepali documents.

    For every document the text is split on whitespace, each token is reduced
    to its Devanagari characters (U+0900-U+097F), stop words and punctuation
    tokens are dropped, and the remaining tokens are stemmed. A stemmed token
    is kept only if it is longer than one character and contains none of the
    ``noise_chars``.

    Args:
        data: Iterable of documents (e.g. a pandas Series of strings).
        stop_words: Collection of words to discard.
        punctuation_words: Collection of punctuation tokens to discard.
        stemmer: Object exposing ``stemWord(word) -> str``.
        noise_chars: Iterable of substrings; any stemmed token containing one
            of them is discarded.

    Returns:
        A list of cleaned documents (tokens joined by single spaces), one per
        input document and in the same order.
    """
    # Build the lookup once: membership tests on a set are O(1), whereas the
    # lists returned by the loaders would be scanned for every single token.
    excluded = set(stop_words) | set(punctuation_words)

    clean_text = []
    for row in tqdm(data, desc="Preprocessing text data", unit="text"):
        # Missing cells arrive from pandas as NaN (a float). Emit an empty
        # document instead of crashing so the output stays aligned with the
        # caller's labels.
        if not isinstance(row, str):
            clean_text.append("")
            continue

        nwords = []
        # split() without an argument breaks on any whitespace run. Splitting
        # on " " alone left tab/newline-separated words in one token, and the
        # character filter below then glued them into a single bogus word.
        for word in row.split():
            # remove anything other than nepali characters
            word = "".join(char for char in word if '\u0900' <= char <= '\u097F')
            if not word or word in excluded:
                continue
            word = stemmer.stemWord(word)
            # Devanagari digits live inside the kept Unicode range, so digit
            # filtering has to happen here rather than in the character filter.
            if len(word) > 1 and not any(noise in word for noise in noise_chars):
                nwords.append(word)
        clean_text.append(" ".join(nwords))
    return clean_text

# Create the model pipeline
def create_model(ngram_range=(1, 2), alpha=1.0, class_prior=None, model_type='naive_bayes'):
    """Build a bag-of-words -> TF-IDF -> classifier pipeline.

    The steps are named ``'vectorizer'``, ``'tfidf'`` and ``'classifier'``;
    other code in this file looks them up by these names.

    Args:
        ngram_range: ``(min_n, max_n)`` passed to ``CountVectorizer``.
        alpha: Smoothing parameter for ``MultinomialNB``. Ignored when
            ``model_type`` is ``'logistic_regression'``.
        class_prior: Class priors for ``MultinomialNB``. Ignored when
            ``model_type`` is ``'logistic_regression'``.
        model_type: ``'naive_bayes'`` or ``'logistic_regression'``.

    Returns:
        An unfitted ``Pipeline`` that expects raw text documents as input.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values.
    """
    # The default token_pattern is built on \w, which in Python's re module
    # does not match Devanagari combining signs (vowel marks, virama). That
    # splits or drops most Nepali words. The input is already whitespace
    # separated, so treat every run of non-space characters as one token.
    vectorizer = CountVectorizer(ngram_range=ngram_range, token_pattern=r"\S+")
    tfidf_transformer = TfidfTransformer()

    if model_type == 'naive_bayes':
        model = MultinomialNB(alpha=alpha, class_prior=class_prior)
    elif model_type == 'logistic_regression':
        model = LogisticRegression(max_iter=1000)
    else:
        raise ValueError("Unknown model type")

    pipeline = Pipeline([
        ('vectorizer', vectorizer),
        ('tfidf', tfidf_transformer),
        ('classifier', model),
    ])
    return pipeline

# Map every distinct label in df['class'] to an integer index. Keys are inserted in the
# order returned by Series.unique().
# NOTE(review): pandas documents unique() as order-of-appearance, not sorted -- do not
# assume this key order matches the sorted label order scikit-learn metrics use.
classes = {c: i for i, c in enumerate(df['class'].unique())}

# Hyperparameter tuning using GridSearchCV
def tune_hyperparameters(X_train, Y_train, pipeline):
    """Grid-search the pipeline's n-gram range and classifier regularisation.

    Args:
        X_train: Training documents. Because the pipeline starts with the
            ``'vectorizer'`` step, these must be raw text, not an already
            vectorised matrix.
        Y_train: Training labels aligned with ``X_train``.
        pipeline: Pipeline with steps named ``'vectorizer'`` and
            ``'classifier'``.

    Returns:
        The best estimator found, refitted on the full training data.
    """
    classifier = pipeline.named_steps['classifier']
    param_grid = {
        'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    }
    if isinstance(classifier, MultinomialNB):
        param_grid['classifier__alpha'] = [0.1, 0.5, 1.0, 2.0]
    elif isinstance(classifier, LogisticRegression):
        # LogisticRegression has no `alpha`; its regularisation knob is `C`
        # (inverse strength). Asking the grid search for classifier__alpha
        # raised an invalid-parameter error on every fit.
        param_grid['classifier__C'] = [0.1, 1.0, 10.0]
    elif 'alpha' in classifier.get_params():
        # Any other estimator that does expose `alpha` keeps the original grid.
        param_grid['classifier__alpha'] = [0.001, 0.01, 0.1]

    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy')
    grid_search.fit(X_train, Y_train)
    print("Best parameters found: ", grid_search.best_params_)
    return grid_search.best_estimator_

# Model fitting and evaluation
def train_and_evaluate(X_train, Y_train, X_test, Y_test, pipeline):
    """Fit the pipeline, print a classification report and plot the confusion matrix.

    Args:
        X_train: Training documents accepted by ``pipeline.fit``.
        Y_train: Training labels aligned with ``X_train``.
        X_test: Held-out documents.
        Y_test: Held-out labels aligned with ``X_test``.
        pipeline: Classifier pipeline to fit. It is fitted in place.

    Returns:
        The fitted pipeline (the same object that was passed in).
    """
    pipeline.fit(X_train, Y_train)
    y_pred = pipeline.predict(X_test)

    print(classification_report(Y_test, y_pred))

    # Take the label order from the fitted model and hand it to
    # confusion_matrix explicitly, so the tick labels are guaranteed to match
    # the matrix rows and columns. Labelling the axes from a separately built
    # mapping can silently mislabel them when its order differs.
    labels = list(pipeline.classes_)
    cm = confusion_matrix(Y_test, y_pred, labels=labels)
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title("Confusion Matrix")
    plt.show()

    return pipeline

# Save model and vectorizer to files
def save_model(model, model_filename, vectorizer_filename):
    """Pickle the fitted pipeline and, separately, its vectorizer step.

    Args:
        model: Fitted pipeline exposing ``named_steps['vectorizer']``.
        model_filename: Destination path for the whole pipeline.
        vectorizer_filename: Destination path for the vectorizer step alone.
    """
    # Context managers make sure both files are flushed and closed even if
    # pickling fails; the bare open() calls leaked the handles.
    with open(model_filename, 'wb') as model_file:
        pickle.dump(model, model_file)
    with open(vectorizer_filename, 'wb') as vectorizer_file:
        pickle.dump(model.named_steps['vectorizer'], vectorizer_file)

# Load stopwords and punctuation words
stop_words = load_stopwords("nepali_stopwords.txt")
punctuation_words = load_punctuation("nepali_punctuation.txt")

# Create a stemmer
stemmer = NepaliStemmer()

# Define noise characters (ASCII and Devanagari digits)
noise_chars = "1,2,3,4,5,6,7,8,9,0,०,१,२,३,४,५,६,७,८,९".split(",")

# Preprocess text data
data = pd.DataFrame()
data["text"] = preprocess_text(df["text"], stop_words, punctuation_words, stemmer, noise_chars)
data["label"] = df["class"]

# Split data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(data["text"], data["label"], test_size=0.2, random_state=42)

# imblearn's Pipeline accepts samplers as steps and applies them only while
# fitting, so SMOTE can live inside the pipeline.
from imblearn.pipeline import Pipeline as ImbPipeline

# Create the model pipeline
base_pipeline = create_model(model_type='naive_bayes')  # You can also experiment with 'logistic_regression'

# Resample inside the pipeline, after feature extraction.
# Vectorising and resampling up front and then passing the resulting sparse
# matrix to a pipeline whose first step is a CountVectorizer made every
# GridSearchCV fit fail with
# "'csr_matrix' object has no attribute 'lower'".
# With SMOTE as a step, each CV fold vectorises and oversamples only its own
# training part, which also keeps synthetic samples out of the validation folds.
smote = SMOTE(sampling_strategy='auto', random_state=42)
pipeline = ImbPipeline([
    ('vectorizer', base_pipeline.named_steps['vectorizer']),
    ('tfidf', base_pipeline.named_steps['tfidf']),
    ('smote', smote),
    ('classifier', base_pipeline.named_steps['classifier']),
])

# Tune hyperparameters (on raw text; the pipeline does its own vectorising)
best_pipeline = tune_hyperparameters(X_train, Y_train, pipeline)

# Train and evaluate the model
model = train_and_evaluate(X_train, Y_train, X_test, Y_test, best_pipeline)

# Save the trained model and vectorizer
save_model(model, 'news_pred_model_v1.pickle', 'news_pred_vectorizer_v1.pickle')

# Example of predicting new text
new_text = """
काठमाडौँ — नेपाली यू–१९ महिला क्रिकेट टोलीले आईसीसी यू–१९ महिला ट्वान्टी–२० विश्वकपको एसिया छनोटमा तेस्रो जित हात पारेको छ । नेपालले यूएईमा भइरहेको प्रतियोगितामा आइतबार थाइल्यान्डलाई १ सय ६ रनले पराजित गर्‍यो ।
"""
# The model was trained on cleaned, stemmed text, so new input must go through
# the same preprocessing before prediction.
clean_new_text = preprocess_text([new_text], stop_words, punctuation_words, stemmer, noise_chars)
prediction = model.predict(clean_new_text)
print("Predicted category:", prediction)


Preprocessing text data: 100%|██████████| 4890/4890 [00:24<00:00, 203.43text/s]


ValueError: 
All the 36 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
36 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Suyash Shrestha\AppData\Local\pypoetry\Cache\virtualenvs\news-title-classifier-Q5gfBSGz-py3.12\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Suyash Shrestha\AppData\Local\pypoetry\Cache\virtualenvs\news-title-classifier-Q5gfBSGz-py3.12\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Suyash Shrestha\AppData\Local\pypoetry\Cache\virtualenvs\news-title-classifier-Q5gfBSGz-py3.12\Lib\site-packages\sklearn\pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Suyash Shrestha\AppData\Local\pypoetry\Cache\virtualenvs\news-title-classifier-Q5gfBSGz-py3.12\Lib\site-packages\sklearn\pipeline.py", line 406, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Suyash Shrestha\AppData\Local\pypoetry\Cache\virtualenvs\news-title-classifier-Q5gfBSGz-py3.12\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Suyash Shrestha\AppData\Local\pypoetry\Cache\virtualenvs\news-title-classifier-Q5gfBSGz-py3.12\Lib\site-packages\sklearn\pipeline.py", line 1310, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Suyash Shrestha\AppData\Local\pypoetry\Cache\virtualenvs\news-title-classifier-Q5gfBSGz-py3.12\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Suyash Shrestha\AppData\Local\pypoetry\Cache\virtualenvs\news-title-classifier-Q5gfBSGz-py3.12\Lib\site-packages\sklearn\feature_extraction\text.py", line 1372, in fit_transform
    vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Suyash Shrestha\AppData\Local\pypoetry\Cache\virtualenvs\news-title-classifier-Q5gfBSGz-py3.12\Lib\site-packages\sklearn\feature_extraction\text.py", line 1259, in _count_vocab
    for feature in analyze(doc):
                   ^^^^^^^^^^^^
  File "c:\Users\Suyash Shrestha\AppData\Local\pypoetry\Cache\virtualenvs\news-title-classifier-Q5gfBSGz-py3.12\Lib\site-packages\sklearn\feature_extraction\text.py", line 108, in _analyze
    doc = preprocessor(doc)
          ^^^^^^^^^^^^^^^^^
  File "c:\Users\Suyash Shrestha\AppData\Local\pypoetry\Cache\virtualenvs\news-title-classifier-Q5gfBSGz-py3.12\Lib\site-packages\sklearn\feature_extraction\text.py", line 66, in _preprocess
    doc = doc.lower()
          ^^^^^^^^^
AttributeError: 'csr_matrix' object has no attribute 'lower'. Did you mean: 'power'?
