In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read while downloading.
CHUNK_SIZE = 40960
# Comma-separated "<directory>:<URL-encoded download URL>" entries; the import loop
# further down splits on ',' and then on ':'.
# NOTE(review): the URL carries X-Goog-Date / X-Goog-Expires / X-Goog-Signature query
# parameters, so it is presumably a time-limited signed link - regenerate it if the
# download fails.
DATA_SOURCE_MAPPING = 'inshorts-dataset-hindi:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F3758343%2F9253476%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20241012%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20241012T175143Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D801a8c527094f53e042b611e6df7c08a19fa9e8b971968fe1122c6dbef1a3a56f57e2338b4104c15a6356f376b0ec281225f34c80970807df1ed2c0b1f0513317fb988520bab02a1611253802d4c1a11c1b671e524a4a830edb3d0fd79659e24badbd6f11c159a62c95febc3f25bde708d9b88c590c2c44847b0ab4cf2ffe859e1b09b3de75a3d36d9a332dfa85fa2e4c89fe0ce28e9ef18f30783e2f1698d4a67dbe404dfaed614dec7b76b3c2ac78e48a0ec44f67b8ea45058f0f87c514f8754c3411093a30be5716adf9332aa5fe1b99cb37b1f1f3757eb4e148748a1f47325d9953f9f47cd71ed4db8eb0da88227ed93aee53057700ee9a3bf100e4c08bd'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every mapped data source and unpack it beneath KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<URL-encoded URL>"; split on the first ':' only so the
    # pair always unpacks into exactly two parts.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The server may omit Content-Length; the progress bar is skipped in that case
            # instead of crashing on int(None) or dividing by zero.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else None
            size_label = total_length if total_length is not None else 'unknown'
            print(f'Downloading {directory}, {size_label} bytes compressed')

            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if total_length:
                    done = int(50 * dl / total_length)
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)

            # tarfile re-opens the temp file by name, so buffered bytes must reach disk first.
            tfile.flush()

            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name so the `tarfile` module is not shadowed.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is handled before OSError because it is the more specific failure.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


Downloading inshorts-dataset-hindi, 40960800 bytes compressed
Downloaded and uncompressed: inshorts-dataset-hindi
Data source import complete.


# New section

In [None]:
!pip install indic-nlp-library
!pip install datasets




In [None]:
# Imports for tokenization, stopword removal, and one-hot encoding
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from indicnlp.tokenize.sentence_tokenize import sentence_split
from indicnlp.tokenize import indic_tokenize

In [None]:
# Read the Hindi news CSV and discard the 'Date' column, which is not used afterwards
df = pd.read_csv("/kaggle/input/inshorts-dataset-hindi/hindi_news_dataset.csv").drop(columns=['Date'])

# Preview the first rows of the loaded frame
print(df.head())

                                            Headline  \
0  कांग्रेस नेता बलजिंदर सिंह की पंजाब में घर के ...   
1  केंद्रीय मंत्री बोले- महिला आरक्षण लाने का साह...   
2  ओपीएस लागू करने से अस्थिर हो सकती है राज्यों क...   
3  तमिलनाडु में शावरमा खाने से 14 वर्षीय छात्रा क...   
4  मणिपुर में मुख्यमंत्री के आश्वासन के बाद मारे ...   

                                             Content           News Categories  
0  कांग्रेस नेता बलजिंदर सिंह की सोमवार को पंजाब ...              ['national']  
1  केंद्रीय मंत्री प्रह्लाद पटेल ने लोकसभा और विध...  ['politics', 'national']  
2  आरबीआई के 5 अधिकारियों ने एक लेख में लिखा है क...  ['business', 'national']  
3  नामक्कल (तमिलनाडु) में शावरमा खाने से सोमवार क...              ['national']  
4  मणिपुर के मुख्यमंत्री एन बीरेन सिंह के आश्वासन...              ['national']  


In [None]:
# Join each headline with its article body (single space in between) into 'Headline_Content'
df['Headline_Content'] = df['Headline'].str.cat(df['Content'], sep=' ')

# Preview the combined text
print(df['Headline_Content'].head())


0    कांग्रेस नेता बलजिंदर सिंह की पंजाब में घर के ...
1    केंद्रीय मंत्री बोले- महिला आरक्षण लाने का साह...
2    ओपीएस लागू करने से अस्थिर हो सकती है राज्यों क...
3    तमिलनाडु में शावरमा खाने से 14 वर्षीय छात्रा क...
4    मणिपुर में मुख्यमंत्री के आश्वासन के बाद मारे ...
Name: Headline_Content, dtype: object


In [None]:
# Step 1: Sentence Tokenization using IndicNLP
# Splits the combined 'Headline_Content' text of every row into a list of sentences
def _split_hindi_sentences(text):
    """Return the sentences of `text` as produced by IndicNLP's Hindi sentence splitter."""
    return sentence_split(text, lang='hi')

df['Sentences'] = df['Headline_Content'].apply(_split_hindi_sentences)
print(df['Sentences'].head())


0    [कांग्रेस नेता बलजिंदर सिंह की पंजाब में घर के...
1    [केंद्रीय मंत्री बोले- महिला आरक्षण लाने का सा...
2    [ओपीएस लागू करने से अस्थिर हो सकती है राज्यों ...
3    [तमिलनाडु में शावरमा खाने से 14 वर्षीय छात्रा ...
4    [मणिपुर में मुख्यमंत्री के आश्वासन के बाद मारे...
Name: Sentences, dtype: object


In [None]:
# Step 2: Word Tokenization using IndicNLP
# Tokenizes the full 'Headline_Content' text of every row into a flat list of tokens
word_tokenizer = indic_tokenize.trivial_tokenize
df['Words'] = df['Headline_Content'].apply(word_tokenizer)
print(df['Words'].head())


0    [कांग्रेस, नेता, बलजिंदर, सिंह, की, पंजाब, में...
1    [केंद्रीय, मंत्री, बोले, -, महिला, आरक्षण, लान...
2    [ओपीएस, लागू, करने, से, अस्थिर, हो, सकती, है, ...
3    [तमिलनाडु, में, शावरमा, खाने, से, 14, वर्षीय, ...
4    [मणिपुर, में, मुख्यमंत्री, के, आश्वासन, के, बा...
Name: Words, dtype: object


In [None]:
# Step 3: Stopword Removal using predefined stopword list
# The stopword file holds one stopword per line; a set gives fast membership checks.
with open('/content/final-stopwords-hi.txt', 'r', encoding='utf-8') as file:
    stop_words = {line for line in file.read().splitlines()}


def _without_stopwords(words):
    """Return the tokens of `words` that are not in `stop_words`, order preserved."""
    return [word for word in words if word not in stop_words]


# 'Words' holds one token list per row; keep only the non-stopword tokens
df['Filtered_Words'] = df['Words'].apply(_without_stopwords)

print(df['Filtered_Words'].head())

0    [कांग्रेस, नेता, बलजिंदर, सिंह, पंजाब, गोली, म...
1    [केंद्रीय, मंत्री, बोले, -, महिला, आरक्षण, लान...
2    [ओपीएस, लागू, अस्थिर, राज्यों, वित्तीय, स्थिति...
3    [तमिलनाडु, शावरमा, खाने, 14, वर्षीय, छात्रा, म...
4    [मणिपुर, मुख्यमंत्री, आश्वासन, मारे, सैनिक, शव...
Name: Filtered_Words, dtype: object


In [None]:
# Step 4: One-Hot Encoding and Printing Encodings
import ast


def _parse_categories(raw):
    """Return the category list for one row.

    Values read from the CSV are strings such as "['politics', 'national']".
    They are parsed with ast.literal_eval, which only accepts Python literals,
    instead of eval, which would execute arbitrary code embedded in the data.
    Values that are already lists are returned unchanged so the cell can be re-run.
    """
    return raw if isinstance(raw, list) else ast.literal_eval(raw)


# Ensure the 'News Categories' are in list format
df['News Categories'] = df['News Categories'].apply(_parse_categories)

# Apply MultiLabelBinarizer for one-hot encoding of 'News Categories'
mlb = MultiLabelBinarizer()
one_hot_categories = mlb.fit_transform(df['News Categories'])

# Build the indicator frame on df's own index so the column-wise concat aligns row by row
# even if df does not carry a default 0..n-1 index.
encoded_categories_df = pd.DataFrame(one_hot_categories, columns=mlb.classes_, index=df.index)
df = pd.concat([df, encoded_categories_df], axis=1)

# Print the column position assigned to each category
category_encoding = {category: position for position, category in enumerate(mlb.classes_)}
print("Category Encodings:")
for category, encoding in category_encoding.items():
    print(f"{category}: {encoding}")


Category Encodings:
automobile: 0
business: 1
entertainment: 2
facts: 3
fashion: 4
hatke: 5
miscellaneous: 6
national: 7
politics: 8
sports: 9
startup: 10
technology: 11
travel: 12
world: 13
आईपीएल-_2024: 14
आईपीएल_2023: 15
आईपीएल_2024: 16
एक्सप्लेनर: 17
एजुकेशन: 18
एजुकेशन_और_जॉब: 19
एशियन_गेम्स_2022: 20
एशिया_कप_2023: 21
केंद्रीय_बजट_2023-24: 22
कोरोना_वायरस: 23
टी20_विश्व_कप_2024: 24
पेरिस_ओलंपिक्स: 25
फाइनेंस: 26
फील_गुड_स्टोरीज़: 27
बजट_2024: 28
लोकसभा_चुनाव-_2024: 29
लोकसभा_चुनाव_2024: 30
विधानसभा_चुनाव_2023: 31
विश्व_कप_2023: 32
हेल्थ_एंड_फिटनेस: 33


## Shrink Dataset and Print Dataset Size Before and After
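'News Categories' is converted from a list to a tuple so that each category combination is hashable and can be counted; only the 20 most frequent combinations are kept.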

In [None]:
# Step 5: Shrink Dataset and Print Dataset Size Before and After
# Convert 'News Categories' from list to tuple to make them hashable
df['News Categories Tuple'] = df['News Categories'].apply(tuple)

# Find the top 20 most frequent category tuples
top_20_categories = df['News Categories Tuple'].value_counts().head(20).index

# A plain set gives a constant-time membership test for every row.
top_20_lookup = set(top_20_categories)

# Keep only rows whose category tuple is among the top 20. The explicit .copy() makes the
# result an independent frame, so later column assignments do not act on a slice of the
# original frame (which triggers pandas' SettingWithCopyWarning).
df_top_20 = df[df['News Categories Tuple'].apply(lambda categories: categories in top_20_lookup)].copy()

# Dataset size before and after shrinking
print(f"Dataset size before shrinking: {len(df)}")
print(f"Dataset size after shrinking: {len(df_top_20)}")

# Dataset size before shrinking
print(f"Dataset size before shrinking: {len(df)} and shape: {df.shape}")

# Dataset size after shrinking
print(f"Dataset size after shrinking: {len(df_top_20)} and shape: {df_top_20.shape}")

df = df_top_20

Dataset size before shrinking: 185512
Dataset size after shrinking: 157861
Dataset size before shrinking: 185512 and shape: (185512, 42)
Dataset size after shrinking: 157861 and shape: (157861, 42)


In [None]:
# Spot-check the text columns next to three of the one-hot label columns
preview_columns = ['Headline', 'Content', 'politics', 'business', 'national']
df[preview_columns].head()


Unnamed: 0,Headline,Content,politics,business,national
0,कांग्रेस नेता बलजिंदर सिंह की पंजाब में घर के ...,कांग्रेस नेता बलजिंदर सिंह की सोमवार को पंजाब ...,0,0,1
1,केंद्रीय मंत्री बोले- महिला आरक्षण लाने का साह...,केंद्रीय मंत्री प्रह्लाद पटेल ने लोकसभा और विध...,1,0,1
2,ओपीएस लागू करने से अस्थिर हो सकती है राज्यों क...,आरबीआई के 5 अधिकारियों ने एक लेख में लिखा है क...,0,1,1
3,तमिलनाडु में शावरमा खाने से 14 वर्षीय छात्रा क...,नामक्कल (तमिलनाडु) में शावरमा खाने से सोमवार क...,0,0,1
4,मणिपुर में मुख्यमंत्री के आश्वासन के बाद मारे ...,मणिपुर के मुख्यमंत्री एन बीरेन सिंह के आश्वासन...,0,0,1


In [None]:
# List every column currently in the frame
print(df.columns)

# Then show the text columns together with three of the one-hot label columns
label_preview_columns = ['Headline', 'Content', 'politics', 'business', 'national']
print(df[label_preview_columns].head())

Index(['Headline', 'Content', 'News Categories', 'Headline_Content',
       'Sentences', 'Words', 'Filtered_Words', 'automobile', 'business',
       'entertainment', 'facts', 'fashion', 'hatke', 'miscellaneous',
       'national', 'politics', 'sports', 'startup', 'technology', 'travel',
       'world', 'आईपीएल-_2024', 'आईपीएल_2023', 'आईपीएल_2024', 'एक्सप्लेनर',
       'एजुकेशन', 'एजुकेशन_और_जॉब', 'एशियन_गेम्स_2022', 'एशिया_कप_2023',
       'केंद्रीय_बजट_2023-24', 'कोरोना_वायरस', 'टी20_विश्व_कप_2024',
       'पेरिस_ओलंपिक्स', 'फाइनेंस', 'फील_गुड_स्टोरीज़', 'बजट_2024',
       'लोकसभा_चुनाव-_2024', 'लोकसभा_चुनाव_2024', 'विधानसभा_चुनाव_2023',
       'विश्व_कप_2023', 'हेल्थ_एंड_फिटनेस', 'News Categories Tuple'],
      dtype='object')
                                      Filtered_Words News Categories Tuple
0  [कांग्रेस, नेता, बलजिंदर, सिंह, पंजाब, गोली, म...           (national,)
1  [केंद्रीय, मंत्री, बोले, -, महिला, आरक्षण, लान...  (politics, national)
2  [ओपीएस, लागू, अस्थिर, राज्यों, वि

In [None]:
# Intermediate columns that are no longer needed once 'Filtered_Words' and the one-hot
# label columns exist
columns_to_drop = ['Headline', 'Content', 'Headline_Content', 'Sentences', 'Words', 'News Categories Tuple']

# Rebind df to the reduced frame instead of mutating it in place, and ignore columns that
# are already gone, so running this cell a second time does not raise a KeyError.
df = df.drop(columns=columns_to_drop, errors='ignore')

# Display the first few rows of the updated DataFrame
print(df.head())

        News Categories                                     Filtered_Words  \
0            [national]  [कांग्रेस, नेता, बलजिंदर, सिंह, पंजाब, गोली, म...   
1  [politics, national]  [केंद्रीय, मंत्री, बोले, -, महिला, आरक्षण, लान...   
2  [business, national]  [ओपीएस, लागू, अस्थिर, राज्यों, वित्तीय, स्थिति...   
3            [national]  [तमिलनाडु, शावरमा, खाने, 14, वर्षीय, छात्रा, म...   
4            [national]  [मणिपुर, मुख्यमंत्री, आश्वासन, मारे, सैनिक, शव...   

   automobile  business  entertainment  facts  fashion  hatke  miscellaneous  \
0           0         0              0      0        0      0              0   
1           0         0              0      0        0      0              0   
2           0         1              0      0        0      0              0   
3           0         0              0      0        0      0              0   
4           0         0              0      0        0      0              0   

   national  ...  टी20_विश्व_कप_2024  पेरिस_ओलंपिक

In [None]:
# Confirm which columns remain after the intermediate columns were dropped
remaining_columns = df.columns
print(remaining_columns)

Index(['News Categories', 'Filtered_Words', 'automobile', 'business',
       'entertainment', 'facts', 'fashion', 'hatke', 'miscellaneous',
       'national', 'politics', 'sports', 'startup', 'technology', 'travel',
       'world', 'आईपीएल-_2024', 'आईपीएल_2023', 'आईपीएल_2024', 'एक्सप्लेनर',
       'एजुकेशन', 'एजुकेशन_और_जॉब', 'एशियन_गेम्स_2022', 'एशिया_कप_2023',
       'केंद्रीय_बजट_2023-24', 'कोरोना_वायरस', 'टी20_विश्व_कप_2024',
       'पेरिस_ओलंपिक्स', 'फाइनेंस', 'फील_गुड_स्टोरीज़', 'बजट_2024',
       'लोकसभा_चुनाव-_2024', 'लोकसभा_चुनाव_2024', 'विधानसभा_चुनाव_2023',
       'विश्व_कप_2023', 'हेल्थ_एंड_फिटनेस'],
      dtype='object')


# TF-IDF Vectorization, Hyperparameter Tuning, Model Training and Evaluation

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import joblib

# Model input text. 'augmented_text' is only created by the entity-extraction cell in the
# "Failed Models" section further down; when that cell has not been run (for example after
# Restart & Run All) the column does not exist, so fall back to the stopword-filtered
# tokens joined with single spaces.
if 'augmented_text' in df.columns:
    X = df['augmented_text']
else:
    X = df['Filtered_Words'].apply(' '.join)
y = df['News Categories']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text using TF-IDF (unigrams and bigrams, 5000 most frequent features).
# NOTE(review): the default token_pattern is built on regex \w, which may split Devanagari
# words at combining vowel signs - since the text is already tokenized, consider
# token_pattern=r'\S+' and confirm the effect on the vocabulary.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Binarize the labels; the binarizer is fitted on the training labels only
mlb = MultiLabelBinarizer()
y_train_bin = mlb.fit_transform(y_train)
y_test_bin = mlb.transform(y_test)

# Persist the fitted vectorizer and binarizer for later use
joblib.dump(tfidf, 'tfidf_vectorizer.joblib')
joblib.dump(mlb, 'multi_label_binarizer.joblib')

print("Vectorizer and binarizer saved successfully!")


Vectorizer and binarizer saved successfully!


## TF-IDF + Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.multiclass import OneVsRestClassifier

# One-vs-rest logistic regression: the wrapper fits one binary classifier per label
ovr_logreg = OneVsRestClassifier(LogisticRegression(max_iter=500, random_state=42))

# Candidate settings; the 'estimator__' prefix routes each one to the wrapped LogisticRegression
param_grid = dict(
    estimator__C=[0.01, 0.1, 1, 10, 100],        # regularization parameter
    estimator__solver=['liblinear', 'saga'],     # optimization solvers to compare
)

# 3-fold grid search over every C / solver combination
grid_search = GridSearchCV(estimator=ovr_logreg, param_grid=param_grid, cv=3, scoring='accuracy', verbose=3)
grid_search.fit(X_train_tfidf, y_train_bin)

# Report the winning combination and its cross-validated score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validated Score: {grid_search.best_score_:.4f}")

# Score the selected model on the held-out test split
y_pred = grid_search.predict(X_test_tfidf)
logreg_report = classification_report(y_test_bin, y_pred, target_names=mlb.classes_)
print("Classification Report:\n", logreg_report)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV 1/3] END estimator__C=0.01, estimator__solver=liblinear;, score=0.101 total time=   7.3s
[CV 2/3] END estimator__C=0.01, estimator__solver=liblinear;, score=0.102 total time=   4.4s
[CV 3/3] END estimator__C=0.01, estimator__solver=liblinear;, score=0.101 total time=   3.7s
[CV 1/3] END estimator__C=0.01, estimator__solver=saga;, score=0.099 total time=  15.5s
[CV 2/3] END estimator__C=0.01, estimator__solver=saga;, score=0.100 total time=  14.7s
[CV 3/3] END estimator__C=0.01, estimator__solver=saga;, score=0.098 total time=  14.5s
[CV 1/3] END estimator__C=0.1, estimator__solver=liblinear;, score=0.509 total time=   5.0s
[CV 2/3] END estimator__C=0.1, estimator__solver=liblinear;, score=0.513 total time=   5.7s
[CV 3/3] END estimator__C=0.1, estimator__solver=liblinear;, score=0.505 total time=   4.9s
[CV 1/3] END estimator__C=0.1, estimator__solver=saga;, score=0.508 total time=  15.7s
[CV 2/3] END estimator__C=0.1, es



[CV 1/3] END estimator__C=100, estimator__solver=saga;, score=0.906 total time= 5.0min




[CV 2/3] END estimator__C=100, estimator__solver=saga;, score=0.910 total time= 5.0min




[CV 3/3] END estimator__C=100, estimator__solver=saga;, score=0.905 total time= 4.9min
Best Parameters: {'estimator__C': 100, 'estimator__solver': 'liblinear'}
Best Cross-Validated Score: 0.9069
Classification Report:
                    precision    recall  f1-score   support

         business       0.98      0.99      0.98      4390
    entertainment       0.99      0.99      0.99      3869
    miscellaneous       0.96      0.97      0.96      3538
         national       0.93      0.92      0.92     10670
         politics       0.94      0.95      0.95      4819
           sports       1.00      0.99      0.99      3605
       technology       0.99      0.99      0.99      4255
            world       0.97      0.96      0.97      4838
      आईपीएल_2024       0.99      1.00      0.99       958
 एशियन_गेम्स_2022       0.99      1.00      1.00       393
लोकसभा_चुनाव_2024       0.94      0.93      0.94       822
    विश्व_कप_2023       0.98      0.99      0.98       686

        micr

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Persist the best logistic-regression estimator found by the grid search
# (only the model is written here; the vectorizer and binarizer are saved in the TF-IDF cell)
best_logreg_model = grid_search.best_estimator_
joblib.dump(best_logreg_model, 'logistic_regression_model.joblib')

print("Logistic Regression Model saved successfully!")


Logistic Regression Model saved successfully!


## TF-IDF + Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier

# One-vs-rest multinomial Naive Bayes: the wrapper fits one binary classifier per label
nb_model = MultinomialNB()
ovr_nb = OneVsRestClassifier(nb_model)

# Candidate settings; the 'estimator__' prefix routes each one to the wrapped MultinomialNB
param_grid = dict(
    estimator__alpha=[0.01, 0.1, 1, 10],       # smoothing parameter
    estimator__fit_prior=[True, False],        # whether to learn class prior probabilities
)

# 3-fold grid search over every alpha / fit_prior combination
grid_search = GridSearchCV(estimator=ovr_nb, param_grid=param_grid, cv=3, scoring='accuracy', verbose=3)
grid_search.fit(X_train_tfidf, y_train_bin)

# Report the winning combination and its cross-validated score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validated Score: {grid_search.best_score_:.4f}")

# Score the selected model on the held-out test split
y_pred = grid_search.predict(X_test_tfidf)
nb_report = classification_report(y_test_bin, y_pred, target_names=mlb.classes_)
print("Classification Report:\n", nb_report)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV 1/3] END estimator__alpha=0.01, estimator__fit_prior=True;, score=0.638 total time=   1.3s
[CV 2/3] END estimator__alpha=0.01, estimator__fit_prior=True;, score=0.642 total time=   1.2s
[CV 3/3] END estimator__alpha=0.01, estimator__fit_prior=True;, score=0.636 total time=   1.2s
[CV 1/3] END estimator__alpha=0.01, estimator__fit_prior=False;, score=0.559 total time=   0.8s
[CV 2/3] END estimator__alpha=0.01, estimator__fit_prior=False;, score=0.564 total time=   0.8s
[CV 3/3] END estimator__alpha=0.01, estimator__fit_prior=False;, score=0.558 total time=   0.8s
[CV 1/3] END estimator__alpha=0.1, estimator__fit_prior=True;, score=0.631 total time=   0.8s
[CV 2/3] END estimator__alpha=0.1, estimator__fit_prior=True;, score=0.634 total time=   0.8s
[CV 3/3] END estimator__alpha=0.1, estimator__fit_prior=True;, score=0.629 total time=   0.7s
[CV 1/3] END estimator__alpha=0.1, estimator__fit_prior=False;, score=0.509 total tim

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Persist the best Naive Bayes estimator found by the grid search
best_nb_model = grid_search.best_estimator_
joblib.dump(best_nb_model, 'naive_bayes_model.joblib')

print("Naive Bayes Model saved successfully!")


Naive Bayes Model saved successfully!


## Failed Models

In [None]:
import torch
import multiprocessing as mp
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from datasets import Dataset
import pandas as pd

# Entity extraction runs in the notebook process only. The previous version asked
# `Dataset.map` for 4 worker processes; the recorded output shows every batch failing with
# "Cannot re-initialize CUDA in forked subprocess", and calling
# multiprocessing.set_start_method('spawn') did not prevent that, so both were removed.

# Initialize the tokenizer and model for HiNER
tokenizer = AutoTokenizer.from_pretrained("cfilt/HiNER-original-xlm-roberta-large")
model = AutoModelForTokenClassification.from_pretrained("cfilt/HiNER-original-xlm-roberta-large")

# Use the first GPU when one is available, otherwise the CPU
device = 0 if torch.cuda.is_available() else -1

# Initialize the NER pipeline
ner_pipeline = pipeline("token-classification", model=model, tokenizer=tokenizer, device=device)

# Number of rows whose batch raised during NER; checked after the map so that a run in
# which nothing was processed cannot pass silently.
failed_rows = 0


def extract_entities(batch):
    """Add an 'entities' list to every row of a batch.

    Parameters:
        batch: mapping with a 'words' entry holding one token list per row.

    Returns:
        The same batch with batch['entities'] set to one list per row containing the
        'word' field of every entity reported by the NER pipeline. If the pipeline
        raises, the error is printed, the rows are counted in `failed_rows`, and each
        row of the batch gets an empty entity list.
    """
    global failed_rows
    # Join the tokens of each row back into a single string for the pipeline
    texts = [" ".join(words) for words in batch['words']]
    try:
        ner_results = ner_pipeline(texts)
        batch['entities'] = [[entity['word'] for entity in text_entities] for text_entities in ner_results]
    except Exception as e:
        # Best effort per batch: keep going, but record the failure for the check below
        failed_rows += len(texts)
        print(f"Error processing batch: {e}")
        batch['entities'] = [[] for _ in texts]
    return batch


# Convert the 'Filtered_Words' column into a Hugging Face Dataset with a 'words' column
dataset = Dataset.from_pandas(df[['Filtered_Words']].rename(columns={'Filtered_Words': 'words'}))

# Process the dataset in batches inside this process (no num_proc, see the note above)
batch_size = 256
dataset = dataset.map(extract_entities, batched=True, batch_size=batch_size)

# Fail loudly when no row could be processed instead of continuing with empty entities
if len(dataset) > 0 and failed_rows >= len(dataset):
    raise RuntimeError("Entity extraction failed for every batch; see the errors printed above.")
if failed_rows:
    print(f"Warning: entity extraction failed for {failed_rows} of {len(dataset)} rows")

# Copy the results back onto the DataFrame (same row order as the dataset)
df['entities'] = list(dataset['entities'])

# Create augmented text by appending the entities to the original tokens
df['augmented_text'] = df['Filtered_Words'].apply(" ".join) + " " + df['entities'].apply(" ".join)

# Display results
print(df[['Filtered_Words', 'entities', 'augmented_text']].head())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Map (num_proc=4):   0%|          | 0/157861 [00:00<?, ? examples/s]

Error processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
Error processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
Error processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
Error processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
Error processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
Error processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
Error processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Error processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
Error processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
Error processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
Error processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
Error processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
Error processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
Error processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Error processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start methodError processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method

Error processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start methodError processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method

Error processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start methodError processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method

Error processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Error processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
Error processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start methodError processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method

Error processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
Error processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start methodError processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method

Error processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Error processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
Error processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start methodError processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method

Error processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
Error processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start methodError processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
Error processing batch: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start 

In [None]:
# Step 1: Stratified Sampling for Hyperparameter Tuning
# Draw `sample_size` rows, stratified on the category combination; the remainder is discarded
sample_size = 1000
df_sample, _ = train_test_split(df, train_size=sample_size, stratify=df['News Categories'], random_state=42)

# TF-IDF features for the sample.
# Note: this rebinds the names `tfidf` and `mlb` that the earlier TF-IDF cell also assigns.
sample_text = df_sample['augmented_text']
tfidf = TfidfVectorizer(max_features=3000)
X_sample = tfidf.fit_transform(sample_text)

# Multilabel targets of the sample as a binary indicator matrix
sample_labels = df_sample['News Categories']
mlb = MultiLabelBinarizer()
y_sample = mlb.fit_transform(sample_labels)

In [None]:
# Step 2: Hyperparameter Tuning using GridSearchCV
# RandomForestClassifier and SVC are not imported by any other cell of this notebook, so
# they are imported here; the other two imports make the cell runnable on its own.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

# Random Forest and SVM models, each wrapped one-vs-rest for the multilabel targets
rf_model = OneVsRestClassifier(RandomForestClassifier(random_state=42))
svm_model = OneVsRestClassifier(SVC(probability=True, random_state=42))

# Parameter grids for tuning; 'estimator__' routes each setting to the wrapped estimator
param_grid_rf = {
    'estimator__n_estimators': [100, 200],
    'estimator__max_depth': [10, 20, None],
}

param_grid_svm = {
    'estimator__C': [0.1, 1, 10],
    'estimator__kernel': ['linear', 'rbf'],
}

# Grid Search for Random Forest on the sample
rf_grid_search = GridSearchCV(rf_model, param_grid_rf, cv=3, scoring='accuracy', verbose=3)
rf_grid_search.fit(X_sample, y_sample)

# Grid Search for SVM on the sample
svm_grid_search = GridSearchCV(svm_model, param_grid_svm, cv=3, scoring='accuracy', verbose=3)
svm_grid_search.fit(X_sample, y_sample)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3] END estimator__max_depth=10, estimator__n_estimators=100;, score=0.027 total time=   2.7s
[CV 2/3] END estimator__max_depth=10, estimator__n_estimators=100;, score=0.036 total time=   2.4s
[CV 3/3] END estimator__max_depth=10, estimator__n_estimators=100;, score=0.030 total time=   2.4s
[CV 1/3] END estimator__max_depth=10, estimator__n_estimators=200;, score=0.027 total time=   5.6s
[CV 2/3] END estimator__max_depth=10, estimator__n_estimators=200;, score=0.021 total time=   5.0s
[CV 3/3] END estimator__max_depth=10, estimator__n_estimators=200;, score=0.021 total time=   4.8s
[CV 1/3] END estimator__max_depth=20, estimator__n_estimators=100;, score=0.150 total time=   3.9s
[CV 2/3] END estimator__max_depth=20, estimator__n_estimators=100;, score=0.126 total time=   3.0s
[CV 3/3] END estimator__max_depth=20, estimator__n_estimators=100;, score=0.078 total time=   2.9s
[CV 1/3] END estimator__max_depth=20, estimator__

In [None]:
# Report the winning parameters and cross-validated score of each search, RF first
for model_label, search in (("RF", rf_grid_search), ("SVM", svm_grid_search)):
    print(f"Best Parameters {model_label}:", search.best_params_)
    print(f"Best Score {model_label}:", search.best_score_)
    if model_label == "RF":
        # Blank line between the two reports, as before
        print()

Best Parameters RF: {'estimator__max_depth': None, 'estimator__n_estimators': 200}
Best Score RF: 0.19397840954727186
Best Parameters SVM: {'estimator__C': 10, 'estimator__kernel': 'linear'}
Best Score SVM: 0.400001798205391


In [None]:
# Step 3: Train the models with the full dataset using the best parameters
# RandomForestClassifier and SVC are not imported by any other cell of this notebook.
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

# Full dataset transformation. The TF-IDF matrix is kept sparse: both estimators accept
# scipy sparse input, while a dense copy of (rows x vocabulary) float64 values would need
# several gigabytes of memory for no benefit.
X_full = tfidf.transform(df['augmented_text'])
y_full = mlb.transform(df['News Categories'])

# Random Forest with the parameters selected by the grid search
rf_best_params = rf_grid_search.best_params_
best_rf_model = OneVsRestClassifier(RandomForestClassifier(
    n_estimators=rf_best_params['estimator__n_estimators'],
    max_depth=rf_best_params['estimator__max_depth'],
    random_state=42
))
best_rf_model.fit(X_full, y_full)

# SVM with the parameters selected by the grid search
svm_best_params = svm_grid_search.best_params_
best_svm_model = OneVsRestClassifier(SVC(
    C=svm_best_params['estimator__C'],
    kernel=svm_best_params['estimator__kernel'],
    probability=True,
    random_state=42
))
best_svm_model.fit(X_full, y_full)

NameError: name 'tfidf' is not defined

In [None]:
# Step 4: Evaluate the Random Forest and SVM models on the hold-out split
# NOTE(review): both models were fitted on the full dataset, which contains these test
# rows, so the scores below are optimistic - refit on the training split for an unbiased
# estimate.

# Build the evaluation features and labels with the same `tfidf` / `mlb` objects that
# produced X_full / y_full, so the feature count and label columns match what the models
# were trained on. The matrix is densified because SVC rejects sparse input when it was
# fitted on dense data, while dense input is accepted in either case.
X_eval = tfidf.transform(X_test).toarray()
y_eval = mlb.transform(y_test)

y_pred_rf = best_rf_model.predict(X_eval)
y_pred_svm = best_svm_model.predict(X_eval)

# Accuracy is computed against the binarized labels, matching the prediction format
accuracy_rf = accuracy_score(y_eval, y_pred_rf)
accuracy_svm = accuracy_score(y_eval, y_pred_svm)

# Display results for Random Forest
print("Random Forest Classification Report:")
print(classification_report(y_eval, y_pred_rf, target_names=mlb.classes_))

# Display results for SVM
print("SVM Classification Report:")
print(classification_report(y_eval, y_pred_svm, target_names=mlb.classes_))

print(accuracy_rf)
print(accuracy_svm)