In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer, roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB

In [None]:
import pandas as pd
import numpy as np

# Draw a reproducible random sample of the CSV without loading all of it:
# choose the data rows to *skip* and let pandas read only what is left.
file_path = "/content/train.csv"
total_rows = 120000       # number of data rows the file is assumed to contain
number_of_rows = 10000    # number of rows to keep in the sample
random_state = 42

np.random.seed(random_state)

# Line 0 of the file is the header, so data rows are numbered 1..total_rows.
candidate_rows = np.arange(1, total_rows + 1)
rows_to_skip = total_rows - number_of_rows
skip = sorted(np.random.choice(candidate_rows, rows_to_skip, replace=False))

dataset = pd.read_csv(file_path, skiprows=skip)
print(dataset.head())


In [None]:
dataset.shape

In [None]:
dataset.info()

In [None]:
print(dataset.columns)

In [None]:
dataset['Title'].value_counts()

In [None]:
# Distinct raw class labels present in the sample, in order of first appearance.
target_category = pd.unique(dataset['Class Index'])
print(target_category)

In [None]:
# Encode the raw labels as consecutive integer codes starting at 0.
# factorize() numbers labels in the order they first appear in the column, so
# CategoryId is NOT guaranteed to equal `Class Index - 1`.
category_codes, _ = pd.factorize(dataset['Class Index'])
dataset['CategoryId'] = category_codes
dataset.head()

In [None]:
# Lookup table pairing every raw label with its encoded id, ordered by id.
label_pairs = dataset[['Class Index', 'CategoryId']].drop_duplicates()
category = label_pairs.sort_values('CategoryId')
category

In [None]:
# Bar chart of the number of sampled articles per class.
# value_counts() orders its result by frequency, so sort by class index to keep
# the bars aligned with the fixed tick labels below (the labels assume class
# indices 1-4 are World, Sports, Business, Science & Tech, in that order).
category_counts = dataset['Class Index'].value_counts().sort_index()
category_counts.plot(kind="bar", color=["pink", "orange", "red", "yellow"])
plt.xlabel("Category of data")
plt.ylabel("Number of articles")
plt.title("Visualize numbers of Category of data")
plt.xticks(ticks=range(len(category_counts)), labels=['World News', 'Sports News', 'Business News', 'Science & Tech'], rotation=0)
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Pie chart of the class distribution.
# Count by the raw `Class Index` (sorted ascending) rather than `CategoryId`:
# CategoryId codes follow first-appearance order, which need not match the
# class-index order that the hard-coded labels below assume.
counts = dataset['Class Index'].value_counts().sort_index()

categories = ['World News', 'Sports News', 'Business News', 'Science & Tech']

colors = ['skyblue', 'green', 'red', 'purple']

# Pull every wedge slightly away from the centre.
explode = [0.1] * len(categories)


plt.figure(figsize=(7, 7))
plt.pie(counts, labels=categories, autopct='%1.1f%%', colors=colors, startangle=140, explode=explode)
plt.title('Distribution of News Categories')
plt.show()


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk.corpus import stopwords


# Stop words used only for the word clouds: NLTK's English list plus "said",
# with the negations "not" and "no" kept as drawable words.
# The set gets its own name so the imported `stopwords` corpus module is not
# overwritten for later cells.
STOP_WORDS = stopwords.words('english')
wordcloud_stopwords = set(STOP_WORDS)
wordcloud_stopwords.add("said")
wordcloud_stopwords.remove("not")
wordcloud_stopwords.remove("no")
wordcloud_stopwords.add(" ")

def generate_wordcloud(text, title):
    """Draw a word cloud for a collection of texts.

    Args:
        text: Iterable of strings; they are joined with single spaces before
            being passed to ``WordCloud.generate``.
        title: Title shown above the figure.
    """
    plt.figure(figsize=(10, 15))
    wc = WordCloud(max_words=500, background_color='white', stopwords=wordcloud_stopwords)
    wc.generate(" ".join(text))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.title(title)
    plt.show()

# One cloud per class. The loop variables are deliberately not called
# `category`, so the `category` lookup DataFrame built earlier is left intact.
for class_index, class_name in enumerate(['World', 'Sports', 'Business', 'Sci-Tech'], start=1):
    category_text = dataset.loc[dataset['Class Index'] == class_index, 'Description']
    generate_wordcloud(category_text, class_name)


In [None]:
def remove_tags(text):
    """Return ``text`` with every ``<...>`` span removed.

    The match is non-greedy, so each tag is removed separately, and ``.`` does
    not match a newline, so a ``<`` and ``>`` on different lines are left alone.
    """
    tag_pattern = r'<.*?>'
    return re.sub(tag_pattern, '', text)

dataset['Description'] = dataset['Description'].apply(remove_tags)

In [None]:
def special_char(text):
  """Replace every non-alphanumeric character in ``text`` with a space.

  Alphanumeric characters (as judged by ``str.isalnum``) are kept unchanged, so
  the result always has the same length as the input.
  """
  # Build the result in one pass with join() instead of growing a string
  # character by character.
  return ''.join(ch if ch.isalnum() else ' ' for ch in text)
dataset['Description'] = dataset['Description'].apply(special_char)

In [None]:
def convert_lower(text):
   """Return a lowercase copy of ``text``."""
   lowered = text.lower()
   return lowered
dataset['Description'] = dataset['Description'].apply(convert_lower)
dataset['Description'][1]

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used below (tokenizer data and the stop-word list).
for nltk_resource in ('punkt_tab', 'punkt', 'stopwords'):
    nltk.download(nltk_resource)

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Tokenize ``text`` and return the tokens that are not English stop words.

    Args:
        text: A single string.

    Returns:
        list[str]: The tokens from ``word_tokenize(text)`` whose lowercase form
        is not in the module-level ``stop_words`` set.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() not in stop_words:
            kept_tokens.append(token)
    return kept_tokens

# From here on the column holds lists of tokens rather than strings, so this
# cell expects the column to still contain plain strings when it runs.
dataset['Description'] = dataset['Description'].apply(remove_stopwords)

dataset['Description'][1]


In [None]:
def lemmatize_word(text):
  """Lemmatize every token in ``text`` and join the results with single spaces.

  ``text`` is an iterable of tokens; the return value is one string.
  """
  lemmatizer = WordNetLemmatizer()
  lemmas = map(lemmatizer.lemmatize, text)
  return " ".join(lemmas)
# Converts the column from token lists back to space-separated strings.
dataset['Description'] = dataset['Description'].apply(lemmatize_word)
dataset['Description'][1]

In [None]:
dataset.head()

In [None]:
x = dataset['Description']
y = dataset['CategoryId']

# ---- Bag of Words ----

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: token counts over the 5000 most frequent terms.
# The vectorizer is fitted on the whole `Description` column here, i.e. before
# any train/test split is made.
y = np.array(dataset.CategoryId.values)
cv = CountVectorizer(max_features = 5000)
x = cv.fit_transform(dataset.Description).toarray()
print("X.shape = ",x.shape)
print("y.shape = ",y.shape)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the bag-of-words rows for testing (fixed seed for repeatability).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0, shuffle=True
)
for split_part in (x_train, x_test):
    print(len(split_part))

In [None]:
perform_list = [ ]

In [None]:
def run_model(model_name, est_c, est_pnlty):
    """Train one classifier on the bag-of-words split and record its test scores.

    The estimator selected by ``model_name`` is wrapped in
    ``OneVsRestClassifier``, fitted on the global ``x_train`` / ``y_train`` and
    evaluated on the global ``x_test`` / ``y_test``. The scores are printed and
    a summary row is appended to the global ``perform_list``.

    Args:
        model_name: One of 'Logistic Regression', 'Random Forest',
            'Multinomial Naive Bayes', 'Support Vector Classifier',
            'Decision Tree Classifier', 'K Nearest Neighbour',
            'Gaussian Naive Bayes'.
        est_c: Not used by this function; kept so existing calls keep working.
        est_pnlty: Not used by this function; kept so existing calls keep working.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    # Factories rather than instances, so only the requested estimator is built.
    estimator_factories = {
        'Logistic Regression': LogisticRegression,
        'Random Forest': lambda: RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0),
        'Multinomial Naive Bayes': lambda: MultinomialNB(alpha=1.0, fit_prior=True),
        'Support Vector Classifier': SVC,
        'Decision Tree Classifier': DecisionTreeClassifier,
        'K Nearest Neighbour': lambda: KNeighborsClassifier(n_neighbors=10, metric='minkowski', p=4),
        'Gaussian Naive Bayes': GaussianNB,
    }

    if model_name not in estimator_factories:
        # Fail here with a clear message instead of handing a placeholder
        # string to OneVsRestClassifier and failing inside fit().
        raise ValueError(
            f"Unknown model name {model_name!r}; expected one of {sorted(estimator_factories)}"
        )

    mdl = estimator_factories[model_name]()

    oneVsRest = OneVsRestClassifier(mdl)
    oneVsRest.fit(x_train, y_train)
    y_pred = oneVsRest.predict(x_test)

    # Accuracy is reported as a percentage rounded to two decimals.
    accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)

    # Precision / recall / F1 are micro-averaged over all test samples.
    precision, recall, f1score, support = score(y_test, y_pred, average='micro')

    print(f'Test Accuracy Score of Basic {model_name}: % {accuracy}')
    print(f'Precision : {precision}')
    print(f'Recall : {recall}')
    print(f'F1-score : {f1score}')

    perform_list.append({
        'Model': model_name,
        'Test Accuracy': accuracy,
        'Precision': round(precision, 2),
        'Recall': round(recall, 2),
        'F1': round(f1score, 2),
    })


In [None]:
# run_model('Logistic Regression', est_c=None, est_pnlty=None)

In [None]:
run_model('Random Forest', est_c=None, est_pnlty=None)

In [None]:
run_model('Multinomial Naive Bayes', est_c=None, est_pnlty=None)

In [None]:
run_model('Decision Tree Classifier', est_c=None, est_pnlty=None)

In [None]:
run_model('Gaussian Naive Bayes', est_c=None, est_pnlty=None)

In [None]:
from IPython.display import display, Markdown
# Turn the collected bag-of-words results into a table with a fixed column order.
metric_columns = ['Model', 'Test Accuracy', 'Precision', 'Recall', 'F1']
model_performance = pd.DataFrame(data=perform_list)[metric_columns]

display(Markdown("# Bag of Words Model Performance metrics"))
display(model_performance)

In [None]:
# Report the highest test accuracy together with the model that achieved it
# (the first such model if several tie).
model = model_performance["Model"]
max_value = model_performance["Test Accuracy"].max()
best_model_name = model[model_performance["Test Accuracy"].idxmax()]
print("The best accuracy of model is", max_value, "from", best_model_name)

In [None]:
# Standalone random forest (no one-vs-rest wrapper) fitted on the lower-case
# x_train / y_train split, then used to predict the matching test rows.
classifier = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

In [None]:
# Classify one unseen snippet with the fitted vectorizer and classifier.
sample_text = ['Hour ago, I contemplated retirement for a lot of reasons. I felt like people were not sensitive enough to my injuries. I felt like a lot of people were backed, why not me? I have done no less. I have won a lot of games for the team, and I am not feeling backed, said Ashwin']
y_pred1 = cv.transform(sample_text)
yy = classifier.predict(y_pred1)

# The classifier was trained on `CategoryId` (0-based codes from factorize()),
# not on the raw class index, so translate the predicted id back to its
# `Class Index` before looking up the display name.
class_index_to_name = {
    1: "World News",
    2: "Sports News",
    3: "Business News",
    4: "Science & Tech News",
}
category_id_to_class_index = dict(zip(dataset['CategoryId'], dataset['Class Index']))
predicted_class_index = category_id_to_class_index.get(int(yy[0]))

# Falls back to an empty string when the id cannot be resolved to a name.
result = class_index_to_name.get(predicted_class_index, "")

print(result)


# ---- Word2Vec ----

In [None]:
!pip install --upgrade numpy
!pip install --upgrade gensim

In [None]:
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
from sklearn.model_selection import train_test_split


nltk.download('punkt')

# Tokenize the lowercased descriptions; one token list per document.
lowercased_texts = (text.lower() for text in dataset['Description'])
tokenized_texts = list(map(word_tokenize, lowercased_texts))

# Train 100-dimensional embeddings on this corpus. min_count=1 keeps every token.
# NOTE(review): no seed is passed and workers=4, so the learned vectors may
# differ from run to run - confirm whether reproducibility matters here.
model_w2v = Word2Vec(tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)
word_vectors = model_w2v.wv


def document_vector(word_vec_model, doc):
    """Average the word vectors of the tokens in ``doc``.

    Args:
        word_vec_model: Keyed-vector lookup that supports ``model[word]`` and
            exposes ``key_to_index`` and ``vector_size`` (e.g. the ``.wv`` of a
            trained ``Word2Vec`` model).
        doc: Iterable of tokens. Tokens missing from ``key_to_index`` are ignored.

    Returns:
        A 1-D array holding the element-wise mean of the known tokens' vectors.
        When ``doc`` has no known token, an all-zero vector of length
        ``vector_size`` is returned instead of the NaN that averaging an empty
        list would produce.
    """
    known_vectors = [word_vec_model[word] for word in doc if word in word_vec_model.key_to_index]
    if not known_vectors:
        return np.zeros(word_vec_model.vector_size)
    return np.mean(known_vectors, axis=0)


# Flag the documents that contain at least one in-vocabulary token; only those
# get an averaged vector, and the same flags select the matching labels.
has_known_word = [
    any(word in word_vectors.key_to_index for word in doc)
    for doc in tokenized_texts
]

document_vectors = np.array([
    document_vector(word_vectors, doc)
    for doc, keep in zip(tokenized_texts, has_known_word)
    if keep
])

valid_labels = dataset['CategoryId'][has_known_word]

# Rebinds the upper-case X_train / X_test and the shared y_train / y_test names.
X_train, X_test, y_train, y_test = train_test_split(document_vectors, valid_labels, test_size=0.3, random_state=0)



In [None]:
print(len(X_train))
print(len(X_test))

In [None]:
perform_list2 = [ ]


In [None]:
def run_model(model_name, est_c, est_pnlty):
    """Train one classifier on the Word2Vec split and record its test scores.

    The estimator selected by ``model_name`` is wrapped in
    ``OneVsRestClassifier``, fitted on the global ``X_train`` / ``y_train`` and
    evaluated on the global ``X_test`` / ``y_test``. The scores are printed and
    a summary row is appended to the global ``perform_list2``.

    Args:
        model_name: One of 'Logistic Regression', 'Random Forest',
            'Multinomial Naive Bayes', 'Support Vector Classifier',
            'Decision Tree Classifier', 'K Nearest Neighbour',
            'Gaussian Naive Bayes'.
        est_c: Not used by this function; kept so existing calls keep working.
        est_pnlty: Not used by this function; kept so existing calls keep working.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    # Factories rather than instances, so only the requested estimator is built.
    estimator_factories = {
        'Logistic Regression': LogisticRegression,
        'Random Forest': lambda: RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0),
        'Multinomial Naive Bayes': lambda: MultinomialNB(alpha=1.0, fit_prior=True),
        'Support Vector Classifier': SVC,
        'Decision Tree Classifier': DecisionTreeClassifier,
        'K Nearest Neighbour': lambda: KNeighborsClassifier(n_neighbors=10, metric='minkowski', p=4),
        'Gaussian Naive Bayes': GaussianNB,
    }

    if model_name not in estimator_factories:
        # Fail here with a clear message instead of handing a placeholder
        # string to OneVsRestClassifier and failing inside fit().
        raise ValueError(
            f"Unknown model name {model_name!r}; expected one of {sorted(estimator_factories)}"
        )

    mdl = estimator_factories[model_name]()

    oneVsRest = OneVsRestClassifier(mdl)
    oneVsRest.fit(X_train, y_train)
    y_pred = oneVsRest.predict(X_test)

    # Accuracy is reported as a percentage rounded to two decimals.
    accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)

    # Precision / recall / F1 are micro-averaged over all test samples.
    precision, recall, f1score, support = score(y_test, y_pred, average='micro')

    print(f'Test Accuracy Score of Basic {model_name}: % {accuracy}')
    print(f'Precision : {precision}')
    print(f'Recall : {recall}')
    print(f'F1-score : {f1score}')

    perform_list2.append({
        'Model': model_name,
        'Test Accuracy': accuracy,
        'Precision': round(precision, 2),
        'Recall': round(recall, 2),
        'F1': round(f1score, 2),
    })


In [None]:
run_model('Logistic Regression', est_c=None, est_pnlty=None)

In [None]:
run_model('Random Forest', est_c=None, est_pnlty=None)

In [None]:
# run_model('Multinomial Naive Bayes', est_c=None, est_pnlty=None)

In [None]:
# run_model('Support Vector Classifer', est_c=None, est_pnlty=None)

In [None]:
run_model('Decision Tree Classifier', est_c=None, est_pnlty=None)

In [None]:
run_model('K Nearest Neighbour', est_c=None, est_pnlty=None)

In [None]:
run_model('Gaussian Naive Bayes', est_c=None, est_pnlty=None)

In [None]:
# Turn the collected Word2Vec results into a table with a fixed column order.
metric_columns = ['Model', 'Test Accuracy', 'Precision', 'Recall', 'F1']
model_performance = pd.DataFrame(data=perform_list2)[metric_columns]

display(Markdown("# Word2Vec Model Performance metrics"))
display(model_performance)

In [None]:
# Report the highest test accuracy together with the model that achieved it
# (the first such model if several tie).
model = model_performance["Model"]
max_value = model_performance["Test Accuracy"].max()
best_model_name = model[model_performance["Test Accuracy"].idxmax()]
print("The best accuracy of model is", max_value, "from", best_model_name)

In [None]:
# Standalone random forest (no one-vs-rest wrapper) fitted on the upper-case
# X_train / y_train split, then used to predict the matching test rows.
classifier = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Small end-to-end example: split the texts first, fit a 100-term count
# vectorizer on the training texts only, train a random forest, then classify
# one new snippet. Note that this rebinds `cv`, `classifier` and the
# X_train / X_test / y_train / y_test names used by other cells.
X, y = dataset['Description'], dataset['CategoryId']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

cv = CountVectorizer(max_features=100)
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

classifier = RandomForestClassifier(random_state=0)
classifier.fit(X_train_cv, y_train)

new_text = ['Hour ago, I contemplated retirement for a lot of reasons. I felt like people were not sensitive enough to my injuries. I felt like a lot of people were backed, why not me? I have done no less. I have won a lot of games for the team, and I am not feeling backed, said Ashwin']
new_text_cv = cv.transform(new_text)

y_pred = classifier.predict(new_text_cv)



In [None]:

# Display names keyed by the raw `Class Index` value.
class_index_to_name = {
    1: "World News",
    2: "Sports News",
    3: "Business News",
    4: "Sci/Tech News"
}

# The classifier predicts `CategoryId` codes (0-based, assigned by factorize()),
# so key the lookup by CategoryId, resolving each code through the
# (Class Index, CategoryId) pairs actually present in the data.
label_pairs = dataset[['Class Index', 'CategoryId']].drop_duplicates()
category_id_to_name = {
    int(category_id): class_index_to_name.get(int(class_index), 'Unknown Category')
    for class_index, category_id in zip(label_pairs['Class Index'], label_pairs['CategoryId'])
}


y_pred = classifier.predict(new_text_cv)

predicted_category_name = category_id_to_name.get(int(y_pred[0]), 'Unknown Category')

print(f"The news is classified as: {predicted_category_name}")


# ---- TF-IDF ----

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np


# TF-IDF features over the 5000 highest-frequency terms.
# The vectorizer is fitted on the whole `Description` column here, i.e. before
# any train/test split is made.
y = np.array(dataset['CategoryId'].values)

tfidf = TfidfVectorizer(max_features=5000)

x = tfidf.fit_transform(dataset['Description']).toarray()

print("X.shape = ", x.shape)
print("y.shape = ", y.shape)


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the TF-IDF rows for testing (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0, shuffle = True)
# Report the sizes of the split made on the line above (upper-case names), not
# of the lower-case x_train / x_test left over from an earlier cell.
print(len(X_train))
print(len(X_test))

In [None]:
perform_list3 = [ ]

In [None]:
def run_model(model_name, est_c, est_pnlty):
    """Train one classifier on the TF-IDF split and record its test scores.

    The estimator selected by ``model_name`` is wrapped in
    ``OneVsRestClassifier``, fitted on the global ``X_train`` / ``y_train`` and
    evaluated on the global ``X_test`` / ``y_test``. The scores are printed and
    a summary row is appended to the global ``perform_list3``.

    Args:
        model_name: One of 'Logistic Regression', 'Random Forest',
            'Multinomial Naive Bayes', 'Support Vector Classifier',
            'Decision Tree Classifier', 'K Nearest Neighbour',
            'Gaussian Naive Bayes'.
        est_c: Not used by this function; kept so existing calls keep working.
        est_pnlty: Not used by this function; kept so existing calls keep working.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    # Factories rather than instances, so only the requested estimator is built.
    estimator_factories = {
        'Logistic Regression': LogisticRegression,
        'Random Forest': lambda: RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0),
        'Multinomial Naive Bayes': lambda: MultinomialNB(alpha=1.0, fit_prior=True),
        'Support Vector Classifier': SVC,
        'Decision Tree Classifier': DecisionTreeClassifier,
        'K Nearest Neighbour': lambda: KNeighborsClassifier(n_neighbors=10, metric='minkowski', p=4),
        'Gaussian Naive Bayes': GaussianNB,
    }

    if model_name not in estimator_factories:
        # Fail here with a clear message instead of handing a placeholder
        # string to OneVsRestClassifier and failing inside fit().
        raise ValueError(
            f"Unknown model name {model_name!r}; expected one of {sorted(estimator_factories)}"
        )

    mdl = estimator_factories[model_name]()

    oneVsRest = OneVsRestClassifier(mdl)
    oneVsRest.fit(X_train, y_train)
    y_pred = oneVsRest.predict(X_test)

    # Accuracy is reported as a percentage rounded to two decimals.
    accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)

    # Precision / recall / F1 are micro-averaged over all test samples.
    precision, recall, f1score, support = score(y_test, y_pred, average='micro')

    print(f'Test Accuracy Score of Basic {model_name}: % {accuracy}')
    print(f'Precision : {precision}')
    print(f'Recall : {recall}')
    print(f'F1-score : {f1score}')

    perform_list3.append({
        'Model': model_name,
        'Test Accuracy': accuracy,
        'Precision': round(precision, 2),
        'Recall': round(recall, 2),
        'F1': round(f1score, 2),
    })


In [None]:
run_model('Logistic Regression', est_c=None, est_pnlty=None)

In [None]:
run_model('Random Forest', est_c=None, est_pnlty=None)

In [None]:
run_model('Multinomial Naive Bayes', est_c=None, est_pnlty=None)

In [None]:
# run_model('Support Vector Classifer', est_c=None, est_pnlty=None)

In [None]:
run_model('Decision Tree Classifier', est_c=None, est_pnlty=None)

In [None]:
# run_model('K Nearest Neighbour', est_c=None, est_pnlty=None)

In [None]:
run_model('Gaussian Naive Bayes', est_c=None, est_pnlty=None)

In [None]:
from IPython.display import display, Markdown

# Turn the collected TF-IDF results into a table with a fixed column order.
metric_columns = ['Model', 'Test Accuracy', 'Precision', 'Recall', 'F1']
model_performance = pd.DataFrame(data=perform_list3)[metric_columns]

display(Markdown("# TF-IDF Model Performance metrics"))
display(model_performance)


In [None]:
# Report the highest test accuracy together with the model that achieved it
# (the first such model if several tie).
model = model_performance["Model"]
max_value = model_performance["Test Accuracy"].max()
best_model_name = model[model_performance["Test Accuracy"].idxmax()]
print("The best accuracy of model is", max_value, "from", best_model_name)

In [None]:
# Standalone random forest on the TF-IDF split. The TF-IDF features are held in
# the upper-case X_train / X_test; the lower-case x_train / x_test still refer
# to the earlier bag-of-words split and must not be used here.
classifier = RandomForestClassifier(n_estimators=100 ,criterion='entropy' , random_state=0).fit(X_train, y_train)
y_pred = classifier.predict(X_test)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


# Small end-to-end TF-IDF example: split the texts, fit the vectorizer on the
# training texts only, train a random forest and classify one new snippet.
X = dataset['Description']
y = dataset['CategoryId']

tfidf = TfidfVectorizer(max_features=5000)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

classifier = RandomForestClassifier(random_state=0)
classifier.fit(X_train_tfidf, y_train)

new_text = ["I am going to set up a new Microsoft business office in India. Stock market has good scope there!"]
new_text_tfidf = tfidf.transform(new_text)

y_pred = classifier.predict(new_text_tfidf)

# Display names keyed by the raw `Class Index` value (four classes).
class_index_to_name = {
    1: "World News",
    2: "Sports News",
    3: "Business News",
    4: "Sci/Tech News"
}

# The classifier predicts `CategoryId` codes (0-based, assigned by factorize()),
# so key the lookup by CategoryId, resolving each code through the
# (Class Index, CategoryId) pairs actually present in the data.
label_pairs = dataset[['Class Index', 'CategoryId']].drop_duplicates()
category_id_to_name = {
    int(category_id): class_index_to_name.get(int(class_index), 'Unknown Category')
    for class_index, category_id in zip(label_pairs['Class Index'], label_pairs['CategoryId'])
}

print(f"The news is classified as: {category_id_to_name.get(int(y_pred[0]), 'Unknown Category')}")


In [None]:
pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support as score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
import pandas as pd
import numpy as np


In [None]:
# Encode every description into a dense sentence embedding with a pretrained
# sentence-transformers model.
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

descriptions = dataset['Description'].astype(str)
X = descriptions.tolist()
X_embeddings = bert_model.encode(X, show_progress_bar=True)

# Targets are the encoded category ids, as a NumPy array.
y = np.array(dataset['CategoryId'])


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_embeddings, y, test_size=0.3, random_state=42, shuffle=True)


In [None]:
bert_performance = []

def run_model_on_bert(model_name):
    """Train one classifier on the sentence-embedding split and record its scores.

    The estimator selected by ``model_name`` is wrapped in
    ``OneVsRestClassifier``, fitted on the global ``X_train`` / ``y_train`` and
    evaluated on the global ``X_test`` / ``y_test``. A one-line summary is
    printed and a result row is appended to the global ``bert_performance``.

    Args:
        model_name: One of 'Logistic Regression', 'Random Forest',
            'Multinomial Naive Bayes', 'Gaussian Naive Bayes', 'Decision Tree',
            'KNN', 'SVC'. Any other name is reported and skipped.
    """
    if model_name == 'Logistic Regression':
        model = LogisticRegression(max_iter=1000)
    elif model_name == 'Random Forest':
        model = RandomForestClassifier(n_estimators=100, random_state=0)
    elif model_name == 'Multinomial Naive Bayes':
        # MultinomialNB refuses negative feature values at fit time, and dense
        # sentence embeddings are generally signed. Rescale each feature to
        # [0, 1] (learned on the training rows) so this model can be fitted
        # instead of aborting the whole comparison.
        from sklearn.pipeline import make_pipeline
        from sklearn.preprocessing import MinMaxScaler
        model = make_pipeline(MinMaxScaler(), MultinomialNB())
    elif model_name == 'Gaussian Naive Bayes':
        model = GaussianNB()
    elif model_name == 'Decision Tree':
        model = DecisionTreeClassifier()
    elif model_name == 'KNN':
        model = KNeighborsClassifier(n_neighbors=10)
    elif model_name == 'SVC':
        model = SVC()
    else:
        # Still a no-op for unknown names, but say so rather than skipping silently.
        print(f"Skipping unknown model name: {model_name!r}")
        return

    clf = OneVsRestClassifier(model)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Accuracy is a percentage rounded to two decimals; the other metrics are
    # micro-averaged over all test samples.
    accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
    precision, recall, f1score, support = score(y_test, y_pred, average='micro')

    print(f"{model_name} - Accuracy: {accuracy} | Precision: {precision:.2f} | Recall: {recall:.2f} | F1: {f1score:.2f}")

    bert_performance.append({
        'Model': model_name,
        'Test Accuracy': accuracy,
        'Precision': round(precision, 2),
        'Recall': round(recall, 2),
        'F1': round(f1score, 2)
    })


In [None]:
for model in ['Logistic Regression', 'Random Forest', 'Multinomial Naive Bayes', 'Gaussian Naive Bayes', 'Decision Tree', 'KNN', 'SVC']:
    run_model_on_bert(model)


In [None]:
# Tabulate the sentence-embedding results with a fixed column order.
result_columns = ['Model', 'Test Accuracy', 'Precision', 'Recall', 'F1']
bert_results_df = pd.DataFrame(bert_performance)[result_columns]
print("### BERT Model Performance Comparison ###")
print(bert_results_df)


In [None]:
pip install transformers datasets


In [None]:
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch

# Load the full CSV (not the 10k sample), drop rows with missing values and
# keep only the text and its class.
df = (
    pd.read_csv("/content/train.csv")
    .dropna()
    [['Description', 'Class Index']]
)
df.columns = ['text', 'label']
# Shift the labels down by one so they start at 0.
df['label'] = df['label'] - 1

# 80/20 train/validation split with a fixed seed.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize(example):
    """Tokenize the ``'text'`` field, padding/truncating to exactly 128 tokens."""
    return tokenizer(example['text'], padding='max_length', truncation=True, max_length=128)

def _build_split(texts, labels):
    """Wrap texts and labels in a tokenized ``Dataset`` that yields torch tensors."""
    split = Dataset.from_dict({'text': texts.tolist(), 'label': labels.tolist()})
    split = split.map(tokenize, batched=True)
    split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    return split

train_dataset = _build_split(train_texts, train_labels)
val_dataset = _build_split(val_texts, val_labels)


import inspect

# Pretrained BERT with a fresh 4-way classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

# The per-epoch evaluation option is spelled `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. The notebook
# installs transformers unpinned, so pick whichever name this install accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = 'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_dir='./logs',
    # Evaluate and checkpoint once per epoch; keep only one checkpoint and
    # reload the one with the best accuracy when training finishes.
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    save_strategy='epoch',
    save_total_limit=1,
    **{_eval_strategy_key: 'epoch'},
)


import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Compute accuracy and micro-averaged precision/recall/F1 for an evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each row
            is the arg-max of its logits along axis 1.

    Returns:
        dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1'``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)
    micro_scores = precision_recall_fscore_support(labels, predicted, average='micro')
    precision, recall, f1 = micro_scores[0], micro_scores[1], micro_scores[2]
    return {
        'accuracy': accuracy_score(labels, predicted),
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }


# Wire the model, arguments, datasets and metric function together, then
# start fine-tuning.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

trainer.train()
