In [2]:
from sklearn.datasets import fetch_20newsgroups
import string
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

Data Collection

In [3]:
ng_train=fetch_20newsgroups(subset='train',remove=('header','footers','quotes'))
ng_test=fetch_20newsgroups(subset='test', remove=('headers','footers','quotes'))

Preprocessing

In [4]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

In [5]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sanja\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sanja\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sanja\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove punctuation
    table = str.maketrans('', '', string.punctuation)
    tokens = [word.translate(table) for word in tokens]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return tokens

In [7]:
ng_train.data = [preprocess_text(text) for text in ng_train.data]
ng_test.data = [preprocess_text(text) for text in ng_test.data]
print(ng_train.data[0])

['', 'lerxst', '', 'wamumdedu', '', 'thing', '', 'subject', '', 'car', '', '', 'nntppostinghost', '', 'rac3wamumdedu', 'organization', '', 'university', 'maryland', '', 'college', 'park', 'line', '', '15', 'wondering', 'anyone', 'could', 'enlighten', 'car', 'saw', 'day', '', '2door', 'sport', 'car', '', 'looked', 'late', '60', 'early', '70', '', 'called', 'bricklin', '', 'door', 'really', 'small', '', 'addition', '', 'front', 'bumper', 'separate', 'rest', 'body', '', 'know', '', 'anyone', 'tellme', 'model', 'name', '', 'engine', 'spec', '', 'year', 'production', '', 'car', 'made', '', 'history', '', 'whatever', 'info', 'funky', 'looking', 'car', '', 'please', 'email', '']


Feature Extraction

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform([' '.join(text) for text in ng_train.data])
X_test_tfidf = tfidf_vectorizer.transform([' '.join(text) for text in ng_test.data])


In [10]:
print("Shape of X_train_tfidf:", X_train_tfidf.shape)
print("Shape of X_test_tfidf:", X_test_tfidf.shape)

Shape of X_train_tfidf: (11314, 112809)
Shape of X_test_tfidf: (7532, 112809)


Model Training and Evaluation

In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm  import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

MultinomialNB

In [12]:
# Verify data alignment
if X_train_tfidf.shape[0] != len(ng_train.target):
    raise ValueError("Number of samples in X_train_tfidf does not match the number of target labels in ng_train")

if X_test_tfidf.shape[0] != len(ng_test.target):
    raise ValueError("Number of samples in X_test_tfidf does not match the number of target labels in ng_test")

In [13]:
clf = MultinomialNB()
clf.fit(X_train_tfidf, ng_train.target)

predictions = clf.predict(X_test_tfidf)

# Evaluate the accuracy of the model
accuracy = accuracy_score(ng_test.target, predictions)
print("Accuracy of the Multinomial Naive Bayes classifier:", accuracy)

Accuracy of the Multinomial Naive Bayes classifier: 0.657328730748805


SVC

In [14]:
from sklearn.svm import SVC

In [15]:
svc_clf = SVC()
svc_clf.fit(X_train_tfidf, ng_train.target)

In [16]:
svc_predictions = svc_clf.predict(X_test_tfidf)

In [17]:
svc_accuracy = accuracy_score(ng_test.target, svc_predictions)
print("Accuracy of the Support Vector Classification (SVC) model:", svc_accuracy)

Accuracy of the Support Vector Classification (SVC) model: 0.659984067976633


K-Means Clustering

In [19]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [36]:
kmeans = KMeans(n_clusters=21, random_state=42)

In [37]:
kmeans.fit(X_train_tfidf)

In [38]:
cluster_labels = kmeans.labels_

In [39]:
silhouette_avg = silhouette_score(X_train_tfidf, cluster_labels)
print("Silhouette Score:", silhouette_avg)

Silhouette Score: 0.005799856662749661


Hieararchical Clustering

In [40]:
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch

In [42]:
agg_clustering = AgglomerativeClustering(n_clusters=20, affinity='euclidean', linkage='ward')
cluster_labels_agg = agg_clustering.fit_predict(X_train_tfidf)

TypeError: AgglomerativeClustering.__init__() got an unexpected keyword argument 'affinity'