In [3]:
!pip install scikit-learn
import sklearn
from sklearn.datasets import fetch_20newsgroups

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.7.1-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.1-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.1-cp311-cp311-win_amd64.whl (8.9 MB)
   ---------------------------------------- 0.0/8.9 MB ? eta -:--:--
   ------- -------------------------------- 1.6/8.9 MB 9.4 MB/s eta 0:00:01
   --------------- ------------------------ 3.4/8.9 MB 8.8 MB/s eta 0:00:01
   ------------------------ --------------- 5.5/8.9 MB 9.1 MB/s eta 0:00:01
   ----------------------------- ---------- 6.6/8.9 MB 8.1 MB/s eta 0:00:01
   ------------------------------------ --- 8.1/8.9 MB 7.9 MB/s eta 0:00:01
   ---------------------------------------- 8.9/8.9 MB 7.7 MB/s  0:00:01


In [4]:
import pandas as pd

In [14]:
#Load the data
posts = fetch_20newsgroups(subset = 'all', categories =['sci.electronics', 'sci.space'], remove = ('headers', 'footers', 'quotes'))

In [15]:
#See the available keys in dataset
print(posts.keys())

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])


In [16]:
#Display One data point
print(posts.data[1])

AL>>        Question:   Is there a certain device out there that I can
AL>>                    use to find out the number to the line?
AL>>        Thanks for any response.
AL>>                                                    Al

AL>There is a number you can call which will return a synthesized
AL>voice telling you the number of the line.  Unfortunately, for the
AL>life of me I can't remember what it is. The telephone technicians
AL>use it all the time.  We used to play around with this in our
AL>dorm rooms since there were multiple phone lines running between
AL>rooms.

It probably wouldn't help for you to post the number, since it appears
to be different in each area.  For what it's worth, in the New Orleans
area the number is 998-877-6655 (easy to remember, what?)


 * SLMR 2.1 * Ask me anything: if I don't know, I'll make up something.
                                          


In [17]:
#Create a data Frame
df = pd.DataFrame({
    'text': posts.data,
    'label': [posts.target_names[target] for target in posts.target]})

In [18]:
print(df.head())

                                                text            label
0  \n   >\tIf the  new  Kuiper belt object *is*  ...        sci.space
1  AL>>        Question:   Is there a certain dev...  sci.electronics
2  \nIt's not quite what you were asking, but a f...        sci.space
3  \n\n\nNo, the sky does not, at this time, belo...        sci.space
4   \nDigi-Key also sells Quad Line Receivers, pa...  sci.electronics


In [12]:
print(df.shape)

(1971, 2)


In [13]:
#Data Cleaning and Preprocessing
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sampa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sampa\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sampa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [22]:
def clean_text(text):
    #Tokenization
    tokens = word_tokenize(text)

    #Remove tokens that are not purely letters
    tokens = [word for word in tokens if word.isalpha()]

    #Lowercase the text
    tokens = [word.lower() for word in tokens]

    #Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    #Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    #Join tokens back into a single string
    clean_text = ' '.join(tokens)
    return clean_text

In [23]:
#Apply this above clean_text function to the dataset
df['clean_text'] = df['text'].apply(clean_text)

In [24]:
print(df.head())

                                                text            label  \
0  \n   >\tIf the  new  Kuiper belt object *is*  ...        sci.space   
1  AL>>        Question:   Is there a certain dev...  sci.electronics   
2  \nIt's not quite what you were asking, but a f...        sci.space   
3  \n\n\nNo, the sky does not, at this time, belo...        sci.space   
4   \nDigi-Key also sells Quad Line Receivers, pa...  sci.electronics   

                                          clean_text  
0  new kuiper belt object called next one called ...  
1  al question certain device al use find number ...  
2  quite asking year ago helped ee remote sensing...  
3  sky time belong anyone ownership necessary def...  
4  also sell quad line receiver part quad line dr...  


In [25]:
clean_data = df[['clean_text', 'label']]
print(clean_data.head())

                                          clean_text            label
0  new kuiper belt object called next one called ...        sci.space
1  al question certain device al use find number ...  sci.electronics
2  quite asking year ago helped ee remote sensing...        sci.space
3  sky time belong anyone ownership necessary def...        sci.space
4  also sell quad line receiver part quad line dr...  sci.electronics


In [27]:
#Perform train test split
X = df['clean_text']
y = df['label']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= 0.25, random_state = 42)

In [28]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(1478,) (1478,) (493,) (493,)


In [29]:
#Vectorize the data
#Bag-of-words
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(min_df=10)

#Fit and transform the training data
X_train_counts = count_vect.fit_transform(X_train)

In [30]:

#Transform the test data
X_test_counts = count_vect.transform(X_test)

In [31]:
#Display Feature Names

counts_df = pd.DataFrame(X_train_counts.toarray(), columns = count_vect.get_feature_names_out())
print(counts_df.head())

   ability  able  absolutely  ac  acceleration  accepted  access  according  \
0        0     0           0   0             0         0       0          0   
1        0     0           0   0             0         0       0          0   
2        0     0           0   0             0         0       0          0   
3        0     0           0   0             0         0       0          0   
4        0     0           0   0             0         0       0          0   

   account  accuracy  ...  wrong  wrote  yeah  year  yellow  yes  yesterday  \
0        0         0  ...      0      0     0     0       0    0          0   
1        0         0  ...      0      0     0     0       0    0          0   
2        0         0  ...      0      0     0     0       0    0          0   
3        0         0  ...      0      0     0     0       0    0          0   
4        0         0  ...      0      0     0     0       0    0          0   

   yet  york  zero  
0    0     0     0  
1    0  

In [32]:

count_vect_ngram = CountVectorizer(min_df=10, ngram_range = (2,2))

In [33]:

#Fit and transform the training data
X_train_counts = count_vect_ngram.fit_transform(X_train)
#Transform the test data
X_test_counts = count_vect_ngram.transform(X_test)
#Display Feature Names

counts_ngram_df = pd.DataFrame(X_train_counts.toarray(), columns = count_vect_ngram.get_feature_names_out())
print(counts_ngram_df.head())

   al uucp  also available  ames dryden  anonymous ftp  answer question  \
0        0               0            0              0                0   
1        0               0            0              0                0   
2        0               0            0              0                0   
3        0               0            0              0                0   
4        0               0            0              0                0   

   anybody know  anyone know  appreciated thanks  around office  \
0             0            0                   0              0   
1             0            0                   0              0   
2             0            0                   0              0   
3             0            0                   0              0   
4             0            0                   0              0   

   available via  ...  would appreciated  would go  would greatly  would help  \
0              0  ...                  0         0              0

In [35]:
#TF- IDF 
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(max_df = 0.7, min_df = 0.01)

In [36]:
tfidf_train = tfidf_vect.fit_transform(X_train)
tfidf_test = tfidf_vect.transform(X_test)

In [38]:
#Display Feature Names 

tfidf_df = pd.DataFrame(tfidf_train.toarray(), columns = tfidf_vect.get_feature_names_out())
print(tfidf_df.head())

   able  absolutely   ac  access  according  across  act  action  active  \
0   0.0         0.0  0.0     0.0        0.0     0.0  0.0     0.0     0.0   
1   0.0         0.0  0.0     0.0        0.0     0.0  0.0     0.0     0.0   
2   0.0         0.0  0.0     0.0        0.0     0.0  0.0     0.0     0.0   
3   0.0         0.0  0.0     0.0        0.0     0.0  0.0     0.0     0.0   
4   0.0         0.0  0.0     0.0        0.0     0.0  0.0     0.0     0.0   

   activity  ...  world  worse     worth     would  write  written  wrong  \
0       0.0  ...    0.0    0.0  0.000000  0.000000    0.0      0.0    0.0   
1       0.0  ...    0.0    0.0  0.000000  0.100928    0.0      0.0    0.0   
2       0.0  ...    0.0    0.0  0.000000  0.148228    0.0      0.0    0.0   
3       0.0  ...    0.0    0.0  0.109279  0.094371    0.0      0.0    0.0   
4       0.0  ...    0.0    0.0  0.000000  0.000000    0.0      0.0    0.0   

   year  yes  yet  
0   0.0  0.0  0.0  
1   0.0  0.0  0.0  
2   0.0  0.0  0.0  


In [41]:
#Train the classifier and predict the test data
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

nb = MultinomialNB()
nb.fit(X_train_counts, y_train)

y_pred = nb.predict(X_test_counts )
metrics.accuracy_score(y_test,y_pred)


0.59026369168357

In [40]:
labels=['sci.electronics', 'sci.space']

cm = metrics.confusion_matrix(y_test, y_pred, labels=labels)

#Crate a df from confusion matrix

cm_df = pd.DataFrame(cm, index=labels, columns=labels)


#Print the confusion matrix
print("Confusion Matrix:", cm_df)

Confusion Matrix:                  sci.electronics  sci.space
sci.electronics               93        156
sci.space                     46        198
