In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2,mutual_info_classif
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE

In [21]:
newsgroups_data = fetch_20newsgroups(subset='all',remove=('headers', 'footers', 'quotes'))

In [22]:
vectorizer = TfidfVectorizer(stop_words='english',max_features=10000,token_pattern='[A-Za-z]+',use_idf=False)  # Remove stopwords and apply TF-IDF
ng_group_tf_vector = vectorizer.fit_transform(newsgroups_data.data)
ng_feature_names = vectorizer.get_feature_names_out()

# Problem 3

In [27]:
X = ng_group_tf_vector
y_labels = newsgroups_data.target
chi2_stats, p_values  = chi2(X,y_labels)

In [28]:
chi2_stats

array([ 9.23564887, 53.10781836, 16.10841363, ..., 51.95146857,
        0.15003492,  6.20968181])

In [29]:
top_200_features = np.argsort(chi2_stats)[::-1][:200]

In [30]:
top_200_features

array([9707, 3709, 9833,  868, 1236, 4608, 8293, 7743, 3568, 4061, 1527,
       2877, 4804, 8796, 3837, 4667, 9705, 5262, 4609,  467, 3769,  392,
       8057, 4672, 1444,  468,  424, 5709, 6091, 2671, 5999, 1433, 1259,
       2601, 9185, 3569, 2627, 3738, 1434, 4198,  737, 7859, 5915,  857,
       3293, 7976, 1418, 7876, 3838, 3228, 6184, 4809, 1240, 8093, 1436,
        427, 6610, 8137, 2966, 1753, 7555, 1889, 2972,  871, 2716, 9689,
       5811, 4248, 4964, 4986, 9878, 2592, 7560, 4671, 5682, 2895, 3296,
       2358, 2081, 1138, 3178,  243, 6613, 5750, 6441, 5417, 6563, 9702,
       1041,  752, 2529, 7356, 5000, 6566, 1328, 5669, 5718, 7897,  565,
       1435, 1462, 1086, 9187, 7069, 5781, 3606, 5782, 9914, 5001,  508,
       2875, 6845, 6612, 3422, 9645, 6439, 4767,  466, 5463,  890, 2690,
       1451, 6606, 6395, 3324, 1389, 8171, 8797, 1172, 3632, 7896,  567,
       8294, 4797, 6495, 4884, 5526, 8019, 6305, 4479, 6889, 4043, 8291,
       2709, 6609, 3727, 2421, 5199, 6218, 2409, 64

In [32]:
vocabulary = np.array(ng_feature_names)
i=1
for vocab in vocabulary[top_200_features]:
    print(f'Feature {i}:{vocab}')
    i+=1

Feature 1:windows
Feature 2:god
Feature 3:x
Feature 4:bike
Feature 5:car
Feature 6:israel
Feature 7:space
Feature 8:sale
Feature 9:game
Feature 10:hockey
Feature 11:clipper
Feature 12:encryption
Feature 13:key
Feature 14:team
Feature 15:gun
Feature 16:jesus
Feature 17:window
Feature 18:mac
Feature 19:israeli
Feature 20:armenian
Feature 21:graphics
Feature 22:apple
Feature 23:shipping
Feature 24:jews
Feature 25:church
Feature 26:armenians
Feature 27:arab
Feature 28:motif
Feature 29:offer
Feature 30:drive
Feature 31:nsa
Feature 32:christ
Feature 33:cars
Feature 34:dod
Feature 35:turkish
Feature 36:games
Feature 37:dos
Feature 38:government
Feature 39:christian
Feature 40:ide
Feature 41:baseball
Feature 42:scsi
Feature 43:nhl
Feature 44:bible
Feature 45:file
Feature 46:server
Feature 47:chip
Feature 48:season
Feature 49:guns
Feature 50:fbi
Feature 51:orbit
Feature 52:keys
Feature 53:card
Feature 54:shuttle
Feature 55:christians
Feature 56:arabs
Feature 57:players
Feature 58:sin
Feature 59

In [33]:
X = ng_group_tf_vector[:,top_200_features]
y = newsgroups_data.target
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [34]:
log_reg = LogisticRegression(penalty='l2',solver='liblinear')
log_reg.fit(X_train,y_train)
y_pred = log_reg.predict(X_test)
acc = accuracy_score(y_test,y_pred)

In [35]:
 print(f'Accuracy score:%0.3f'% acc)

Accuracy score:0.536


The accuracy we got here is less compared to what we got in HW_3A the main reason I think is the number of features that we are selecting becomes very important here.

In [38]:
  # Ensure labels are discrete integer
from sklearn.feature_selection import mutual_info_classif

mutual_info_20_ng = mutual_info_classif(ng_group_tf_vector, y_labels,discrete_features=True)



In [43]:
top_200_features = np.argsort(mutual_info_20_ng)[::-1][:200]

In [44]:
vocabulary = np.array(ng_feature_names)
i=1
for vocab in vocabulary[top_200_features]:
    print(f'Feature {i}:{vocab}')
    i+=1

Feature 1:s
Feature 2:t
Feature 3:like
Feature 4:don
Feature 5:just
Feature 6:people
Feature 7:know
Feature 8:time
Feature 9:m
Feature 10:think
Feature 11:use
Feature 12:does
Feature 13:good
Feature 14:way
Feature 15:make
Feature 16:new
Feature 17:d
Feature 18:say
Feature 19:used
Feature 20:did
Feature 21:right
Feature 22:want
Feature 23:ve
Feature 24:x
Feature 25:need
Feature 26:problem
Feature 27:really
Feature 28:work
Feature 29:using
Feature 30:point
Feature 31:e
Feature 32:believe
Feature 33:things
Feature 34:said
Feature 35:years
Feature 36:better
Feature 37:going
Feature 38:help
Feature 39:ll
Feature 40:doesn
Feature 41:c
Feature 42:long
Feature 43:let
Feature 44:sure
Feature 45:read
Feature 46:year
Feature 47:fact
Feature 48:come
Feature 49:question
Feature 50:look
Feature 51:edu
Feature 52:thing
Feature 53:probably
Feature 54:case
Feature 55:little
Feature 56:god
Feature 57:got
Feature 58:number
Feature 59:different
Feature 60:world
Feature 61:course
Feature 62:best
Feature 63

In [45]:
X = ng_group_tf_vector[:,top_200_features]
y = newsgroups_data.target
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

log_reg = LogisticRegression(penalty='l2',solver='liblinear')
log_reg.fit(X_train,y_train)
y_pred = log_reg.predict(X_test)
acc = accuracy_score(y_test,y_pred)

In [48]:
print('Accuracy for 20NG :%0.2f'% acc)

Accuracy for 20NG :0.32
