## Set up Environment 

In [8]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

## Load data and generate TDM

The vectoriser below uses the `token_pattern` parameter to remove numerics and underscores from the data.
Try it out with the parameter specified and not specified (i.e. using the default which uses a regex of `(?u)\b\w\w+\b` to see the impact on the tokens/features generated.




In [9]:
categories = ['talk.religion.misc','soc.religion.christian', 'sci.med']
newsgroups_train = fetch_20newsgroups(subset='train',
                                     categories=categories,
                                     remove=('headers', 'footers', 'quotes'),
                                     shuffle=True, random_state=42)

X, Y = newsgroups_train.data, newsgroups_train.target

vectorizer = TfidfVectorizer(token_pattern=r'\b[^\d^\_\W]+\b')    

X_vec = vectorizer.fit_transform(X)   #transform training data into TDM


### Examine the features generated

In [10]:
ftr_names= vectorizer.get_feature_names_out()
print("Number of features: %d"  % len(ftr_names))
print("First 100: %s" % ftr_names[:100])


Number of features: 22059
First 100: ['a' 'aa' 'aaai' 'aacc' 'aanerud' 'aaron' 'aaronson' 'aasked' 'ab'
 'abacus' 'abandon' 'abandoned' 'abandoning' 'abandons' 'abates' 'abba'
 'abbasids' 'abbott' 'abbreviated' 'abbreviation' 'abd' 'abdel' 'abdomen'
 'abdominal' 'abduction' 'abdullah' 'abeit' 'aberdeen' 'aberrant'
 'aberration' 'aberrations' 'abhin' 'abhor' 'abhorent' 'abhorrent' 'abide'
 'abideth' 'abiding' 'abilities' 'ability' 'abingdon' 'abington'
 'abiogenesis' 'abjuring' 'ablazing' 'able' 'ably' 'abner' 'abnormal'
 'abnormalities' 'abnormally' 'aboard' 'abode' 'abodes' 'abolish'
 'abolished' 'abolishment' 'abolition' 'abolitionist' 'abolitionists'
 'abomb' 'abomination' 'abortion' 'abou' 'abound' 'abounded' 'about'
 'above' 'abput' 'abraam' 'abraham' 'abrahamic' 'abram' 'abreast' 'abri'
 'abridged' 'abroad' 'abruptly' 'abscence' 'abscess' 'absence' 'absent'
 'absol' 'absolute' 'absolutely' 'absolutes' 'absolutist' 'absolutists'
 'absorbed' 'absorbtion' 'absorption' 'abstacted' 'a

### Remove Stopwords  and add in Document Frequency reduction

In [11]:
vectorizer = TfidfVectorizer(max_df=0.95, min_df=3, token_pattern=r'\b[^\d^\_\W]+\b',stop_words="english")
X_vec = vectorizer.fit_transform(X)   #transform training data

ftr_names= vectorizer.get_feature_names()
print("Number of features: %d"  % len(ftr_names))
print("First 100: %s" % ftr_names[:100])



Number of features: 7163
First 100: ['aaron', 'abandon', 'abandoned', 'abdominal', 'aberrant', 'ability', 'able', 'abnormal', 'abnormalities', 'abolish', 'abolished', 'abortion', 'abraham', 'absence', 'absent', 'absolute', 'absolutely', 'absolutes', 'absolutist', 'absorbed', 'abstinence', 'absurd', 'absurdity', 'abundant', 'abuse', 'ac', 'academia', 'academic', 'accept', 'acceptable', 'acceptance', 'accepted', 'accepting', 'accepts', 'access', 'accident', 'accidentally', 'accompanied', 'accomplish', 'accomplished', 'according', 'accordingly', 'account', 'accountable', 'accounts', 'accumulated', 'accuracy', 'accurate', 'accurately', 'accusation', 'accusations', 'accuse', 'accused', 'accusing', 'accustomed', 'ache', 'achieve', 'achieved', 'acid', 'acidophilus', 'acids', 'acknowledge', 'acknowledged', 'acknowledgement', 'acknowledges', 'acne', 'acquired', 'acsu', 'act', 'acting', 'action', 'actions', 'active', 'actively', 'activities', 'activity', 'acts', 'actual', 'actually', 'acupunctur

## Use the Chi-Squared test for feature selection

In [12]:
chi = SelectKBest(chi2, k=100)    #get top k features 
X_chi_vec= chi.fit_transform(X_vec, Y)  #fit and transform  training data (TDM) into the reduced feature space

mask = chi.get_support(indices=True) # mask returns a list of indices into the original vocabulary/feature space

mask #print out the mask

NameError: name 'SelectKBest' is not defined

In [None]:
## access the mask
for i in mask[:10]:
    print("index: %d, feature name: %s" % (i, ftr_names[i]))



In [None]:
## create a list of the selected features using the mask
new_ftrs = [] # a list to hold your k best features

for i in mask:
      new_ftrs.append(ftr_names[i])
print("Number of features: %d"  % len(new_ftrs))
print("First 100: %s" % new_ftrs[:100])        
        

## Use Mutual Information for feature selection 
Note that the features selected are different from those selected using chi-squared

In [None]:
mi = SelectKBest(mutual_info_classif, k=100)    #get top k features using mutual information
X_mi_vec= mi.fit_transform(X_vec, Y)  #fit and transform  training data (TDM) into the reduced feature space

## create a list of the selected features using the mask
mask = mi.get_support(indices=True) #get list of indices into the original feature vector

new_ftrs = [] # to hold the list of your K best features
for i in mask:
    new_ftrs.append(ftr_names[i])
print("Number of features: %d"  % len(new_ftrs))
print("First 100: %s" % new_ftrs[:100])      


## Perform classification using chi-squared feature selection

In [None]:
categories = ['rec.sport.baseball', 'rec.sport.hockey']
newsgroups_train = fetch_20newsgroups(subset='train',
                                     categories=categories,
                                     remove=('headers', 'footers', 'quotes'),
                                     shuffle=True, random_state=42)

X, Y = newsgroups_train.data, newsgroups_train.target

vectorizer = TfidfVectorizer()

X_vec = vectorizer.fit_transform(X)   #transform training data

chi = SelectKBest(chi2, k=100)    #get top k features 
X_chi_vec= chi.fit_transform(X_vec, Y)  # fit and transform tdm to reduced feature space

newsgroups_test = fetch_20newsgroups(subset='test',     # get test data
                                     categories=categories,
                                     remove=('headers', 'footers', 'quotes'),
                                     shuffle=True,
                                     random_state=42)

vectors_test = vectorizer.transform(newsgroups_test.data)   #transform test data
chi_test = chi.transform(vectors_test)     # transform test data to reduced feature space

classifier = MultinomialNB(alpha=.01)
classifier.fit(X_chi_vec, Y)
predicted = classifier.predict(chi_test)

print(metrics.classification_report(newsgroups_test.target, predicted,
    target_names=newsgroups_train.target_names))