## Set up Environment 

In [19]:
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.naive_bayes import MultinomialNB

## Load data and generate TDM

The vectoriser below uses the `token_pattern` parameter to remove numerics and underscores from the data.
Try it out with the parameter specified and not specified (i.e. using the default, which uses a regex of `(?u)\b\w\w+\b`) to see the impact on the tokens/features generated.




In [7]:
# Restrict the corpus to three newsgroups.
categories = ['talk.religion.misc', 'soc.religion.christian', 'sci.med']

# Fetch the training split with headers, footers and quoted text removed;
# shuffling is seeded so the document order is repeatable.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)

X = newsgroups_train.data
Y = newsgroups_train.target

# The character class excludes digits, '^', '_' and non-word characters,
# so a token is a run of the remaining word characters between word boundaries.
vectorizer = TfidfVectorizer(token_pattern=r'\b[^\d^\_\W]+\b')

# Learn the vocabulary and transform the training data into the TDM.
X_vec = vectorizer.fit_transform(X)


### Examine the features generated

In [9]:
# List the vocabulary learned by the vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
# The result is converted to a plain list so printing and indexing stay the same.
if hasattr(vectorizer, "get_feature_names_out"):
    ftr_names = vectorizer.get_feature_names_out().tolist()
else:
    ftr_names = vectorizer.get_feature_names()

print("Number of features: %d" % len(ftr_names))
print("First 100: %s" % ftr_names[:100])


Number of features: 22059
First 100: ['a', 'aa', 'aaai', 'aacc', 'aanerud', 'aaron', 'aaronson', 'aasked', 'ab', 'abacus', 'abandon', 'abandoned', 'abandoning', 'abandons', 'abates', 'abba', 'abbasids', 'abbott', 'abbreviated', 'abbreviation', 'abd', 'abdel', 'abdomen', 'abdominal', 'abduction', 'abdullah', 'abeit', 'aberdeen', 'aberrant', 'aberration', 'aberrations', 'abhin', 'abhor', 'abhorent', 'abhorrent', 'abide', 'abideth', 'abiding', 'abilities', 'ability', 'abingdon', 'abington', 'abiogenesis', 'abjuring', 'ablazing', 'able', 'ably', 'abner', 'abnormal', 'abnormalities', 'abnormally', 'aboard', 'abode', 'abodes', 'abolish', 'abolished', 'abolishment', 'abolition', 'abolitionist', 'abolitionists', 'abomb', 'abomination', 'abortion', 'abou', 'abound', 'abounded', 'about', 'above', 'abput', 'abraam', 'abraham', 'abrahamic', 'abram', 'abreast', 'abri', 'abridged', 'abroad', 'abruptly', 'abscence', 'abscess', 'absence', 'absent', 'absol', 'absolute', 'absolutely', 'absolutes', 'abso

### Remove Stopwords and add in Document Frequency reduction

In [13]:
# Rebuild the vectorizer with English stop words removed and document-frequency
# pruning: max_df=0.95 drops terms found in more than 95% of documents and
# min_df=3 drops terms found in fewer than 3 documents.
vectorizer = TfidfVectorizer(
    max_df=0.95,
    min_df=3,
    token_pattern=r'\b[^\d^\_\W]+\b',
    stop_words="english",
)
X_vec = vectorizer.fit_transform(X)   # transform training data

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
# The result is converted to a plain list so printing and indexing stay the same.
if hasattr(vectorizer, "get_feature_names_out"):
    ftr_names = vectorizer.get_feature_names_out().tolist()
else:
    ftr_names = vectorizer.get_feature_names()

print("Number of features: %d" % len(ftr_names))
print("First 100: %s" % ftr_names[:100])



Number of features: 7163
First 100: ['aaron', 'abandon', 'abandoned', 'abdominal', 'aberrant', 'ability', 'able', 'abnormal', 'abnormalities', 'abolish', 'abolished', 'abortion', 'abraham', 'absence', 'absent', 'absolute', 'absolutely', 'absolutes', 'absolutist', 'absorbed', 'abstinence', 'absurd', 'absurdity', 'abundant', 'abuse', 'ac', 'academia', 'academic', 'accept', 'acceptable', 'acceptance', 'accepted', 'accepting', 'accepts', 'access', 'accident', 'accidentally', 'accompanied', 'accomplish', 'accomplished', 'according', 'accordingly', 'account', 'accountable', 'accounts', 'accumulated', 'accuracy', 'accurate', 'accurately', 'accusation', 'accusations', 'accuse', 'accused', 'accusing', 'accustomed', 'ache', 'achieve', 'achieved', 'acid', 'acidophilus', 'acids', 'acknowledge', 'acknowledged', 'acknowledgement', 'acknowledges', 'acne', 'acquired', 'acsu', 'act', 'acting', 'action', 'actions', 'active', 'actively', 'activities', 'activity', 'acts', 'actual', 'actually', 'acupunctur

## Use the Chi-Squared test for feature selection

In [14]:
# Keep the 100 features that score highest under the chi-squared test.
chi = SelectKBest(score_func=chi2, k=100)

# Fit the selector on the training TDM and project it onto the reduced feature space.
X_chi_vec = chi.fit_transform(X_vec, Y)

# With indices=True the selector reports the positions of the kept features
# in the original vocabulary/feature space.
mask = chi.get_support(indices=True)

# Leave the mask as the cell's last expression so the notebook displays it.
mask

array([ 208,  242,  485,  530,  558,  599,  602,  634,  804,  827,  828,
        872,  950,  958,  974,  987,  990,  991,  992, 1002, 1380, 1748,
       1765, 1833, 1834, 1893, 1974, 2002, 2026, 2038, 2205, 2364, 2402,
       2413, 2539, 2540, 2665, 2730, 2746, 2860, 2890, 2902, 2915, 3027,
       3232, 3304, 3424, 3445, 3449, 3519, 3561, 3618, 3760, 3775, 3802,
       3841, 3846, 3877, 3879, 3884, 3943, 3946, 4016, 4116, 4117, 4122,
       4159, 4240, 4300, 4355, 4537, 4552, 4609, 4610, 4614, 4711, 4732,
       4798, 5160, 5357, 5499, 5590, 5695, 5715, 5758, 5833, 5902, 5932,
       5997, 6063, 6318, 6349, 6350, 6637, 6669, 6702, 6854, 7017, 7019,
       7142])

In [15]:
# Look up the vocabulary term behind each of the first ten selected indices.
for idx in mask[:10]:
    term = ftr_names[idx]
    print("index: %d, feature name: %s" % (idx, term))



index: 208, feature name: allergic
index: 242, feature name: amorc
index: 485, feature name: authority
index: 530, feature name: banks
index: 558, feature name: batf
index: 599, feature name: belief
index: 602, feature name: believe
index: 634, feature name: bible
index: 804, feature name: cadre
index: 827, feature name: cancer


In [16]:
## create a list of the selected features using the mask
new_ftrs = [] # a list to hold your k best features

for i in mask:
      new_ftrs.append(ftr_names[i])
print("Number of features: %d"  % len(new_ftrs))
print("First 100: %s" % new_ftrs[:100])        
        

Number of features: 100
First 100: ['allergic', 'amorc', 'authority', 'banks', 'batf', 'belief', 'believe', 'bible', 'cadre', 'cancer', 'candida', 'catholic', 'chastity', 'cheers', 'chinese', 'christ', 'christian', 'christianity', 'christians', 'church', 'corn', 'diagnosed', 'diet', 'disease', 'diseases', 'doctor', 'dsl', 'easter', 'edu', 'effects', 'eternal', 'faith', 'father', 'fbi', 'food', 'foods', 'geb', 'god', 'gordon', 'hare', 'health', 'heaven', 'hell', 'hudson', 'information', 'intellect', 'james', 'jesus', 'jim', 'kent', 'koresh', 'lds', 'lord', 'love', 'lyme', 'malcolm', 'man', 'marriage', 'married', 'mary', 'medical', 'medicine', 'migraine', 'moral', 'morality', 'mormons', 'msg', 'needles', 'normal', 'objective', 'pain', 'paradise', 'patient', 'patients', 'paul', 'physician', 'pitt', 'pope', 'ra', 'religion', 'resurrection', 'rosicrucian', 'sci', 'scripture', 'seizures', 'shameful', 'sin', 'skepticism', 'soon', 'spirit', 'surrender', 'symptoms', 'syndrome', 'treatment', 'tr

## Use Mutual Information for feature selection
Note that the features selected are different from those selected using chi-squared.

In [17]:
# Keep the 100 features that score highest under mutual information.
# NOTE(review): X_vec is presumably a sparse TF-IDF matrix; mutual_info_classif's
# default discrete_features='auto' is documented to treat sparse input as discrete,
# which may explain why very common words rank highest here — confirm.
mi = SelectKBest(score_func=mutual_info_classif, k=100)

# Fit the selector on the training TDM and project it onto the reduced feature space.
X_mi_vec = mi.fit_transform(X_vec, Y)

# Positions of the kept features in the original feature vector.
mask = mi.get_support(indices=True)

# Map every selected index back to its term to get the k best feature names.
new_ftrs = [ftr_names[i] for i in mask]

print("Number of features: %d" % len(new_ftrs))
print("First 100: %s" % new_ftrs[:100])


Number of features: 100
First 100: ['actually', 'agree', 'believe', 'best', 'better', 'bible', 'called', 'case', 'christ', 'christian', 'christians', 'church', 'come', 'course', 'd', 'day', 'did', 'didn', 'different', 'does', 'doesn', 'doing', 'don', 'e', 'edu', 'evidence', 'example', 'fact', 'faith', 'far', 'given', 'god', 'going', 'good', 'great', 'group', 'having', 'help', 'jesus', 'just', 'know', 'let', 'life', 'like', 'little', 'll', 'long', 'look', 'lot', 'love', 'm', 'make', 'man', 'matter', 'mean', 'mind', 'need', 'new', 'non', 'people', 'person', 'place', 'point', 'post', 'probably', 'problem', 'question', 'quite', 'read', 'real', 'really', 'reason', 'religion', 'right', 's', 'said', 'say', 'saying', 'says', 'sure', 't', 'tell', 'thing', 'things', 'think', 'time', 'true', 'truth', 'try', 'trying', 'use', 'used', 've', 'want', 'way', 'word', 'work', 'world', 'wrong', 'years']


## Perform classification using chi-squared feature selection

In [18]:
# Two-class problem: baseball vs. hockey posts.
categories = ['rec.sport.baseball', 'rec.sport.hockey']

# Train and test splits are fetched with the same cleaning options and seed.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)
newsgroups_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)

X = newsgroups_train.data
Y = newsgroups_train.target

# The vocabulary is learned from the training data only.
vectorizer = TfidfVectorizer()
X_vec = vectorizer.fit_transform(X)

# Chi-squared selection is also fitted on the training data only, keeping the top 100 features.
chi = SelectKBest(score_func=chi2, k=100)
X_chi_vec = chi.fit_transform(X_vec, Y)

# Apply the already-fitted vectorizer and selector to the test data.
vectors_test = vectorizer.transform(newsgroups_test.data)
chi_test = chi.transform(vectors_test)

# Train multinomial Naive Bayes on the reduced training matrix and predict the test set.
classifier = MultinomialNB(alpha=0.01).fit(X_chi_vec, Y)
predicted = classifier.predict(chi_test)

# Per-class precision/recall/F1 against the true test labels.
report = metrics.classification_report(
    newsgroups_test.target,
    predicted,
    target_names=newsgroups_train.target_names,
)
print(report)

                    precision    recall  f1-score   support

rec.sport.baseball       0.97      0.66      0.79       397
  rec.sport.hockey       0.75      0.98      0.85       399

          accuracy                           0.82       796
         macro avg       0.86      0.82      0.82       796
      weighted avg       0.86      0.82      0.82       796

