In [65]:
from sklearn.datasets import fetch_20newsgroups
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize
import nltk
import numpy as np
import scipy as sp
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
from tabulate import tabulate
# Fetch the NLTK resources used below: the stop-word list, the tokenizer
# models behind word_tokenize, and the WordNet data for the lemmatizer.
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

# English stop words, passed to the vectorizers later in the notebook.
stop_words = stopwords.words('english')


# Restrict the corpus to three newsgroups and hold out a third of the
# documents for evaluation; the fixed random_state keeps the split stable.
categories = ['rec.autos', 'comp.graphics', 'sci.space']
dataset = fetch_20newsgroups(subset='all', categories=categories, shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.33,
    random_state=42,
)

# Report the size of each split, one number per line.
print(len(X_train), len(X_test), sep='\n')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zhest\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zhest\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zhest\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


1976
974


In [66]:
# Peek at the first three raw training documents.
X_train[:3]

['From: teckjoo@iti.gov.sg (Chua Teck Joo)\nSubject: Visuallib (3D graphics for Windows)\nOrganization: Information Technology Institute, National Computer Board, Singapore.\nLines: 17\n\n\nI am currently looking for a 3D graphics library that runs on MS\nWindows 3.1.  Are there any such libraries out there other than\nVisuallib?  (It must run on VGA and should not require any other\nadd-on graphics cards).\n\nFor Visuallib, will it run with Metaware High C compiler v3.0?  Any\nemail contact for the author of Visuallib?\n\nAny help would be much appreciated.  Thanks.\n\n\n-- \n* Chua, Teck Joo\t    | Information Technology Institute *\n* Email: teckjoo@iti.gov.sg | 71 Science Park Drive\t       *\n* Phone: (65) 772-0237 \t    | Singapore (0511)\t\t       *\n* Fax:   (65) 779-1827      |\t\t\t   \t       *\n',
 'From: graeme@labtam.labtam.oz.au (Graeme Gill)\nSubject: Re: looking for circle algorithm faster than Bresenhams\nOrganization: Labtam Australia Pty. Ltd., Melbourne, Australia\

In [67]:
# Peek at the labels of the first three training documents.
y_train[:3]

array([0, 0, 1], dtype=int64)

In [68]:
def lemmatizeText(input):
    """Return ``input`` with every token replaced by its WordNet lemma.

    The text is split with ``word_tokenize``, each token is passed through
    ``WordNetLemmatizer.lemmatize`` with its default arguments, and the
    results are joined back together with single spaces.
    """
    wnl = WordNetLemmatizer()
    # word_tokenize turns the string into a list of token substrings.
    lemmas = map(wnl.lemmatize, word_tokenize(input))
    return ' '.join(lemmas)

In [69]:
def stemText(input):
    """Return ``input`` with every token reduced to its Porter stem.

    The text is split with ``word_tokenize``, each token is stemmed with
    ``PorterStemmer.stem``, and the stems are joined with single spaces.
    """
    porter = PorterStemmer()
    tokens = word_tokenize(input)
    return ' '.join(porter.stem(token) for token in tokens)

In [70]:
def train(X_train, X_test, y_train, y_test, n, random_state=42):
    """Fit a gradient-boosting classifier and return its weighted F1 score.

    Parameters
    ----------
    X_train, X_test : matrix-like
        Feature matrices; each is converted to a SciPy CSR matrix before use.
    y_train, y_test : array-like
        Target labels for the training and test rows.
    n : int
        Number of boosting stages (``n_estimators``).
    random_state : int or None, default 42
        Seed forwarded to ``GradientBoostingClassifier`` so repeated runs give
        the same score. Pass ``None`` to get the unseeded behaviour back.

    Returns
    -------
    float
        ``f1_score(y_test, predictions, average='weighted')``.
    """
    # Work on local names instead of rebinding the parameters.
    train_matrix = sp.sparse.csr_matrix(X_train)
    test_matrix = sp.sparse.csr_matrix(X_test)

    clf = GradientBoostingClassifier(n_estimators=n, random_state=random_state)
    clf.fit(train_matrix, y_train)
    y_pred = clf.predict(test_matrix)
    return f1_score(y_test, y_pred, average='weighted')

In [71]:
def textProcessing(unprocessedTexts, function=None):
    """Apply ``function`` to every text and return the results as a list.

    When ``function`` is ``None`` the input collection itself is returned
    unchanged (the same object, not a copy).
    """
    if function is not None:
        return list(map(function, unprocessedTexts))
    return unprocessedTexts

In [75]:
# Experiment grid: number of boosting stages x text preprocessing x
# vectorisation scheme. List order fixes the column / row order of the tables.
estimator_counts = [50,75,100,200,500,750,1000,1250,1500,1750,2000]
prep_labels = ["None", "Stemming", "Lemmatizer"]
prep_funcs = [None, stemText, lemmatizeText]
vect_labels = ["One-hot", "Bag of words", "Tf-idf"]
# Factories rather than instances, so every preprocessing variant gets a
# freshly fitted vectorizer.
vect_factories = [
    lambda: CountVectorizer(binary=True, stop_words=stop_words),
    lambda: CountVectorizer(binary=False, stop_words=stop_words),
    lambda: TfidfVectorizer(stop_words=stop_words),
]

# Preprocessing and vectorisation do not depend on n, so build the nine
# (train, test) feature pairs once instead of once per estimator count.
features = {}
for i, func in enumerate(prep_funcs):
    preprocessedTrain = textProcessing(X_train, func)
    preprocessedTest = textProcessing(X_test, func)
    for j, make_vect in enumerate(vect_factories):
        vect = make_vect()
        train_vec = vect.fit_transform(preprocessedTrain)
        test_vec = vect.transform(preprocessedTest)
        features[i, j] = (train_vec, test_vec)

for n in estimator_counts:
    # Header row, then one row per vectorisation scheme with blank score cells.
    tableText = [[f"n={n}"] + prep_labels]
    tableText += [[label] + [""] * len(prep_labels) for label in vect_labels]

    # Same training order as before: preprocessing outer, vectorizer inner.
    for i in range(len(prep_funcs)):
        for j in range(len(vect_factories)):
            train_vec, test_vec = features[i, j]
            f1 = train(train_vec, test_vec, y_train, y_test, n)
            tableText[j+1][i+1] = f1

    # Best score over all nine combinations (avoids shadowing builtin `list`).
    row_maxima = [max(row[1:]) for row in tableText[1:]]
    maximum = max(row_maxima)
    print(f"Maximum: {maximum}")
    print(tabulate(tableText))

Maximum: 0.924249911420418
------------  ------------------  ------------------  ------------------
n=50          None                Stemming            Lemmatizer
One-hot       0.924249911420418   0.9210317641006033  0.922407580396648
Bag of words  0.9200817488560322  0.9210802312482006  0.9234818039400423
Tf-idf        0.9200922370478629  0.9210377980220511  0.9151860564383651
------------  ------------------  ------------------  ------------------
Maximum: 0.9384247209259112
------------  ------------------  ------------------  ------------------
n=75          None                Stemming            Lemmatizer
One-hot       0.9374857699659098  0.9384247209259112  0.9345007312056554
Bag of words  0.9354528710662061  0.93735513479614    0.9355700165951459
Tf-idf        0.9283467704688332  0.9332002494964734  0.9273288443830497
------------  ------------------  ------------------  ------------------
Maximum: 0.9445475215039638
------------  ------------------  ------------------  ----