In [2]:
from sklearn.datasets import fetch_20newsgroups
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from tabulate import tabulate
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stop_words = stopwords.words('english')


categories = ['rec.autos', 'comp.graphics', 'sci.space']
train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, remove=('headers','footer','quotes'))
test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, remove=('headers','footer','quotes'))
x_train = train.data
y_train = train.target
x_test = test.data
y_test = test.target
print(len(x_train))
print(len(x_test))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zhest\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zhest\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zhest\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


1771
1179


In [3]:
x_train[0:3]

["\n\nCould you use some sort of mechanical chest compression as an aid.\nSorta like the portable Iron Lung?   Put some sort of flex tubing\naround the 'aquanauts' chest.  Cyclically compress it  and it will\npush enough on the chest wall to support breathing?????\n\nYou'd have to trust your breather,  but in space, you have to trust\nyour suit anyway.\n\npat\n",
 "\n\nI too would like a 3D graphics library!  How much do C libraries cost\nanyway?  Can you get the tools used by, say, RenderMan, and can you get\nthem at a reasonable cost?\n\nSorry that I don't have any answers, just questions...\n\nMatt Madsen\nmmadsen@ics.uci.edu\n\n",
 '\nFrom: <tom>\nSubject: computer cult\n\nFrom scott Fri Apr 23 16:31:21 1993\nReceived: by igc.apc.org (4.1/Revision: 1.77 )\n\tid AA16121; Fri, 23 Apr 93 16:31:09 PDT\nDate: Fri, 23 Apr 93 16:31:09 PDT\nMessage-Id: <9304232331.AA16121@igc.apc.org>\nFrom: Scott Weikart <scott>\nSender: scott\nTo: cdplist\nSubject: Next stand-off?\nStatus: R\n\nRedwood C

In [4]:
y_train[0:3]

array([2, 0, 2], dtype=int64)

In [5]:
def lemmatizeText(input):
    lemmatizer = WordNetLemmatizer()
    #Tokenizers divide strings into lists of substrings
    wordList = word_tokenize(input)
    output = ' '.join([lemmatizer.lemmatize(w) for w in wordList])
    return output

In [6]:
def stemText(input):
    stemmer = PorterStemmer()
    wordList = word_tokenize(input)
    output = ' '.join([stemmer.stem(w) for w in wordList])
    return output

In [7]:
def train(x_train, x_test, y_train, y_test, n):
    clf = DecisionTreeClassifier(random_state=n)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    return f1_score(y_test, y_pred, average='weighted')

In [8]:
def textProcessing(unprocessedTexts, function=None):
    if function is None:
        return unprocessedTexts
    return [function(text) for text in unprocessedTexts]

In [10]:
for m in [5,10,15,20,25,30]:
    for n in [30,40,50,60]:
        tableText = [["" for j in range(4)] for i in range(4)]
        tableText[0] = [f"n={n};m={m}", "None", "Stemming", "Lemmatizer"]
        tableText[1][0] = "One-hot"
        tableText[2][0] = "Bag of words"
        tableText[3][0] = "Tf-idf"
        for i in range(3):
            if i==0:
                func = None
            if i==1:
                func = stemText
            if i==2:
                func = lemmatizeText
            preprocessedTrain = textProcessing(x_train, func)
            preprocessedTest = textProcessing(x_test, func)
            for j in range(3):
                if j==0:
                    vect = CountVectorizer(binary=True, stop_words=stop_words)
                if j==1:
                    vect = CountVectorizer(binary=False, stop_words=stop_words)
                if j==2:
                    vect = TfidfVectorizer(stop_words=stop_words)
                train_vec = vect.fit_transform(preprocessedTrain)
                test_vec = vect.transform(preprocessedTest)
    
                f1 = train(train_vec, test_vec, y_train, y_test, n)
                tableText[j+1][i+1] = f1
        list = [max(tableText[i][1:]) for i in range(1,4)]
        maximum = max(list)
        print(f"Maximum: {maximum}")
        print(tabulate(tableText))

Maximum: 0.7575131696218897
------------  ------------------  ------------------  ------------------
n=30;m=5      None                Stemming            Lemmatizer
One-hot       0.7563519548225734  0.7506792262354128  0.7439125836283446
Bag of words  0.7571307245123651  0.7575131696218897  0.7515069047911325
Tf-idf        0.729577193823154   0.7396189071321255  0.7446689600034061
------------  ------------------  ------------------  ------------------
Maximum: 0.7584707003601143
------------  ------------------  ------------------  ------------------
n=40;m=5      None                Stemming            Lemmatizer
One-hot       0.7453063176947534  0.7447810342816077  0.7550361116016518
Bag of words  0.7542755529677643  0.7584707003601143  0.7514507206988321
Tf-idf        0.7313810969218506  0.7396219265171883  0.7395484918977683
------------  ------------------  ------------------  ------------------
Maximum: 0.7556785832828575
------------  ------------------  ------------------  --