In [81]:
from sklearn.datasets import fetch_20newsgroups
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from tabulate import tabulate
# Fetch the NLTK resources used below: the English stop-word list, the
# tokenizer models used by word_tokenize, and the WordNet lemmatizer data.
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)
stop_words = stopwords.words('english')


categories = ['rec.autos', 'comp.graphics', 'sci.space']
# The training Bunch is deliberately NOT bound to the name `train`: a later
# cell defines a function called `train`, and re-running this cell would
# otherwise rebind that name to a Bunch and break the calls to `train(...)`.
train_set = fetch_20newsgroups(subset='train', categories=categories, shuffle=True)
test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True)
x_train = train_set.data
y_train = train_set.target
x_test = test.data
y_test = test.target
# Number of training and test documents, in that order.
print(len(x_train))
print(len(x_test))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zhest\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zhest\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zhest\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


1771
1179


In [82]:
# Peek at the first three raw training documents.
x_train[:3]

["From: prb@access.digex.com (Pat)\nSubject: Re: Abyss: breathing fluids\nArticle-I.D.: access.1psghn$s7r\nOrganization: Express Access Online Communications USA\nLines: 19\nNNTP-Posting-Host: access.digex.net\n\nIn article <C4t3K3.498@cck.coventry.ac.uk> enf021@cck.coventry.ac.uk (Achurist) writes:\n|\n|I believe the reason is that the lung diaphram gets too tired to pump\n|the liquid in and out and simply stops breathing after 2-3 minutes.\n|So if your in the vehicle ready to go they better not put you on \n|hold, or else!! That's about it. Remember a liquid is several more times\n|as dense as a gas by its very nature. ~10 I think, depending on the gas\n|and liquid comparision of course!\n\n\nCould you use some sort of mechanical chest compression as an aid.\nSorta like the portable Iron Lung?   Put some sort of flex tubing\naround the 'aquanauts' chest.  Cyclically compress it  and it will\npush enough on the chest wall to support breathing?????\n\nYou'd have to trust your breather,

In [83]:
# Peek at the labels of the first three training documents.
y_train[:3]

array([2, 0, 2], dtype=int64)

In [84]:
def lemmatizeText(input):
    """Lemmatize every token of a text and re-join the tokens with spaces.

    The text is split with NLTK's ``word_tokenize`` and each token is passed
    to ``WordNetLemmatizer.lemmatize`` without a part-of-speech argument.

    Parameters
    ----------
    input : str
        Raw document text.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    wordnet = WordNetLemmatizer()
    lemmas = []
    # word_tokenize breaks the string into a list of token substrings.
    for token in word_tokenize(input):
        lemmas.append(wordnet.lemmatize(token))
    return ' '.join(lemmas)

In [85]:
def stemText(input):
    """Stem every token of a text and re-join the tokens with spaces.

    The text is split with NLTK's ``word_tokenize`` and each token is passed
    to ``PorterStemmer.stem``.

    Parameters
    ----------
    input : str
        Raw document text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    porter = PorterStemmer()
    tokens = word_tokenize(input)
    return ' '.join(map(porter.stem, tokens))

In [86]:
def train(x_train, x_test, y_train, y_test, n):
    """Fit a decision tree and score it on the test split.

    Parameters
    ----------
    x_train, x_test
        Feature matrices for the training and test documents.
    y_train, y_test
        Class labels matching ``x_train`` and ``x_test``.
    n
        Value passed to ``DecisionTreeClassifier`` as ``random_state``.

    Returns
    -------
    The weighted-average F1 score of the tree's predictions on ``x_test``.
    """
    # fit() returns the fitted estimator, so construction and fitting chain.
    tree = DecisionTreeClassifier(random_state=n).fit(x_train, y_train)
    predictions = tree.predict(x_test)
    score = f1_score(y_test, predictions, average='weighted')
    return score

In [87]:
def textProcessing(unpocessedTexts, function=None):
    """Apply an optional per-document transformation to a collection of texts.

    Parameters
    ----------
    unpocessedTexts
        Iterable of raw documents.
    function
        Callable applied to each document, or ``None`` for no processing.

    Returns
    -------
    The input object itself when ``function`` is ``None``; otherwise a new
    list holding ``function(text)`` for every text, in order.
    """
    if function is not None:
        return [*map(function, unpocessedTexts)]
    return unpocessedTexts

In [88]:
# Text normalisations (table columns) and vectorizer factories (table rows).
preprocessors = [
    ("None", None),
    ("Stemming", stemText),
    ("Lemmatizer", lemmatizeText),
]
vectorizers = [
    ("One-hot", lambda: CountVectorizer(binary=True, stop_words=stop_words)),
    ("Bag of words", lambda: CountVectorizer(binary=False, stop_words=stop_words)),
    ("Tf-idf", lambda: TfidfVectorizer(stop_words=stop_words)),
]

# Neither the preprocessing nor the vectorization depends on the seed, so the
# nine (train, test) feature-matrix pairs are built once here instead of being
# recomputed for every seed in the loop below.
features = {}
for col, (_, func) in enumerate(preprocessors):
    preprocessedTrain = textProcessing(x_train, func)
    preprocessedTest = textProcessing(x_test, func)
    for row, (_, make_vect) in enumerate(vectorizers):
        vect = make_vect()
        # Fit the vocabulary on the training texts only, then reuse it for test.
        train_vec = vect.fit_transform(preprocessedTrain)
        test_vec = vect.transform(preprocessedTest)
        features[row, col] = (train_vec, test_vec)

# One table per decision-tree seed: rows are vectorizers, columns are
# preprocessing variants, cells are weighted F1 scores on the test split.
for n in [10,20,30,40,50,60,70,80]:
    tableText = [[f"n={n}"] + [name for name, _ in preprocessors]]
    for row, (vect_name, _) in enumerate(vectorizers):
        scores = [
            train(*features[row, col], y_train, y_test, n)
            for col in range(len(preprocessors))
        ]
        tableText.append([vect_name] + scores)
    # Best score in the table; computed without binding the builtin name `list`.
    maximum = max(max(table_row[1:]) for table_row in tableText[1:])
    print(f"Maximum: {maximum}")
    print(tabulate(tableText))

Maximum: 0.8306363451454507
------------  ------------------  ------------------  ------------------
n=10          None                Stemming            Lemmatizer
One-hot       0.8252365096951894  0.8094146391955721  0.8306363451454507
Bag of words  0.8244443231912999  0.8262304473291578  0.8060278518435906
Tf-idf        0.8179071050196688  0.8071139561958257  0.8109129939261458
------------  ------------------  ------------------  ------------------
Maximum: 0.834755964565014
------------  ------------------  ------------------  ------------------
n=20          None                Stemming            Lemmatizer
One-hot       0.8303187281517309  0.824595635867611   0.8086049828896322
Bag of words  0.834755964565014   0.8215246741694298  0.8220792009567596
Tf-idf        0.8145501817775157  0.8015531808896347  0.8117518482373658
------------  ------------------  ------------------  ------------------
Maximum: 0.8286168694591729
------------  ------------------  ------------------  ---