In [42]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import f1_score
from gensim.utils import simple_preprocess
from gensim.models import LsiModel, LdaModel
from gensim import corpora
import spacy
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stop_words = stopwords.words('english')
sp = spacy.load('en_core_web_lg')

categories = ['rec.autos', 'comp.graphics', 'sci.space']
newsgroup = fetch_20newsgroups(categories=categories, shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(newsgroup.data, newsgroup.target, test_size=0.33)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zhest\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zhest\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zhest\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [43]:
def lemmatizeNLTK(input):
    lemmatizer = WordNetLemmatizer()
    #Tokenizers divide strings into lists of substrings
    wordList = word_tokenize(input)
    output = ' '.join([lemmatizer.lemmatize(w) for w in wordList])
    return output

In [44]:
def lemmatizeSpacy(input):
    doc = sp(input)
    #Tokenizers divide strings into lists of substrings
    output = ' '.join([w.lemma_ for w in doc])
    return output

In [45]:
def NExt(input):
    output=""
    doc = sp(input)
    for w in doc:
        if w.pos_=="NOUN":
            output+=w.text+" "
    output = output[:-1]
    return output

In [46]:
def NAExt(input):
    output=""
    doc = sp(input)
    for w in doc:
        if w.pos_ in ["NOUN","ADJ"]:
            output+=w,text+" "
    output = output[:-1]
    return output

In [47]:
def NAVExt(input):
    output=""
    doc = sp(input)
    for w in doc:
        if w.pos_ in ["NOUN","ADJ","VERB"]:
            output+=w.text+" "
    output = output[:-1]
    return output

In [48]:
def NVExt(input):
    output=""
    doc = sp(input)
    for w in doc:
        if w.pos_ in ["NOUN","VERB"]:
            output+=w.text+" "
    output = output[:-1]
    return output

In [49]:
def train(x_train, x_test, y_train, y_test, clf):
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    return f1_score(y_test, y_pred, average='weighted')

In [50]:
def textProcessing(unprocessedTexts, function=None):
    if function is None:
        return unprocessedTexts
    return [function(text) for text in unprocessedTexts]

In [None]:
mapPreProc = ["None","Noun","Noun+Adj","Noun+Adj+Verb","Noun+Verb"]
mapProc = ["None", "LemNLTK", "LemSpacy"]
mapVec = ["BoW","TF-IDF","LSI","LDA"]
mapClf= ["RandomForest","GBM"]
list=[]
for i in range(4):
    if i==0:
        preProc=None
    if i==1:
        preProc=NExt
    if i==2:
        preProc=NAExt
    if i==3:
        preProc=NAVExt
    if i==4:
        preProc=NVExt
    trainTarget = y_train
    testTarget = y_test
    trainExt = textProcessing(X_train,preProc)
    testExt = textProcessing(X_test,preProc)
    for j in range(3):
        if j==0:
            proc=None
        if j==1:
            proc=lemmatizeNLTK
        if j==2:
            proc=lemmatizeSpacy
        trainProc = textProcessing(trainExt,proc)
        testProc = textProcessing(testExt,proc)
        tokenized_documents = [simple_preprocess(text) for text in trainProc+testProc]
        dictionary = corpora.Dictionary(tokenized_documents)
        bow_corpus = [dictionary.doc2bow(doc) for doc in tokenized_documents]
        tokTrain = [simple_preprocess(text) for text in trainProc]
        tokTest = [simple_preprocess(text) for text in testProc]
        trainBoW = [dictionary.doc2bow(doc) for doc in tokTrain]
        testBoW = [dictionary.doc2bow(doc) for doc in tokTest]
        lsi = LsiModel(bow_corpus,id2word=dictionary, num_topics=10)
        lda = LdaModel(bow_corpus, num_topics=25, id2word=dictionary,passes=15, minimum_probability = 0)
        for k in range(4):
            if k==0:
                vect = CountVectorizer(binary=False, stop_words=stop_words)
            if k==1:
                vect = TfidfVectorizer(stop_words=stop_words)
            if k==2:
                func = lsi
            if k==3:
                func = lda
            if k in [0,1]:
                trainVec = vect.fit_transform(trainProc)
                testVec = vect.transform(testProc)
            else:
                trainVecPre = func[trainBoW]
                testVecPre = func[testBoW]
                trainVec = []
                for p in range(len(trainVecPre)):
                    curr = []
                    a = trainVecPre[p]
                    length = len(a)
                    curr = np.array([a[b][1] for b in range(length)])
                    trainVec.append(curr)
                testVec = []
                for p in range(len(testVecPre)):
                    curr = []
                    a = trainVecPre[p]
                    length = len(a)
                    curr = np.array([a[b][1] for b in range(length)])
                    testVec.append(curr)
                for i in range(len(trainVec)-1,0,-1):
                    if len(trainVec[i])==0:
                        trainVec.pop(i)
                        trainTarget = np.delete(trainTarget, i, 0)
                for i in range(len(testVec)-1,0,-1):
                    if len(testVec[i])==0:
                        testVec.pop(i)
                        testTarget = np.delete(testTarget, i, 0)
            for l in range(2):
                if l==0:
                    clf = RandomForestClassifier()
                if l==1:
                    clf = GradientBoostingClassifier(n_estimators=125)
                f1 = train(trainVec, testVec, trainTarget, testTarget,clf)
                s = " ".join([mapPreProc[i],mapProc[j],mapVec[k],mapClf[l]])
                list.append([f1,s])
                print(s+": "+str(f1))
listMax = [el[0] for el in list]
maximum = max(listMax)
print(f"Maximum: {maximum}")

None None BoW RandomForest: 0.9507556984207832
None None BoW GBM: 0.9489297483095581
None None TF-IDF RandomForest: 0.9524163163077691
None None TF-IDF GBM: 0.942089878868001
Noun None LSI RandomForest: 0.32917755634865115
Noun None LSI GBM: 0.33365419975349053
Noun None LDA RandomForest: 0.32585453053917934
Noun None LDA GBM: 0.3270353861985508
