In [24]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import f1_score
from gensim.utils import simple_preprocess
from gensim.models import LsiModel, LdaModel
from gensim import corpora
import spacy
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stop_words = stopwords.words('english')
sp = spacy.load('en_core_web_lg')

categories = ['rec.autos', 'comp.graphics', 'sci.space']
newsgroup = fetch_20newsgroups(categories=categories, shuffle=True)
data = newsgroup.data
target = newsgroup.target

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zhest\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zhest\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zhest\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [25]:
def lemmatizeNLTK(input):
    lemmatizer = WordNetLemmatizer()
    #Tokenizers divide strings into lists of substrings
    wordList = word_tokenize(input)
    output = ' '.join([lemmatizer.lemmatize(w) for w in wordList])
    return output

In [26]:
def lemmatizeSpacy(input):
    doc = sp(input)
    #Tokenizers divide strings into lists of substrings
    output = ' '.join([w.lemma_ for w in doc])
    return output

In [27]:
def NExt(input):
    output=""
    doc = sp(input)
    for w in doc:
        if w.pos_=="NOUN":
            output+=w.text+" "
    output = output[:-1]
    return output

In [28]:
def NAExt(input):
    output=""
    doc = sp(input)
    for w in doc:
        if w.pos_ in ["NOUN","ADJ"]:
            output+=w.text+" "
    output = output[:-1]
    return output

In [29]:
def NAVExt(input):
    output=""
    doc = sp(input)
    for w in doc:
        if w.pos_ in ["NOUN","ADJ","VERB"]:
            output+=w.text+" "
    output = output[:-1]
    return output

In [30]:
def NVExt(input):
    output=""
    doc = sp(input)
    for w in doc:
        if w.pos_ in ["NOUN","VERB"]:
            output+=w.text+" "
    output = output[:-1]
    return output

In [31]:
def train(x_train, x_test, y_train, y_test, clf):
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    return f1_score(y_test, y_pred, average='weighted')

In [32]:
def textProcessing(unprocessedTexts, function=None):
    if function is None:
        return unprocessedTexts
    return [function(text) for text in unprocessedTexts]

In [34]:
mapPreProc = ["None","Noun","Noun+Adj","Noun+Adj+Verb","Noun+Verb"]
mapProc = ["None", "LemNLTK", "LemSpacy"]
mapVec = ["BoW","TF-IDF","LSI","LDA"]
mapClf= ["RandomForest","GBM"]
list=[]
for i in range(5):
    if i==0:
        preProc=None
    if i==1:
        preProc=NExt
    if i==2:
        preProc=NAExt
    if i==3:
        preProc=NAVExt
    if i==4:
        preProc=NVExt
    dataExt = textProcessing(data,preProc)
    for j in range(3):
        if j==0:
            proc=None
        if j==1:
            proc=lemmatizeNLTK
        if j==2:
            proc=lemmatizeSpacy
        dataProc = textProcessing(data,proc)
        tokenized_documents = [simple_preprocess(text) for text in dataProc]
        dictionary = corpora.Dictionary(tokenized_documents)
        bow_corpus = [dictionary.doc2bow(doc) for doc in tokenized_documents]
        lsi = LsiModel(bow_corpus,id2word=dictionary, num_topics=10)
        lda = LdaModel(bow_corpus, num_topics=25, id2word=dictionary,passes=15, minimum_probability = 0)
        for k in range(4):
            if k==0:
                vect = CountVectorizer(binary=False, stop_words=stop_words)
            if k==1:
                vect = TfidfVectorizer(stop_words=stop_words)
            if k==2:
                func = lsi
            if k==3:
                func = lda
            X_train, X_test, y_train, y_test = [],[],[],[]
            if k in [0,1]:
                trainProc, testProc, y_train, y_test = train_test_split(dataProc, target, test_size=0.33)
                X_train = vect.fit_transform(trainProc)
                X_test = vect.transform(testProc)
            else:
                dataVecPre = func[bow_corpus]
                len_data = len(dataVecPre)
                dataVec = [0 for o in range(len_data)]
                for o in range(len_data):
                    curr = []
                    a = dataVecPre[o]
                    length = len(a)
                    curr = [a[u][1] for u in range(length)]
                    dataVec[o] = curr
                X_train, X_test, y_train, y_test = train_test_split(dataVec, target, test_size=0.33)
            for l in range(2):
                if l==0:
                    clf = RandomForestClassifier()
                if l==1:
                    clf = GradientBoostingClassifier(n_estimators=125)
                f1 = train(X_train, X_test, y_train, y_test, clf)
                s = " ".join([mapPreProc[i],mapProc[j],mapVec[k],mapClf[l]])
                list.append([f1,s])
                print(s+": "+str(f1))
    print("-------------------------------------------")
listMax = [el[0] for el in list]
maximum = max(listMax)
print(f"Maximum: {maximum}")
print(list[listMax.index(maximum)][1])

None None BoW RandomForest: 0.9207380337022245
None None BoW GBM: 0.9100258590937079
None None TF-IDF RandomForest: 0.9369635078155392
None None TF-IDF GBM: 0.9252082676953167
None None LSI RandomForest: 0.6853879544659208
None None LSI GBM: 0.7092267247910319
None None LDA RandomForest: 0.6513513493124645
None None LDA GBM: 0.6634030166836462
None LemNLTK BoW RandomForest: 0.9557452449178243
None LemNLTK BoW GBM: 0.9336367708551856
None LemNLTK TF-IDF RandomForest: 0.9442641476070008
None LemNLTK TF-IDF GBM: 0.9274919014659586
None LemNLTK LSI RandomForest: 0.7417694155621958
None LemNLTK LSI GBM: 0.7483189877424599
None LemNLTK LDA RandomForest: 0.7524236221265924
None LemNLTK LDA GBM: 0.7848957090580968
None LemSpacy BoW RandomForest: 0.9438319897143427
None LemSpacy BoW GBM: 0.9370861991540401
None LemSpacy TF-IDF RandomForest: 0.9394686847804505
None LemSpacy TF-IDF GBM: 0.9424790343156554
None LemSpacy LSI RandomForest: 0.6992796513777231
None LemSpacy LSI GBM: 0.7162393162393164