In [6]:
import numpy as np
import nltk
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.metrics import f1_score
from gensim.utils import simple_preprocess
from gensim.models import TfidfModel, LdaModel
from gensim import corpora
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')

categories = ['rec.autos', 'comp.graphics', 'sci.space']
newsgroup = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, remove=('headers', 'footers', 'quotes'))
print(newsgroup.filenames.shape)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zhest\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(2950,)


In [7]:
tokenized_documents = [simple_preprocess(text) for text in newsgroup.data]
dictionary = corpora.Dictionary(tokenized_documents)
bow_corpus = [dictionary.doc2bow(doc) for doc in tokenized_documents]
modelTf = TfidfModel(bow_corpus)
tf_corpus = [modelTf[corpus_item] for corpus_item in bow_corpus]

In [8]:
def train(X_train, X_test, y_train, y_test, n):
    clf1 = HistGradientBoostingClassifier()
    clf2 = GradientBoostingClassifier(n_estimators=n)
    clf1.fit(X_train, y_train)
    clf2.fit(X_train, y_train)
    y_pred_hist = clf1.predict(X_test)
    y_pred = clf2.predict(X_test)
    return max(f1_score(y_test, y_pred, average='weighted'),f1_score(y_test, y_pred_hist, average='weighted'))

In [None]:
for n in [10,15,20,25,30,40,50,60,70,80,90,100,120,140,160,180,200]:
    for m in [25,50,75,100,125,150,175,200,250,300,350,400]:
        lda = LdaModel(bow_corpus, num_topics=n, id2word=dictionary,passes=15, minimum_probability = 0)
        vectorized_corpus = lda[bow_corpus]
        vectorized_corpus_new = [0 for i in range(2950)]
        for i in range(len(vectorized_corpus)):
            curr = []
            a = vectorized_corpus[i]
            length = len(a)
            curr = [a[j][1] for j in range(length)]
            vectorized_corpus_new[i] = curr
        X_train, X_test, y_train, y_test = train_test_split(vectorized_corpus_new, newsgroup.target, test_size=0.33)
        print(f"N:{n}, M:{m}, result is {train(X_train, X_test, y_train, y_test,m)}")

N:10, M:25, result is 0.7583169607344485
N:10, M:50, result is 0.7842176503624142
N:10, M:75, result is 0.7396950798680805
N:10, M:100, result is 0.7568939723902268
N:10, M:125, result is 0.7817090389585559
N:10, M:150, result is 0.7566359752210825
N:10, M:175, result is 0.7038941916002998
N:10, M:200, result is 0.7579810120900129
N:10, M:250, result is 0.6375497046390576
N:10, M:300, result is 0.7296919683127188
N:10, M:350, result is 0.7324969754782186
N:10, M:400, result is 0.7861570275981028
N:15, M:25, result is 0.7220211118565166
N:15, M:50, result is 0.8140159686192258
N:15, M:75, result is 0.6667159771634039
N:15, M:100, result is 0.7952901571486587
N:15, M:125, result is 0.8016071791574096
N:15, M:150, result is 0.7531706924092906
N:15, M:175, result is 0.792867235591882
N:15, M:200, result is 0.8130392293508287
N:15, M:250, result is 0.7747472094200003
N:15, M:300, result is 0.7433328896152742
N:15, M:350, result is 0.7696813700464329
N:15, M:400, result is 0.6723495159454135