In [130]:
import numpy as np
import nltk
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from gensim.utils import simple_preprocess
from gensim.models import TfidfModel, LdaModel
from gensim import corpora
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')

categories = ['rec.autos', 'comp.graphics', 'sci.space']
newsgroup = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, remove=('headers', 'footers', 'quotes'))
print(newsgroup.filenames.shape)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zhest\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(2950,)


In [131]:
tokenized_documents = [simple_preprocess(text) for text in newsgroup.data]
dictionary = corpora.Dictionary(tokenized_documents)
bow_corpus = [dictionary.doc2bow(doc) for doc in tokenized_documents]
modelTf = TfidfModel(bow_corpus)
tf_corpus = [modelTf[corpus_item] for corpus_item in bow_corpus]

In [132]:
def train(X_train, X_test, y_train, y_test):
    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return f1_score(y_test, y_pred, average='macro')

In [134]:
for n in [15,20,25,30,40,50,75,80,85,90,100,110,120,140,150,170,180,190,200]:
    lda = LdaModel(tf_corpus, num_topics=n, id2word=dictionary,passes=15, minimum_probability = 0)
    vectorized_corpus = lda[tf_corpus]
    vectorized_corpus_new = [0 for i in range(2950)]
    for i in range(len(vectorized_corpus)):
        curr = []
        a = vectorized_corpus[i]
        length = len(a)
        curr = [a[j][1] for j in range(n)]
        vectorized_corpus_new[i] = curr
    X_train, X_test, y_train, y_test = train_test_split(vectorized_corpus_new, newsgroup.target, test_size=0.33)
    print(f"N:{n},result is {train(X_train, X_test, y_train, y_test)}")

N:15,result is 0.5484808136068366
N:20,result is 0.4608220302375758
N:25,result is 0.49287386055678734
N:30,result is 0.4843593410237233
N:40,result is 0.5581355556950541
N:50,result is 0.5216011295369951
N:75,result is 0.5576574937956097
N:80,result is 0.530712778385475
N:85,result is 0.5843322984228807
N:90,result is 0.6286907409655756
N:100,result is 0.607895615253138
N:110,result is 0.6084458959465465
N:120,result is 0.6223888243894756
N:140,result is 0.5794906899622526
N:150,result is 0.5484212178242035
N:170,result is 0.6727900344881567
N:180,result is 0.6394013537863719
N:190,result is 0.6401426308097672
N:200,result is 0.6263596067733478
