In [36]:
import numpy as np
import nltk
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from gensim.utils import simple_preprocess
from gensim.models import TfidfModel, LsiModel
from gensim import corpora
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')

categories = ['rec.autos', 'comp.graphics', 'sci.space']
newsgroup = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, remove=('headers', 'footers', 'quotes'))
print(newsgroup.filenames.shape)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zhest\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(2950,)


In [37]:
tokenized_documents = [simple_preprocess(text) for text in newsgroup.data]
dictionary = corpora.Dictionary(tokenized_documents)
bow_corpus = [dictionary.doc2bow(doc) for doc in tokenized_documents]
modelTf = TfidfModel(bow_corpus)
tf_corpus = [modelTf[corpus_item] for corpus_item in bow_corpus]

In [38]:
n = 10
lsi_model = LsiModel(tf_corpus,id2word=dictionary, num_topics=n)
vectorized_corpus = lsi_model[tf_corpus]

In [39]:
vectorized_corpus_new = []
for i in range(len(vectorized_corpus)):
    curr = []
    for j in range(len(vectorized_corpus[i])):
        curr.append(vectorized_corpus[i][j][1])
    curr = np.array(curr)
    vectorized_corpus_new.append(curr)
print(vectorized_corpus_new[0:3])
print(len(vectorized_corpus_new))
X_train, X_test, y_train, y_test = train_test_split(vectorized_corpus_new, newsgroup.target, test_size=0.33)
for i in range(len(X_train)-1,0,-1):
    if len(X_train[i])!=n:
        X_train.pop(i)
        y_train = np.delete(y_train, i, 0)
for i in range(len(X_test)-1,0,-1):
    if len(X_test[i])!=n:
        X_test.pop(i)
        y_test = np.delete(y_test, i, 0)

[array([ 0.088603  ,  0.02759086, -0.01090217, -0.02113932, -0.01107879,
        0.03481957, -0.033835  ,  0.01503481,  0.04508706, -0.00781211]), array([ 0.22122599,  0.09900455,  0.0738135 ,  0.02001531,  0.14174494,
       -0.10231179,  0.10672488,  0.01477079, -0.00629975, -0.0071981 ]), array([ 0.13278458,  0.05741216,  0.04369362,  0.02890696,  0.04499401,
       -0.00664067,  0.00151987,  0.0031437 ,  0.03665715,  0.00755519])]
2950


In [40]:
def train(X_train, X_test, y_train, y_test):
    clf = DecisionTreeClassifier(random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return f1_score(y_test, y_pred, average='weighted')

In [41]:
print(len(X_train)==len(y_train) and len(X_test)==len(y_test))
print(train(X_train, X_test, y_train, y_test))

True
0.8157443053308473
