In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from gensim.parsing import preprocessing, remove_stopwords

In [2]:
train_data = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

In [3]:
df = pd.DataFrame(
    columns=['news']
)

In [4]:
df['news'] = train_data['data']
df['target'] = train_data['target']
df.head()

Unnamed: 0,news,target
0,I was wondering if anyone out there could enli...,7
1,A fair number of brave souls who upgraded thei...,4
2,"well folks, my mac plus finally gave up the gh...",4
3,\nDo you have Weitek's address/phone number? ...,1
4,"From article <C5owCB.n3p@world.std.com>, by to...",14


In [5]:
test_data = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

In [6]:
df_test = pd.DataFrame(
    columns=['news']
)

In [7]:
df_test['news'] = test_data['data']
df_test['target'] = test_data['target']

In [8]:
df_test.head()

Unnamed: 0,news,target
0,I am a little confused on all of the models of...,7
1,I'm not familiar at all with the format of the...,5
2,"\nIn a word, yes.\n",0
3,\nThey were attacking the Iraqis to drive them...,17
4,\nI've just spent two solid months arguing tha...,19


In [9]:
FILTERS = [
    preprocessing.strip_numeric,
    preprocessing.strip_punctuation,
    preprocessing.strip_tags,
    preprocessing.strip_non_alphanum,
    preprocessing.strip_multiple_whitespaces,
    preprocessing.stem_text
]

In [10]:
def cleaning(string):
    word_list = preprocessing.preprocess_string(string, FILTERS)
    return ' '.join(word.lower() for word in word_list)

In [11]:
train = []
for i in train_data['data']:
    train.append(cleaning(i))

In [12]:
test = []
for i in test_data['data']:
    test.append(cleaning(i))

In [13]:
df['clean_data'] = train

In [14]:
df.head()

Unnamed: 0,news,target,clean_data
0,I was wondering if anyone out there could enli...,7,i wa wonder if anyon out there could enlighten...
1,A fair number of brave souls who upgraded thei...,4,a fair number of brave soul who upgrad their s...
2,"well folks, my mac plus finally gave up the gh...",4,well folk my mac plu final gave up the ghost t...
3,\nDo you have Weitek's address/phone number? ...,1,do you have weitek s address phone number i d ...
4,"From article <C5owCB.n3p@world.std.com>, by to...",14,from articl cowcb np world std com by tombak w...


In [14]:
df_test['clean_test_data'] = test

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
tfidf = TfidfVectorizer()

In [17]:
tfidf_fit = tfidf.fit_transform(df['clean_data'])

In [18]:
dense = tfidf_fit.todense()

In [19]:
data = pd.DataFrame(
    data=dense,
)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,60113,60114,60115,60116,60117,60118,60119,60120,60121,60122
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
tfidf_test = tfidf.transform(df_test['clean_test_data'])

In [21]:
test_dense = tfidf_test.todense()

In [22]:
data_test = pd.DataFrame(
    data=test_dense,
)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,60113,60114,60115,60116,60117,60118,60119,60120,60121,60122
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
rfc = RandomForestClassifier(n_jobs=-1, verbose=2)

In [None]:
rfc.fit(data, df.iloc[:, -1])

