In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.datasets import fetch_20newsgroups

In [26]:
train_data = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

In [27]:
train_data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [28]:
train_data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [29]:
train_data.target

array([7, 4, 4, ..., 3, 1, 8])

In [35]:
df = pd.DataFrame(
    columns=['news']
)

In [36]:
df['news'] = train_data['data']
df['target'] = train_data['target']

In [42]:
df.head()

Unnamed: 0,news,target
0,I was wondering if anyone out there could enli...,7
1,A fair number of brave souls who upgraded thei...,4
2,"well folks, my mac plus finally gave up the gh...",4
3,\nDo you have Weitek's address/phone number? ...,1
4,"From article <C5owCB.n3p@world.std.com>, by to...",14


In [22]:
df.describe()

Unnamed: 0,target
count,11314.0
mean,9.293
std,5.562719
min,0.0
25%,5.0
50%,9.0
75%,14.0
max,19.0


In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [57]:
x = [
    "Whether or not to shuffle the data", "might be important for models that", "make the assumption that the",
    "samples are independent", "f a string, it is passed to _check_stop_list", "and the appropriate stop list is returned", 
    "‘english’ is currently the only supported", "string value.", "May contain any subset of (‘headers’, ‘footers’, ‘quotes’)",
    "Each of these are kinds of text that will be detected", "and removed from the newsgroup posts, preventing", "classifiers from overfitting on metadata.",
    "Remove accents and perform other", "character normalization during the preprocessing", "step. ‘ascii’ is a fast method that only", "works on characters that have", "an direct ASCII mapping."
]
tfidfvec = TfidfVectorizer()

In [58]:
f = tfidfvec.fit_transform(x)

In [59]:
tfidfvec.idf_

array([3.19722458, 3.19722458, 3.19722458, 2.5040774 , 3.19722458,
       3.19722458, 2.79175947, 2.79175947, 3.19722458, 2.79175947,
       3.19722458, 3.19722458, 3.19722458, 3.19722458, 3.19722458,
       3.19722458, 3.19722458, 3.19722458, 3.19722458, 3.19722458,
       3.19722458, 3.19722458, 3.19722458, 3.19722458, 2.79175947,
       3.19722458, 3.19722458, 3.19722458, 3.19722458, 2.28093385,
       3.19722458, 3.19722458, 3.19722458, 3.19722458, 3.19722458,
       3.19722458, 3.19722458, 3.19722458, 3.19722458, 3.19722458,
       3.19722458, 3.19722458, 3.19722458, 2.79175947, 2.79175947,
       2.79175947, 3.19722458, 3.19722458, 3.19722458, 3.19722458,
       3.19722458, 3.19722458, 3.19722458, 3.19722458, 3.19722458,
       3.19722458, 3.19722458, 3.19722458, 3.19722458, 3.19722458,
       3.19722458, 3.19722458, 2.79175947, 3.19722458, 3.19722458,
       3.19722458, 2.09861229, 1.94446161, 3.19722458, 2.79175947,
       3.19722458, 3.19722458, 3.19722458, 3.19722458])

In [72]:
tfidfvec.vocabulary_

{'whether': 71,
 'or': 46,
 'not': 42,
 'to': 69,
 'shuffle': 59,
 'the': 67,
 'data': 15,
 'might': 38,
 'be': 9,
 'important': 27,
 'for': 23,
 'models': 39,
 'that': 66,
 'make': 33,
 'assumption': 8,
 'samples': 58,
 'are': 6,
 'independent': 28,
 'string': 62,
 'it': 30,
 'is': 29,
 'passed': 49,
 '_check_stop_list': 0,
 'and': 3,
 'appropriate': 5,
 'stop': 61,
 'list': 32,
 'returned': 57,
 'english': 20,
 'currently': 14,
 'only': 45,
 'supported': 64,
 'value': 70,
 'may': 35,
 'contain': 13,
 'any': 4,
 'subset': 63,
 'of': 43,
 'headers': 26,
 'footers': 22,
 'quotes': 54,
 'each': 19,
 'these': 68,
 'kinds': 31,
 'text': 65,
 'will': 72,
 'detected': 16,
 'removed': 56,
 'from': 24,
 'newsgroup': 40,
 'posts': 51,
 'preventing': 53,
 'classifiers': 12,
 'overfitting': 48,
 'on': 44,
 'metadata': 36,
 'remove': 55,
 'accents': 1,
 'perform': 50,
 'other': 47,
 'character': 10,
 'normalization': 41,
 'during': 18,
 'preprocessing': 52,
 'step': 60,
 'ascii': 7,
 'fast': 21,
 

In [61]:
tfidfvec.get_feature_names

<bound method CountVectorizer.get_feature_names of TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)>

In [64]:
print(f)

  (0, 71)	0.403819788795576
  (0, 46)	0.403819788795576
  (0, 42)	0.403819788795576
  (0, 69)	0.3526082362881726
  (0, 59)	0.403819788795576
  (0, 67)	0.2455917803739083
  (0, 15)	0.403819788795576
  (1, 38)	0.438812253457702
  (1, 9)	0.3831630322398257
  (1, 27)	0.438812253457702
  (1, 23)	0.438812253457702
  (1, 39)	0.438812253457702
  (1, 66)	0.2880300602129512
  (2, 67)	0.6151050960202912
  (2, 66)	0.33193432759520597
  (2, 33)	0.5057001695737242
  (2, 8)	0.5057001695737242
  (3, 58)	0.6016625996187502
  (3, 6)	0.5253610496030467
  (3, 28)	0.6016625996187502
  (4, 69)	0.3891838144044296
  (4, 62)	0.3891838144044296
  (4, 30)	0.4457074723773971
  (4, 29)	0.31797242712551704
  (4, 49)	0.4457074723773971
  :	:
  (12, 55)	0.4655743898402458
  (12, 1)	0.4655743898402458
  (12, 50)	0.4655743898402458
  (12, 47)	0.4655743898402458
  (13, 67)	0.2909322258322533
  (13, 10)	0.4783718323575006
  (13, 41)	0.4783718323575006
  (13, 18)	0.4783718323575006
  (13, 52)	0.4783718323575006
  (14, 66)

In [74]:
tfidfvec.get_feature_names()

['_check_stop_list',
 'accents',
 'an',
 'and',
 'any',
 'appropriate',
 'are',
 'ascii',
 'assumption',
 'be',
 'character',
 'characters',
 'classifiers',
 'contain',
 'currently',
 'data',
 'detected',
 'direct',
 'during',
 'each',
 'english',
 'fast',
 'footers',
 'for',
 'from',
 'have',
 'headers',
 'important',
 'independent',
 'is',
 'it',
 'kinds',
 'list',
 'make',
 'mapping',
 'may',
 'metadata',
 'method',
 'might',
 'models',
 'newsgroup',
 'normalization',
 'not',
 'of',
 'on',
 'only',
 'or',
 'other',
 'overfitting',
 'passed',
 'perform',
 'posts',
 'preprocessing',
 'preventing',
 'quotes',
 'remove',
 'removed',
 'returned',
 'samples',
 'shuffle',
 'step',
 'stop',
 'string',
 'subset',
 'supported',
 'text',
 'that',
 'the',
 'these',
 'to',
 'value',
 'whether',
 'will',
 'works']

In [76]:
print(tfidfvec.get_stop_words())

None


In [86]:
tfidf = TfidfVectorizer()

In [87]:
tfidf_fit = tfidf.fit_transform(df['news'])

In [89]:
dense = tfidf_fit.todense()

In [115]:
data = pd.DataFrame(
    data=dense,
)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,101621,101622,101623,101624,101625,101626,101627,101628,101629,101630
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [96]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [95]:
rfc = RandomForestClassifier(n_jobs=-1, verbose=2)

In [98]:
rfc.fit(data.iloc[:1500, :], df.iloc[:1500, -1])

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


building tree 1 of 10building tree 2 of 10building tree 3 of 10

building tree 4 of 10

building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    5.5s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
                       oob_score=False, random_state=None, verbose=2,
                       warm_start=False)

In [99]:
rfc.fit(data, df.iloc[:, -1])

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


building tree 2 of 10building tree 1 of 10building tree 3 of 10building tree 4 of 10



building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   33.2s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
                       oob_score=False, random_state=None, verbose=2,
                       warm_start=False)

In [100]:
test_data = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

In [101]:
df_test = pd.DataFrame(
    columns=['news']
)

In [102]:
df_test['news'] = test_data['data']
df_test['target'] = test_data['target']

In [104]:
df_test.head()

Unnamed: 0,news,target
0,I am a little confused on all of the models of...,7
1,I'm not familiar at all with the format of the...,5
2,"\nIn a word, yes.\n",0
3,\nThey were attacking the Iraqis to drive them...,17
4,\nI've just spent two solid months arguing tha...,19


In [105]:
tfidf_test = tfidf.transform(df_test['news'])

In [106]:
test_dense = tfidf_test.todense()

In [109]:
data_test = pd.DataFrame(
    data=test_dense,
)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,101621,101622,101623,101624,101625,101626,101627,101628,101629,101630
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.027904,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [110]:
p = rfc.predict(data_test)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    7.5s finished


In [111]:
accuracy_score(p, df_test['target'])

0.4066648964418481

In [119]:
from gensim.parsing import preprocessing, remove_stopwords

In [120]:
FILTERS = [
    preprocessing.strip_numeric,
    preprocessing.strip_punctuation,
    preprocessing.strip_tags,
    preprocessing.strip_non_alphanum,
    preprocessing.strip_multiple_whitespaces,
    preprocessing.stem_text
]

In [121]:
def cleaning(string):
    word_list = preprocessing.preprocess_string(string, FILTERS)
    return ' '.join(word.lower() for word in word_list)

In [128]:
train = []
for i in train_data['data']:
    train.append(cleaning(i))

In [130]:
test = []
for i in test_data['data']:
    test.append(cleaning(i))

In [136]:
df['clean_data'] = train

In [138]:
df.head()

Unnamed: 0,news,target,clean_data
0,I was wondering if anyone out there could enli...,7,i wa wonder if anyon out there could enlighten...
1,A fair number of brave souls who upgraded thei...,4,a fair number of brave soul who upgrad their s...
2,"well folks, my mac plus finally gave up the gh...",4,well folk my mac plu final gave up the ghost t...
3,\nDo you have Weitek's address/phone number? ...,1,do you have weitek s address phone number i d ...
4,"From article <C5owCB.n3p@world.std.com>, by to...",14,from articl cowcb np world std com by tombak w...


In [139]:
tfidf1 = TfidfVectorizer()

In [141]:
tfidf_fit1 = tfidf1.fit_transform(df['clean_data'])

In [142]:
dense1 = tfidf_fit1.todense()

In [143]:
data1 = pd.DataFrame(
    data=dense1,
)
data1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,60113,60114,60115,60116,60117,60118,60119,60120,60121,60122
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [1]:
rfc2 = RandomForestClassifier(n_jobs=-1, verbose=2)

NameError: name 'RandomForestClassifier' is not defined

In [None]:
rfc2.fit(data1, df.iloc[:, -1])

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


building tree 3 of 100building tree 2 of 100building tree 4 of 100building tree 1 of 100



