In [2]:
# https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a

## 1. Load Data

In [3]:
from sklearn.datasets import fetch_20newsgroups
data_train = fetch_20newsgroups(subset='train', shuffle=True)

In [4]:
data_train.target_names, len(data_train.target_names)

(['alt.atheism',
  'comp.graphics',
  'comp.os.ms-windows.misc',
  'comp.sys.ibm.pc.hardware',
  'comp.sys.mac.hardware',
  'comp.windows.x',
  'misc.forsale',
  'rec.autos',
  'rec.motorcycles',
  'rec.sport.baseball',
  'rec.sport.hockey',
  'sci.crypt',
  'sci.electronics',
  'sci.med',
  'sci.space',
  'soc.religion.christian',
  'talk.politics.guns',
  'talk.politics.mideast',
  'talk.politics.misc',
  'talk.religion.misc'],
 20)

In [5]:
data_train.data[0].split('\n')[:9]

["From: lerxst@wam.umd.edu (where's my thing)",
 'Subject: WHAT car is this!?',
 'Nntp-Posting-Host: rac3.wam.umd.edu',
 'Organization: University of Maryland, College Park',
 'Lines: 15',
 '',
 ' I was wondering if anyone out there could enlighten me on this car I saw',
 'the other day. It was a 2-door sports car, looked to be from the late 60s/',
 'early 70s. It was called a Bricklin. The doors were really small. In addition,']

## extract features from text

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
count_vec = CountVectorizer()

In [7]:
# learn the words, dictionary
X_train_counts = count_vec.fit_transform(data_train.data)
X_train_counts.shape

(11314, 130107)

In [8]:
# less frequent words more interesting
from sklearn.feature_extraction.text import TfidfTransformer
tfid_tran = TfidfTransformer()

In [9]:
X_train_tfidf = tfid_tran.fit_transform(X_train_counts)
X_train_tfidf.shape

(11314, 130107)

In [10]:
X_train_tfidf[0]

<1x130107 sparse matrix of type '<class 'numpy.float64'>'
	with 89 stored elements in Compressed Sparse Row format>

In [11]:
## algos

In [12]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, data_train.target)

In [13]:
clf

MultinomialNB()

In [14]:
## pipeline

In [15]:
from sklearn.pipeline import Pipeline

In [16]:
text_clf = Pipeline([('vect',CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', MultinomialNB()),
])

In [17]:
text_clf = text_clf.fit(data_train.data, data_train.target)

In [18]:
text_clf

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [19]:
## performance?

In [20]:
import numpy as np


In [21]:
data_test = fetch_20newsgroups(subset='test', shuffle=True)
pred = text_clf.predict(data_test.data)
np.mean(pred == data_test.target)

0.7738980350504514

In [22]:
## 77% accuracy

In [23]:
data_test.data[0]

'From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. GANDLER)\nSubject: Need info on 88-89 Bonneville\nOrganization: University at Buffalo\nLines: 10\nNews-Software: VAX/VMS VNEWS 1.41\nNntp-Posting-Host: ubvmsd.cc.buffalo.edu\n\n\n I am a little confused on all of the models of the 88-89 bonnevilles.\nI have heard of the LE SE LSE SSE SSEI. Could someone tell me the\ndifferences are far as features or performance. I am also curious to\nknow what the book value is for prefereably the 89 model. And how much\nless than book value can you usually get them for. In other words how\nmuch are they in demand this time of year. I have heard that the mid-spring\nearly summer is the best time to buy.\n\n\t\t\tNeil Gandler\n'

In [24]:
sample = '''A printed circuit board (PCB) mechanically supports and electrically connects electronic components using conductive tracks, pads, and other features etched from copper sheets laminated onto a non-conductive substrate. A printed circuit board has pre-designed copper tracks on a conducting sheet. The pre-defined tracks reduce the wiring, thereby reducing the faults arising due to loose connections. One simply needs to place the components on the PCB and solder them.

In this tutorial, I will show you how you can make circuit boards at home easily. Doing so will save you a lot of time from debugging and double checking the connections on a breadboard. You can even make your own Arduino after going through this DIY Arduino tutorial a'''

In [25]:
res = text_clf.predict([sample])
probs = text_clf.predict_proba([sample])

In [26]:
res, np.max(res), probs

(array([12]),
 12,
 array([[0.0252967 , 0.02998996, 0.03430563, 0.0610039 , 0.05401969,
         0.02958355, 0.0238166 , 0.04432755, 0.03155311, 0.02527666,
         0.03001462, 0.07614875, 0.28861128, 0.03715695, 0.03669201,
         0.05035022, 0.04283766, 0.03444081, 0.03094244, 0.01363192]]))

In [27]:
for i,y in enumerate(data_train.target_names):
    if i==res[0]:
        print(i,y)
    


12 sci.electronics


In [28]:
for i,p in enumerate(probs[0]):
    name = data_train.target_names[i]
    print(f'{i:2}  {p:2.2f} {name}')

 0  0.03 alt.atheism
 1  0.03 comp.graphics
 2  0.03 comp.os.ms-windows.misc
 3  0.06 comp.sys.ibm.pc.hardware
 4  0.05 comp.sys.mac.hardware
 5  0.03 comp.windows.x
 6  0.02 misc.forsale
 7  0.04 rec.autos
 8  0.03 rec.motorcycles
 9  0.03 rec.sport.baseball
10  0.03 rec.sport.hockey
11  0.08 sci.crypt
12  0.29 sci.electronics
13  0.04 sci.med
14  0.04 sci.space
15  0.05 soc.religion.christian
16  0.04 talk.politics.guns
17  0.03 talk.politics.mideast
18  0.03 talk.politics.misc
19  0.01 talk.religion.misc


In [29]:
## tune

In [30]:
from sklearn.model_selection import GridSearchCV
params = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
        'tfidf__use_idf': (True, False),
        'clf__alpha': (1e-2, 1e-3, 1e-4),
}

In [31]:
gs_clf = GridSearchCV(text_clf, params, n_jobs=-1)
gs_clf = gs_clf.fit(data_train.data, data_train.target)

In [32]:
gs_clf.best_score_
gs_clf.best_params_

{'clf__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

In [33]:
preds_tune = gs_clf.predict(data_test.data)

In [35]:
pred = gs_clf.predict(data_test.data)
print(np.mean(pred == data_test.target))

0.8361656930430165


In [None]:
# 83% good enough