In [1]:
# Program used for performing the classification of data
import spacy
from sklearn.base import TransformerMixin
from nltk import word_tokenize

# Create a spaCy parser
# nlp = spacy.load('en_core_web_sm')

class BagOfWords(TransformerMixin):
    """Turn raw text documents into binary bag-of-words dictionaries.

    Each document is tokenized with NLTK's ``word_tokenize`` and mapped to a
    dictionary whose keys are the distinct tokens and whose values are all
    ``True``; how often a token occurs is not recorded.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data: return the transformer itself."""
        return self

    def transform(self, X):
        """Return a list with one ``{token: True}`` dictionary per document in ``X``."""
        presence_dicts = []
        for text in X:
            presence_dicts.append(dict.fromkeys(word_tokenize(text), True))
        return presence_dicts

In [2]:
# Importing DictVectorizer to convert the dictionaries into a matrix
from sklearn.feature_extraction import DictVectorizer

In [3]:
# Importing the BernoulliNB classifier to use on our dataset
from sklearn.naive_bayes import BernoulliNB

In [4]:
import os

# Both data files live in the same Chapter06 data folder under the user's home
chapter06_data_dir = os.path.join(
    os.path.expanduser('~'),
    'OneDrive', 'Desktop', 'Pace', 'CS619', 'Chapter06', 'Data',
)
input_filename = os.path.join(chapter06_data_dir, 'python_tweets.json')
labels_filename = os.path.join(chapter06_data_dir, 'python_classes.json')

In [5]:
# Loading the tweets
import json

# Read one JSON-encoded tweet per non-blank line and keep only its 'text'
# field. The encoding is stated explicitly: JSON text is UTF-8, while open()
# without an encoding falls back to a platform-dependent default.
tweets = []
with open(input_filename, encoding='utf-8') as inf:
    for line in inf:
        if not line.strip():
            # Skip blank separator lines between records
            continue
        tweets.append(json.loads(line)['text'])
print(f'Loaded {len(tweets)} tweets')

# The labels file is parsed in one go as a single JSON document
with open(labels_filename, encoding='utf-8') as inf:
    labels = json.load(inf)
    
# Ensure only classified tweets are loaded
# tweets = tweets[:len(labels)]
# assert len(tweets) == len(labels)

Loaded 100 tweets


In [6]:
# Only as many samples as have both a tweet and a label can be used
tweet_count, label_count = len(tweets), len(labels)
n_samples = tweet_count if tweet_count <= label_count else label_count

In [7]:
# Lower-case the tweets that will be used and trim the labels to the same length
sample_tweets = list(map(str.lower, tweets[:n_samples]))
labels = labels[:n_samples]

In [8]:
import numpy as np
# Labels as a NumPy array so they can be compared element-wise
y_true = np.asarray(labels)

In [9]:
# Share of samples whose label equals 1, expressed as a percentage
positive_share = (y_true == 1).mean() * 100
print(f'{positive_share:.1f}% have class 1')

52.0% have class 1


In [10]:
# Creating a pipeline with all components together
from sklearn.pipeline import Pipeline

# Tokenize -> vectorize -> classify, chained into a single estimator
pipeline_steps = [
    ('bag-of-words', BagOfWords()),
    ('vectorizer', DictVectorizer()),
    ('naive-bayes', BernoulliNB()),
]
pipeline = Pipeline(pipeline_steps)

In [11]:
# Evaluating the pipeline on the dataset with cross-validation, scored by F1
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer

# 10-fold cross-validation of the whole pipeline, scored with F1
scores = cross_val_score(
    estimator=pipeline,
    X=sample_tweets,
    y=y_true,
    scoring='f1',
    cv=10,
)

# Report the mean F1 across the folds
mean_f1 = scores.mean()
print(f'Score: {mean_f1:.3f}')

Score: 0.843


In [12]:
# Fit the pipeline on every labelled sample. fit() returns the pipeline
# itself, so `model` is the same (now fitted) object as `pipeline`.
pipeline.fit(sample_tweets, labels)
model = pipeline
model

Pipeline(memory=None,
         steps=[('bag-of-words',
                 <__main__.BagOfWords object at 0x0000020561A59880>),
                ('vectorizer',
                 DictVectorizer(dtype=<class 'numpy.float64'>, separator='=',
                                sort=True, sparse=True)),
                ('naive-bayes',
                 BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None,
                             fit_prior=True))],
         verbose=False)

In [13]:
# Pull the fitted classifier out of the pipeline and keep a handle on its
# per-class feature log-probabilities
fitted_steps = dict(model.steps)
nb = fitted_steps['naive-bayes']
feature_probabilities = nb.feature_log_prob_

In [14]:
# Rank every feature by its log-probability in row 1 (highest first), then
# keep at most the first 1000 positions of that ranking. The slice has to come
# after the sort: slicing first would rank only the first 1000 feature columns
# and silently leave the remaining features out of the ranking.
top_features = np.argsort(-nb.feature_log_prob_[1])[:1000]

In [15]:
# Fetch the fitted vectorizer step; its feature_names_ translate feature
# indices back into tokens
dv = dict(model.steps)['vectorizer']

In [16]:
# Print rank, feature name and probability (exp of the log-probability stored
# in row 1) for each of the top features
row_log_probs = feature_probabilities[1]
for rank, column in enumerate(top_features):
    feature_name = dv.feature_names_[column]
    print(rank, feature_name, np.exp(row_log_probs[column]))

0 : 0.9444444444444443
1 python 0.8703703703703701
2 https 0.7777777777777778
3 # 0.6481481481481479
4 @ 0.6481481481481479
5 rt 0.48148148148148145
6 | 0.40740740740740744
7 automated 0.40740740740740744
8 a 0.3703703703703703
9 , 0.35185185185185175
10 . 0.27777777777777773
11 to 0.2592592592592592
12 and 0.2407407407407407
13 hasdid 0.2222222222222222
14 for 0.2222222222222222
15 with 0.2222222222222222
16 the 0.1851851851851852
17 i 0.16666666666666669
18 in 0.16666666666666669
19 is 0.1481481481481481
20 learning 0.1481481481481481
21 this 0.12962962962962962
22 of 0.12962962962962962
23 it 0.12962962962962962
24 can 0.11111111111111109
25 using 0.11111111111111109
26 you 0.11111111111111109
27 on 0.11111111111111109
28 have 0.11111111111111109
29 about 0.11111111111111109
30 your 0.09259259259259259
31 deep 0.09259259259259259
32 performance 0.09259259259259259
33 've 0.09259259259259259
34 learn 0.09259259259259259
35 activation 0.07407407407407407
36 visualize 0.074074074074074

633 detour 0.018518518518518517
634 leads 0.018518518518518517
635 //… 0.018518518518518517
636 //t… 0.018518518518518517
637 //t.co/xd0tybjrof 0.018518518518518517
638 goal 0.018518518518518517
639 usgslandsat 0.018518518518518517
640 nada 0.018518518518518517
641 2017 0.018518518518518517
642 are 0.018518518518518517
643 😊👍 0.018518518518518517
644 notepad+… 0.018518518518518517
645 stone 0.018518518518518517
646 store 0.018518518518518517
647 aoixx201 0.018518518518518517
648 anything 0.018518518518518517
649 ff 0.018518518518518517
650 files.… 0.018518518518518517
651 process 0.018518518518518517
652 proceduralart 0.018518518518518517
653 americafirst150 0.018518518518518517
654 tech 0.018518518518518517
655 price 0.018518518518518517
656 they 0.018518518518518517
657 fire 0.018518518518518517
658 following 0.018518518518518517
659 doj… 0.018518518518518517
660 alburov 0.018518518518518517
661 texas 0.018518518518518517
662 l… 0.018518518518518517
663 against 0.018518518518518517
6

In [17]:
# Exporting model file to use in Chapter 07 assignment
import joblib
# Destination inside the Chapter07 data folder under the user's home.
# NOTE: the pipeline contains a BagOfWords instance defined in this notebook
# (__main__), so that class must be defined wherever the file is loaded.
chapter07_data_dir = os.path.join(
    os.path.expanduser("~"),
    'OneDrive', 'Desktop', 'Pace', 'CS619', 'Chapter07', 'Data',
)
output_filename = os.path.join(chapter07_data_dir, 'python_context.pkl')
joblib.dump(model, output_filename)

['C:\\Users\\rmart\\OneDrive\\Desktop\\Pace\\CS619\\Chapter07\\Data\\python_context.pkl']