In [1]:
import spacy
from sklearn.base import BaseEstimator, TransformerMixin

# Create a spaCy parser.
# The 'en' shortcut only exists in spaCy 2.x; spaCy 3 removed shortcut names
# and raises OSError for them, so fall back to the full package name.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')


class BagOfWords(TransformerMixin, BaseEstimator):
    """Turn raw documents into binary bag-of-words feature dicts.

    Every document is tokenised with the module-level spaCy pipeline ``nlp``
    and mapped to ``{token_text: True}`` for each token whose text is not
    purely whitespace.

    ``BaseEstimator`` is inherited (after the mixin, as scikit-learn
    recommends) so the transformer exposes ``get_params``/``set_params`` and
    can be cloned and inspected like any other pipeline step.
    """

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, ``self`` is returned."""
        return self

    def transform(self, X):
        """Tokenise each document in ``X``.

        Parameters
        ----------
        X : iterable of str
            Raw documents.

        Returns
        -------
        list of dict
            One ``{token_text: True}`` dict per document, in input order.
        """
        return [
            # Skip tokens that consist only of whitespace (e.g. newlines).
            {token.text: True for token in nlp(document) if token.text.strip()}
            for document in X
        ]

In [2]:
from sklearn.feature_extraction import DictVectorizer

In [3]:
from sklearn.naive_bayes import BernoulliNB

In [4]:
import os
# Join the folder components one by one: a literal such as "Documents\Masters"
# contains invalid escape sequences (\M, \S, \D, \A, \W) and only forms a
# valid path on Windows. os.path.join inserts the right separator everywhere.
data_folder = os.path.join(
    os.path.expanduser("~"),
    "Documents", "Masters", "Spring2019", "Data Mining", "Assignments", "Week7",
)
input_filename = os.path.join(data_folder, "python_tweets.json")
labels_filename = os.path.join(data_folder, "python_classes.json")

In [5]:
import json

# The tweets file holds one JSON object per line; keep only each tweet's text.
# Read as UTF-8 explicitly so the result does not depend on the platform's
# default text encoding.
tweets = []
with open(input_filename, encoding="utf-8") as inf:
    for line in inf:
        if not line.strip():
            # Blank separator line, nothing to parse.
            continue
        tweets.append(json.loads(line)['text'])

# The labels file is a single JSON document.
with open(labels_filename, encoding="utf-8") as inf:
    labels = json.load(inf)

# Ensure only classified tweets are loaded
tweets = tweets[:len(labels)]
# Fails when there are more labels than tweets.
assert len(tweets) == len(labels)

In [6]:
from sklearn.pipeline import Pipeline

# Tokenise each tweet into a feature dict, vectorise the dicts, then classify
# with Bernoulli naive Bayes.
pipeline = Pipeline([
    ('bag-of-words', BagOfWords()),
    ('vectorizer', DictVectorizer()),
    ('naive-bayes', BernoulliNB()),
])

In [7]:
from sklearn.model_selection import cross_val_score
import numpy as np

# 3-fold cross-validated F1 score; error_score='raise' makes a failing fold
# raise immediately instead of being recorded as a score.
scores = cross_val_score(
    pipeline,
    tweets,
    labels,
    scoring='f1',
    cv=3,
    error_score='raise',
)

# Report the average of the per-fold scores.
mean_score = np.mean(scores)
print("Score: {:.3f}".format(mean_score))

Score: 0.778


In [8]:
# Fit the pipeline on all labelled tweets; the cells below inspect its steps
# through `model`.
model = pipeline.fit(tweets, labels)

In [9]:
# Pull the fitted classifier out of the pipeline by its step name.
nb = model.named_steps['naive-bayes']
# Per-class log-probabilities of each feature, as learned by the classifier
# (one row per class, one column per feature).
feature_probabilities = nb.feature_log_prob_

In [10]:
# Row 1 holds the log-probabilities for the classifier's second class.
# Negating before argsort orders the feature indices from most to least
# probable; keep the first 50.
class1_log_probs = nb.feature_log_prob_[1]
descending_order = np.argsort(-class1_log_probs)
top_features = descending_order[:50]

In [11]:
# Fitted vectorizer step; used below to map feature indices back to names.
dv = model.named_steps['vectorizer']

In [12]:
# Print rank, feature name and probability for each top feature of class
# row 1. np.exp turns the stored log-probability back into a probability.
for i, feature_index in enumerate(top_features):
    feature_name = dv.feature_names_[feature_index]
    probability = np.exp(feature_probabilities[1][feature_index])
    print(i, feature_name, probability)

0 … 0.7377049180327867
1 : 0.7049180327868851
2 RT 0.6229508196721311
3 # 0.5737704918032785
4 Python 0.540983606557377
5 a 0.3442622950819672
6 to 0.3278688524590163
7 python 0.3278688524590163
8 . 0.3278688524590163
9 and 0.2786885245901639
10 , 0.2786885245901639
11 with 0.24590163934426226
12 in 0.24590163934426226
13 the 0.21311475409836064
14 - 0.18032786885245902
15 you 0.1475409836065574
16 I 0.1475409836065574
17 that 0.1475409836065574
18 django 0.1475409836065574
19 of 0.1475409836065574
20 ? 0.1311475409836065
21 your 0.1311475409836065
22 by 0.11475409836065574
23 out 0.11475409836065574
24 for 0.11475409836065574
25 Django 0.11475409836065574
26 is 0.11475409836065574
27 AI 0.0983606557377049
28 n't 0.0983606557377049
29 it 0.0983606557377049
30 MachineLearning 0.0983606557377049
31 are 0.0983606557377049
32 If 0.0983606557377049
33 ! 0.0983606557377049
34 Authentication 0.0819672131147541
35 @JordanIrabor 0.0819672131147541
36 dash 0.0819672131147541
37 just 0.0819672131

In [13]:
import json
import os

# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; use the standalone joblib package and fall back to the vendored copy
# only on old installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Join the folder components one by one so the path has no backslash escape
# sequences and uses the right separator on every platform.
output_filename = os.path.join(
    os.path.expanduser("~"),
    "Documents", "Masters", "Spring2019", "Data Mining", "Assignments", "Week7",
    "python_context.pkl",
)
# Persist the fitted pipeline; the list of written file names is the cell output.
joblib.dump(model, output_filename)


['C:\\Users\\sheth\\Documents\\Masters\\Spring2019\\Data Mining\\Assignments\\Week7\\python_context.pkl']