In [55]:
import os
import nltk
import math
import random
import collections

import itertools
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures

In [56]:
parent = os.path.dirname(os.path.dirname(os.path.abspath('__file__')))
parent = parent.split('/')
parent.remove(parent[-1])
parent = '/'.join(parent)
categories = ['entertainment', 'sports', 'fun', 'games', 'weather', 'science', 'technology', 'politics']

In [57]:
def fileids(category):
    path = os.path.join(parent, 'corpus', 'processed', category)
    return os.listdir(path)

def words(file):
    f = open(file, 'r', encoding='ISO-8859-1').read().strip()
    sents = [sent.split(' ') for sent in f.split('\n')]
    words = [word for sent in sents for word in sent if len(word) > 0]
    return words

In [58]:
cfd = nltk.ConditionalFreqDist()

documents, total_words = [], []

for category in categories:
    
    for fileid in fileids(category):
        if fileid == '.DS_Store':
            continue
        path = os.path.join(parent, 'corpus', 'processed',
                            category, fileid)
        
        # for each category, increment the amount of times
        # a single word appears in it
        fileid_words = words(path)
        
        for word in fileid_words:
            cfd[category][word] += 1 
        
        documents.append((fileid_words, category))
        total_words.extend(fileid_words)

fd = nltk.FreqDist([w.lower() for w in total_words])
    
print('Word count:', len(total_words))
#random.shuffle(documents)

Word count: 4110411


In [59]:
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))

for condition in cfd.conditions():
    print(condition)
    features = sorted(cfd[condition], key=lambda x: cfd[condition][x], reverse=True)
    sig_features = [x for x in features if x not in stop]
    
    top_50 = sig_features[:10]
    for item in top_50:
        count = cfd[condition][item]
        print('{} {}'.format(item, count))
    print()

entertainment
new 3085
video 2165
#oscars 1603
says 1436
star 1427
watch 1371
best 1302
trump 1260
first 1165
see 1043

games
new 3083
available 2007
game 1926
2 1696
steam 1679
games 1546
get 1492
#steamnewrelease 1455
ps4 1342
one 1339

weather
rain 4539
tornado 3636
pm 2888
weather 2851
snow 2709
day 2704
cst 2668
forecast 2398
today 2389

science
new 3761
science 2390
scientists 1982
via 1862
could 1669
may 1367
us 1277
space 1255
first 1157
one 1047

sports
game 1892
win 1418
new 1201
team 1171
vs 1129
via 1105
nfl 1097
one 1081
first 1076
season 1070

politics
trump 9123
says 3440
trump's 2789
brexit 2277
president 2122
new 2092
donald 1993
house 1975
#hw 1680
may 1616

technology
new 2941
#cdntech 2448
data 1827
google 1722
apple 1625
tech 1450
facebook 1266
app 1126
says 908
big 849

fun
like 2683
i'm 2225
it's 2120
people 1905
green 1905
don't 1749
get 1603
one 1567
blue 1315
u 1224



In [17]:
bigram_finder = BigramCollocationFinder.from_words(total_words)
bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
d = {bigram: True for bigram in bigrams}

In [18]:
list(d.keys())[:10]

[('tia', 'mowry'),
 ('morning.northern', '#ny'),
 ('2(alice',
  'experiment)-&gt;3-&gt;4-&gt;5(cms)-&gt;6-&gt;7-&gt;8(lhcb)-&gt;point1'),
 ("@andersoncooper's", 'ridiculist'),
 ('no.12', '@uvamenshoops'),
 ('pretentiously', "hawai'i"),
 ('jconnelly', 'nportman'),
 ('on-again', 'off-again'),
 ('jyaysi', 'desai'),
 ('tedxamsterdam', 'schiphol')]

In [5]:
all_words = set(nltk.bigrams(total_words))
word_features = [' '.join(x) for x in list(all_words)]
#all_words = set(total_words)
#word_features = list(all_words)
random.shuffle(word_features)
word_features = word_features[:2000]
print('Feature size:', len(all_words))

Feature size: 1617552


In [6]:
def features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

def bigram_features(words, score_fn=BigramAssocMeasures.chi_sq, n=2000):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(' '.join(ngram), True) for ngram in itertools.chain(words, bigrams)])

In [13]:
d = [documents[1], documents[2]]
t = [(bigram_features(d), c) for (d, c) in d]

In [30]:
print(len(t[0][0]), len(t[1][0]))
print(len(documents[1][0]), len(documents[2][0]))

print(list(t[1][0].keys())[:50])
print(t[1][0]['raddatz cooper'])

8344 10281
29290 30350
['raddatz cooper', 'holocaust survivor', 's h o w i n g', 't h i n g', '@ e a t t h e w o r l d t v', 'u n d e n i a b l e', 'gretchen carlson', 'a l l e g e d', 'a n d e r s o n', 'n o v e l i s t s', "cookie's wardrobe", '# s o m e t h i n g r o t t e n', 't a l e s', 't h e a t r i c a l', '# t h e n i g h t o f', 'h o l d s', 'g o s p e l', 'u n u s u a l', 'g r e a t', '# b i r b i g l i a', 'b u i l d s', 'v o t e', 'e m b a r r a s s e d', 's l a y', '# r o n g o l d m a n', '# l i n - m a n u e l', '@ j l o', '# m a n b o o k e r p r i z e', 'l i v e - a c t i o n', 'i n v o l v e', 'cry @5xpaz', 'r e c o v e r y', '8 . 8 m', 'f i f t y', 't w o', '# c u r i o u s g e o r g e', 'w e a p o n', 'f a t a l', 'fiennes richard', '# m a c k l e m o r e', '# n a s', 'r u p t u r e', 'r i s i n g', '# z e n d a y a', '@susansarandon @mrdannyglover', '@ j o h n c e n a', 'c d', '@5xpaz #nickcannon', 'l i k e s', 'n a w l i n s']
True


In [129]:
feature_sets = [(bigram_features(d), c) for (d, c) in documents]
random.shuffle(feature_sets)
print('Documents:', len(feature_sets))
cutoff = math.ceil(len(feature_sets) * 0.7)
train_set, test_set = feature_sets[:cutoff], feature_sets[cutoff:]

Documents: 163


In [130]:
print('Training Set')
training_counts = [y for (x, y) in train_set]
for k, v in collections.Counter(training_counts).items():
    print(k, v)
print()
print('Test Set')
test_counts = [y for (x, y) in test_set]
for k, v in collections.Counter(test_counts).items():
    print(k, v)

Training Set
entertainment 17
games 13
science 12
fun 16
sports 13
weather 13
politics 16
technology 15

Test Set
technology 5
politics 4
games 8
science 8
fun 5
weather 6
entertainment 4
sports 8


In [131]:
def display(num):
    return '{0:.2f}'.format(num)

In [132]:
from nltk.classify import accuracy
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.linear_model import LogisticRegression

classifier = SklearnClassifier(MultinomialNB())
classifier.train(train_set)
acc = accuracy(classifier, test_set)
print('Multinomial NB: {}%'.format(display(acc * 100)))

classifier = SklearnClassifier(NuSVC())
classifier.train(train_set)
acc = accuracy(classifier, test_set)
print('NuSVC: {}%'.format(display(acc * 100)))

classifier = SklearnClassifier(LogisticRegression())
classifier.train(train_set)
print('LogisticRegression: {}%'.format(display(acc * 100)))

Multinomial NB: 81.25%
NuSVC: 93.75%
LogisticRegression: 93.75%


In [133]:
probs = classifier.prob_classify(features("take a look at the latest wind chills mostly teens in the city don't forget the hat scarf and gloves today".split(' ')))
print('max:', probs.max())
for sample in probs.samples():
    print(sample, probs.prob(sample))

max: sports
science 0.120813609397
weather 0.121269114468
politics 0.120959745752
entertainment 0.13126598441
games 0.121839852946
technology 0.120982251026
sports 0.13149354798
fun 0.131375894022
