In [1]:
import os
import nltk
import math
import random
import itertools
import collections
from nltk.classify import accuracy
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import NuSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [2]:
# Root directory of the project checkout. Set the NICHE_ROOT environment
# variable to point somewhere else; when it is unset the original author's
# local path is used, so existing setups keep working unchanged.
PARENT = os.environ.get('NICHE_ROOT', '/Users/shreydesai/GitHub/niche')

# Topic labels. fileids() expects one directory per label under
# <PARENT>/corpus/processed/.
CATEGORIES = ['entertainment', 'sports', 'fun', 'games',
              'weather', 'science', 'technology', 'politics']

In [3]:
def fileids(category):
    """Get file IDs of tweets for specified category.

    Lists the entries of ``<PARENT>/corpus/processed/<category>`` and returns
    their names in sorted order. ``os.listdir`` gives no ordering guarantee,
    so sorting keeps the corpus order stable across runs and machines.

    Every directory entry is returned, including non-corpus files such as
    ``.DS_Store``; callers are expected to filter those out.
    """
    path = os.path.join(PARENT, 'corpus', 'processed', category)
    return sorted(os.listdir(path))

def sents(file):
    """Get list of sentences from a document.

    Args:
        file: Path of the document to read; decoded as ISO-8859-1.

    Returns:
        One string per line of the document, each stripped of surrounding
        whitespace. Leading and trailing whitespace of the whole document is
        removed before splitting, so an empty document yields ``['']``.
    """
    # The context manager closes the handle as soon as the text is read,
    # instead of leaving it open until garbage collection.
    with open(file, 'r', encoding='ISO-8859-1') as handle:
        text = handle.read().strip()
    return [line.strip() for line in text.split('\n')]

def words(file):
    """Get list of words from a document.

    Args:
        file: Path of the document to read; decoded as ISO-8859-1.

    Returns:
        Flat list of tokens in document order. Each line is split on single
        space characters only, and the empty strings produced by consecutive
        spaces are dropped. Other whitespace (e.g. tabs) stays inside tokens.
    """
    # The context manager closes the handle as soon as the text is read,
    # instead of leaving it open until garbage collection.
    with open(file, 'r', encoding='ISO-8859-1') as handle:
        text = handle.read().strip()
    return [token
            for line in text.split('\n')
            for token in line.split(' ')
            if len(token) > 0]

In [7]:
# Load the corpus: one (tokens, category) pair per file, plus a flat list of
# every token seen across all categories.
documents = []
total_words = []

corpus_root = os.path.join(PARENT, 'corpus', 'processed')
for category in CATEGORIES:
    for fileid in fileids(category):
        if fileid != '.DS_Store':  # skip the macOS folder-metadata file
            path = os.path.join(corpus_root, category, fileid)
            w = words(path)
            documents.append((w, category))
            total_words += w

print('Word count:', len(total_words))
print(total_words[:10])

Word count: 4110411
['meghan', "markle's", 'acting', 'past', 'comes', 'to', 'light', 'see', 'her', 'raunchy']


In [10]:
# Peek at the first (tokens, category) pair to sanity-check the corpus load.
documents[0]

(['meghan',
  "markle's",
  'acting',
  'past',
  'comes',
  'to',
  'light',
  'see',
  'her',
  'raunchy',
  '90210',
  'scene',
  'zendaya',
  'talks',
  'about',
  'her',
  'growing',
  'fashion',
  'empire',
  'and',
  'the',
  'importance',
  'of',
  'inclusivity',
  'this',
  'is',
  'us',
  'milo',
  'ventimiglia',
  'mandy',
  'moore',
  'on',
  'that',
  'finale',
  'cliffhanger',
  'beauty',
  'and',
  'the',
  'beast',
  'becomes',
  "fandango's",
  'top',
  'family',
  'pre-seller',
  'angelina',
  'jolie',
  'gives',
  'her',
  'first',
  'college',
  'lecture',
  'discusses',
  'refugee',
  "women's",
  'rights',
  'gabourey',
  'sidibe',
  'says',
  "she's",
  'read',
  'scripts',
  'where',
  'her',
  'character',
  'was',
  'literally',
  'referred',
  'to',
  'as',
  'hippo',
  'dwyane',
  'wade',
  'drops',
  'brand',
  'new',
  'workout',
  'apparel',
  'line',
  'mission',
  'bachelor',
  'director',
  'goes',
  'behind',
  'the',
  'scenes',
  'of',
  "nick's",
 

In [8]:
def features(document, vocabulary=None):
    """Builds the word-presence features for one document.

    Args:
        document: Iterable of word tokens making up the document.
        vocabulary: Optional iterable of words to test for. When omitted, the
            module-level ``word_features`` list is used, which is the
            original behaviour.

    Returns:
        Dict mapping ``'contains(<word>)'`` to ``True``/``False`` for every
        vocabulary word, inserted in vocabulary order.
    """
    if vocabulary is None:
        # Resolved at call time, so word_features may be assigned after this
        # function is defined.
        vocabulary = word_features
    document_words = set(document)  # set gives constant-time membership tests
    return {'contains({})'.format(word): (word in document_words)
            for word in vocabulary}

# Tunables for feature selection and the train/test split.
SEED = 42
NUM_WORD_FEATURES = 2000
TRAIN_FRACTION = 0.7

# A dedicated seeded generator makes the sampled vocabulary and the split
# repeatable (for a given document order) without touching the global
# `random` state.
rng = random.Random(SEED)

vocab = set(total_words)
# Iteration order of a set of strings changes between interpreter runs
# (hash randomisation), so sort first to give the shuffle a fixed start.
word_features = sorted(vocab)
rng.shuffle(word_features)
word_features = word_features[:NUM_WORD_FEATURES]
print(word_features[:10])

feature_sets = [(features(d), c) for (d, c) in documents]
rng.shuffle(feature_sets)
# First TRAIN_FRACTION of the shuffled data (rounded up) trains; the rest tests.
cutoff = math.ceil(len(feature_sets) * TRAIN_FRACTION)
train_set, test_set = feature_sets[:cutoff], feature_sets[cutoff:]

['game-ending', 'juror', 'deficiency', 'arrival-like', 'bruges', "effect's", 'isis-style', 'snow/cold', 'stargazer', 'mandemant']


In [6]:
def display(num):
    """Render a number as a string with exactly two decimal places."""
    return format(num, '.2f')

def run_classifier(name, classifier, train_set, test_set):
    """Train a scikit-learn estimator and report how it does on held-out data.

    Args:
        name: Label printed in front of the accuracy figure.
        classifier: scikit-learn estimator; it is wrapped in an NLTK
            ``SklearnClassifier`` and trained here.
        train_set: List of ``(featureset, label)`` pairs to train on.
        test_set: List of ``(featureset, label)`` pairs to evaluate on.

    Prints the accuracy as a percentage with two decimals, followed by the
    confusion matrix. Returns nothing.
    """
    clf = SklearnClassifier(classifier)
    clf.train(train_set)

    # Classify the test set once and reuse the predictions for both reports.
    reference = [label for (_, label) in test_set]
    predicted = clf.classify_many([fs for (fs, _) in test_set])

    # general classifier accuracy (0 for an empty test set)
    correct = sum(1 for ref, pred in zip(reference, predicted) if ref == pred)
    acc = correct / len(reference) if reference else 0
    print('{}: {}%'.format(name, display(acc * 100)))

    # confusion matrix: ConfusionMatrix takes the true labels first and the
    # predictions second, so rows are the true category and columns are what
    # the classifier answered.
    cm = nltk.ConfusionMatrix(reference, predicted)
    print(cm.pretty_format(show_percents=True))

# Evaluate each model on the same split, in order:
# Naive Bayes, SVM, Logistic Regression.
models = [
    ('Multinomial NB', MultinomialNB()),
    ('NuSVC', NuSVC()),
    ('LogisticRegression', LogisticRegression()),
]

for position, (model_name, estimator) in enumerate(models):
    if position > 0:
        print()  # blank line between consecutive reports
    run_classifier(model_name, estimator, train_set, test_set)

Multinomial NB: 82.98%
              |      e                                                  |
              |      n                                                  |
              |      t                                                  |
              |      e                                         t        |
              |      r                                         e        |
              |      t                    p                    c        |
              |      a                    o      s             h      w |
              |      i                    l      c      s      n      e |
              |      n             g      i      i      p      o      a |
              |      m             a      t      e      o      l      t |
              |      e      f      m      i      n      r      o      h |
              |      n      u      e      c      c      t      g      e |
              |      t      n      s      s      e      s      y      r |
--------------+