In [2]:
import os
import sklearn.datasets
phi_train = sklearn.datasets.load_files('datasets/phi_train/')
# Name of categories
phi_train.target_names

['authoritarian', 'democratic', 'utilitarianism', 'virtue']

In [3]:
# Tokenization
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(decode_error=u'ignore')
X_train_counts = count_vect.fit_transform(phi_train.data)
X_train_counts.shape

(840, 27724)

In [4]:
# Features
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(840, 27724)

In [5]:
# Training a classifier
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, phi_train.target)

In [6]:
# Pipeline
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer(decode_error='ignore')),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB()),])
text_clf = text_clf.fit(phi_train.data, phi_train.target)

In [7]:
# Evaluate: Predict accuracy
import numpy as np
phi_test = sklearn.datasets.load_files('datasets/phi_test/')
predicted = text_clf.predict(phi_test.data)
np.mean(predicted == phi_test.target)

0.79333333333333333

In [8]:
# SVM
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer(decode_error='ignore')),
                      ('tfidf', TfidfTransformer()),
                      ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                            alpha=1e-3, n_iter=5, random_state=42))
                     ])
text_clf = text_clf.fit(phi_train.data, phi_train.target)
predicted = text_clf.predict(phi_test.data)
np.mean(predicted == phi_test.target)

0.84666666666666668

In [9]:
from sklearn import metrics
print(metrics.classification_report(phi_test.target, predicted, 
                                    target_names=phi_test.target_names))
metrics.confusion_matrix(phi_test.target, predicted)

                precision    recall  f1-score   support

 authoritarian       0.57      0.93      0.71        30
    democratic       1.00      0.67      0.80        30
utilitarianism       1.00      0.82      0.90        60
        virtue       0.94      1.00      0.97        30

   avg / total       0.90      0.85      0.85       150



array([[28,  0,  0,  2],
       [10, 20,  0,  0],
       [11,  0, 49,  0],
       [ 0,  0,  0, 30]])

In [10]:
# Testing test sets
result = {'utilitarianism' : 0, 'authoritarian': 0, 'democratic': 0,
         'virtue': 0}
for instance, category in zip(phi_test.data, predicted):
    result[phi_test.target_names[category]] += 1
print result

{'utilitarianism': 49, 'democratic': 20, 'authoritarian': 49, 'virtue': 32}


In [11]:
# Testing bible
path = 'datasets/bible/'
root = '/Users/Sheon/nlp/datasets/bible/'
bible = []
for filename in os.listdir(path):
    f = open(os.path.join(root, filename), 'r')
    bible.append(f.read())
predicted = text_clf.predict(bible)
result = {'utilitarianism': 0, 'authoritarian': 0, 'democratic': 0,
         'virtue': 0}
count = 0
for instance, category in zip(bible, predicted):
    result[phi_train.target_names[category]] += 1
    count += 1
print "Number of instances in " + filename[:-3] + " " + str(count)
print "Result of " + filename[:-3]
for key, value in result.items():
    result[key] = value / float(count) * 100
    print (key, result[key])

Number of instances in bible 677
Result of bible
('utilitarianism', 2.2156573116691285)
('democratic', 0.0)
('authoritarian', 74.44608567208272)
('virtue', 23.338257016248154)


In [12]:
# testing Obama
path = 'datasets/obama/'
root = '/Users/Sheon/nlp/datasets/obama/'
obama = []
for filename in os.listdir(path):
    f = open(os.path.join(root, filename), 'r')
    obama.append(f.read())
print len(obama)
predicted = text_clf.predict(obama)
# Count

result = {'utilitarianism': 0, 'authoritarian': 0, 'democratic': 0,
         'virtue': 0}
count = 0
for instance, category in zip(obama, predicted):
    result[phi_train.target_names[category]] += 1
    count += 1
print result
print "Obama"
for key, value in result.items():
    result[key] = value / float(count) * 100
    print (key, result[key])

59
{'utilitarianism': 2, 'democratic': 0, 'authoritarian': 44, 'virtue': 13}
Obama
('utilitarianism', 3.389830508474576)
('democratic', 0.0)
('authoritarian', 74.57627118644068)
('virtue', 22.033898305084744)


In [13]:
# testing Bush
path = 'datasets/bush/'
root = '/Users/Sheon/nlp/datasets/bush/'
bush = []
for filename in os.listdir(path):
    f = open(os.path.join(root, filename), 'r')
    bush.append(f.read())
print len(bush)
predicted = text_clf.predict(bush)

result = {'utilitarianism': 0, 'authoritarian': 0, 'democratic': 0,
         'virtue': 0}
count = 0
for instance, category in zip(bush, predicted):
    result[phi_train.target_names[category]] += 1
    count += 1
print result
print "Bush"
print "Count: " + str(count)
for key, value in result.items():
    result[key] = value / float(count) * 100
    print (key, result[key])

47
{'utilitarianism': 1, 'democratic': 0, 'authoritarian': 28, 'virtue': 18}
Bush
Count: 47
('utilitarianism', 2.127659574468085)
('democratic', 0.0)
('authoritarian', 59.57446808510638)
('virtue', 38.297872340425535)


In [14]:
# testing Wolf of Wall Street
path = 'datasets/ws/'
root = '/Users/Sheon/nlp/datasets/ws/'
wolf = []
for filename in os.listdir(path):
    f = open(os.path.join(root, filename), 'r')
    wolf.append(f.read())
predicted = text_clf.predict(wolf)

result = {'utilitarianism': 0, 'authoritarian': 0, 'democratic': 0,
         'virtue': 0}
count = 0
for instance, category in zip(wolf, predicted):
    result[phi_train.target_names[category]] += 1
    count += 1
print result
print "Count: " + str(count)
for key, value in result.items():
    result[key] = value / float(count) * 100
    print (key, result[key])

{'utilitarianism': 281, 'democratic': 0, 'authoritarian': 1, 'virtue': 12}
Count: 294
('utilitarianism', 95.578231292517)
('democratic', 0.0)
('authoritarian', 0.3401360544217687)
('virtue', 4.081632653061225)


In [15]:
# testing democrats nomination
path = 'input/nomination_democrats/'
root = '/Users/Sheon/nlp/input/nomination_democrats/'
democrats_nomination = []
for filename in os.listdir(path):
    f = open(os.path.join(root, filename), 'r')
    democrats_nomination.append(f.read())
predicted = text_clf.predict(democrats_nomination)

result = {'utilitarianism': 0, 'authoritarian': 0, 'democratic': 0,
         'virtue': 0}
count = 0
for instance, category in zip(democrats_nomination, predicted):
    result[phi_train.target_names[category]] += 1
    count += 1
print result
print "Count: " + str(count)
for key, value in result.items():
    result[key] = value / float(count) * 100
    print (key, result[key])

{'utilitarianism': 1, 'democratic': 1, 'authoritarian': 52, 'virtue': 39}
Count: 93
('utilitarianism', 1.0752688172043012)
('democratic', 1.0752688172043012)
('authoritarian', 55.91397849462365)
('virtue', 41.935483870967744)


In [16]:
# testing republican nomination
path = 'input/nomination_republican//'
root = '/Users/Sheon/nlp/input/nomination_republican/'
republican_nomination = []
for filename in os.listdir(path):
    f = open(os.path.join(root, filename), 'r')
    republican_nomination.append(f.read())
predicted = text_clf.predict(republican_nomination)

result = {'utilitarianism': 0, 'authoritarian': 0, 'democratic': 0,
         'virtue': 0}
count = 0
for instance, category in zip(republican_nomination, predicted):
    result[phi_train.target_names[category]] += 1
    count += 1
print result
print "Count: " + str(count)
for key, value in result.items():
    result[key] = value / float(count) * 100
    print (key, result[key])

{'utilitarianism': 0, 'democratic': 6, 'authoritarian': 68, 'virtue': 47}
Count: 121
('utilitarianism', 0.0)
('democratic', 4.958677685950414)
('authoritarian', 56.19834710743802)
('virtue', 38.84297520661157)


In [17]:
# testing trump nomination
path = 'input/trump_speeches/'
root = '/Users/Sheon/nlp/input/trump_speeches/'
trump_speeches = []
for filename in os.listdir(path):
    f = open(os.path.join(root, filename), 'r')
    trump_speeches.append(f.read())
predicted = text_clf.predict(trump_speeches)

result = {'utilitarianism': 0, 'authoritarian': 0, 'democratic': 0,
         'virtue': 0}
count = 0
for instance, category in zip(trump_speeches, predicted):
    result[phi_train.target_names[category]] += 1
    count += 1
print result
print "Count: " + str(count)
for key, value in result.items():
    result[key] = value / float(count) * 100
    print (key, result[key])

{'utilitarianism': 18, 'democratic': 0, 'authoritarian': 53, 'virtue': 30}
Count: 101
('utilitarianism', 17.82178217821782)
('democratic', 0.0)
('authoritarian', 52.475247524752476)
('virtue', 29.7029702970297)


In [21]:
# testing hillary nomination
path = 'input/hillary/'
root = '/Users/Sheon/nlp/input/hillary/'
hillary = []
for filename in os.listdir(path):
    f = open(os.path.join(root, filename), 'r')
    hillary.append(f.read())
predicted = text_clf.predict(hillary)

result = {'utilitarianism': 0, 'authoritarian': 0, 'democratic': 0,
         'virtue': 0}
count = 0
for instance, category in zip(hillary, predicted):
    result[phi_train.target_names[category]] += 1
    count += 1
print result
print "Count: " + str(count)
for key, value in result.items():
    result[key] = value / float(count) * 100
    print (key, result[key])

<type 'list'>
101
{'utilitarianism': 4, 'democratic': 0, 'authoritarian': 30, 'virtue': 67}
Count: 101
('utilitarianism', 3.9603960396039604)
('democratic', 0.0)
('authoritarian', 29.7029702970297)
('virtue', 66.33663366336634)


In [73]:
# testing democrats nomination
path = '/Users/Sheon/nlp/input/presidents/'
target = open('/Users/Sheon/nlp/input/presidents/sou_result.json', 'w')

for dirname in os.listdir(path):
    if (dirname == ".DS_Store" or dirname == "sou_result.json"): continue 
    pathPres = path + dirname
    president = []
    for filename in os.listdir(pathPres):
        f = open(os.path.join(pathPres, filename), 'r')
        president.append(f.read())
    predicted = text_clf.predict(president)
    
    result = {'utilitarianism': 0, 'authoritarian': 0, 'democratic': 0, 'virtue': 0}
    count = 0
    for instance, category in zip(president, predicted):
        result[phi_train.target_names[category]] += 1
        count += 1
    print str(dirname)
    # print result
    # print "Count: " + str(count)
    for key, value in result.items():
        result[key] = value / float(count) * 100
        #print (key, result[key])
    target.write(str(result))
    target.write(', ')
    # print result


Carter
Clinton
Coolidge
Eisenhower
FDR
Ford
Harding
Hbush
Hoover
Johnson
Kennedy
Nixon
Obama
Reagan
Taft
TDR
Truman
Wbush
WWS


In [77]:
# testing democrats nomination
path = '/Users/Sheon/nlp/input/old_presidents//'
target = open('/Users/Sheon/nlp/input/old_presidents/sou_result.json', 'w')

for dirname in os.listdir(path):
    if (dirname == ".DS_Store" or dirname == "sou_result.json"): continue 
    pathPres = path + dirname
    president = []
    for filename in os.listdir(pathPres):
        f = open(os.path.join(pathPres, filename), 'r')
        president.append(f.read())
    predicted = text_clf.predict(president)
    
    result = {'utilitarianism': 0, 'authoritarian': 0, 'democratic': 0, 'virtue': 0}
    count = 0
    for instance, category in zip(president, predicted):
        result[phi_train.target_names[category]] += 1
        count += 1
    print str(dirname)
    # print result
    # print "Count: " + str(count)
    for key, value in result.items():
        result[key] = value / float(count) * 100
        #print (key, result[key])
    target.write(str(result))
    target.write(', ')
    # print result


Adams
J.Q.Adams
Jackson
Jefferson
Madison
Monroe


OSError: [Errno 20] Not a directory: '/Users/Sheon/nlp/input/old_presidents//sou_rel_result.json'

In [76]:
target.write('hi')