In [17]:
import os
import sklearn.datasets
# Load the labelled training corpus from disk
rel_train = sklearn.datasets.load_files(container_path='datasets/rel_train/')
# Category names; predicted integer labels index into this list
rel_train.target_names

['buddhism', 'christianity', 'hinduism', 'islam', 'non-religious']

In [18]:
# Tokenization: turn the raw documents into a document-term count matrix
from sklearn.feature_extraction.text import CountVectorizer
# decode_error=u'ignore' makes the vectorizer skip bytes it cannot decode
count_vect = CountVectorizer(decode_error=u'ignore')
X_train_counts = count_vect.fit_transform(raw_documents=rel_train.data)
# (number of documents, vocabulary size)
X_train_counts.shape

(1120, 35327)

In [19]:
# Features: re-weight the raw term counts with TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
# Learn the IDF weights first, then apply them to the same count matrix
tfidf_transformer = tfidf_transformer.fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_train_tfidf.shape

(1120, 35327)

In [20]:
# Training a classifier: multinomial Naive Bayes on the TF-IDF features
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf = clf.fit(X_train_tfidf, rel_train.target)

In [21]:
# Pipeline: vectorizer -> TF-IDF -> Naive Bayes chained into one estimator,
# so raw documents can be passed straight to fit() / predict().
from sklearn.pipeline import Pipeline
# decode_error='ignore' matches the other vectorizers in this notebook, so the
# pipeline does not raise on documents containing undecodable bytes.
text_clf = Pipeline([('vect', CountVectorizer(decode_error='ignore')),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB()),])
text_clf = text_clf.fit(rel_train.data, rel_train.target)

In [22]:
# Evaluate: fraction of test documents whose predicted label equals the true one
import numpy as np
rel_test = sklearn.datasets.load_files(container_path='datasets/rel_test/')
predicted = text_clf.predict(rel_test.data)
(predicted == rel_test.target).mean()

0.625

In [23]:
# SVM: linear model trained with SGD on the hinge loss.
# This rebinds text_clf, so every later cell predicts with this pipeline.
from sklearn.linear_model import SGDClassifier
# NOTE(review): newer scikit-learn releases replace `n_iter` with
# `max_iter`/`tol` -- confirm the installed version before upgrading.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(decode_error='ignore')),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                          n_iter=5, random_state=42)),
])
text_clf = text_clf.fit(rel_train.data, rel_train.target)
predicted = text_clf.predict(rel_test.data)
# Accuracy on the test split
np.mean(rel_test.target == predicted)

0.78125

In [24]:
from sklearn import metrics
# Per-category precision / recall / F1 for the test-split predictions
print(metrics.classification_report(y_true=rel_test.target,
                                    y_pred=predicted,
                                    target_names=rel_test.target_names))
# Confusion matrix of true labels against predicted labels
metrics.confusion_matrix(y_true=rel_test.target, y_pred=predicted)

               precision    recall  f1-score   support

     buddhism       0.84      0.81      0.83        32
 christianity       0.53      0.94      0.67        32
     hinduism       1.00      0.34      0.51        32
        islam       0.97      0.97      0.97        32
non-religious       0.93      0.84      0.89        32

  avg / total       0.85      0.78      0.77       160



array([[26,  6,  0,  0,  0],
       [ 0, 30,  0,  0,  2],
       [ 5, 15, 11,  1,  0],
       [ 0,  1,  0, 31,  0],
       [ 0,  5,  0,  0, 27]])

In [25]:
# Testing test sets: count how many test documents were assigned to each category
result = {'buddhism': 0, 'christianity': 0,
         'hinduism': 0, 'islam': 0, 'non-religious': 0}
# `predicted` holds the labels for rel_test, so iterate over it directly
# instead of pairing it with the unrelated training documents.
for category in predicted:
    result[rel_train.target_names[category]] += 1
print(result)

{'hinduism': 11, 'buddhism': 31, 'islam': 32, 'non-religious': 29, 'christianity': 57}


In [26]:
# Testing bible: classify every document in the directory and print the
# percentage of documents assigned to each category.
path = 'datasets/bible/'
bible = []
for filename in os.listdir(path):
    # Read from the same directory that was listed, and close each handle
    # as soon as the document has been read.
    with open(os.path.join(path, filename), 'r') as f:
        bible.append(f.read())
predicted = text_clf.predict(bible)
result = {'buddhism': 0, 'christianity': 0,
         'hinduism': 0, 'islam': 0, 'non-religious': 0}
for category in predicted:
    result[rel_train.target_names[category]] += 1
# One prediction per document
count = len(predicted)
# The report label is the last listed file name minus its final three characters
label = filename[:-3]
print("Number of instances in " + label + " " + str(count))
print("Result of " + label)
for key, value in result.items():
    # Replace the raw tally with its percentage of all documents
    result[key] = value / float(count) * 100
    print((key, result[key]))

Number of instances in bible 677
Result of bible
('hinduism', 0.0)
('buddhism', 4.726735598227474)
('islam', 4.726735598227474)
('non-religious', 3.3973412112259975)
('christianity', 87.14918759231905)


In [27]:
# Testing democrats: classify every document in the directory and print the
# percentage of documents assigned to each category.
path = 'input/nomination_democrats/'
democrats_nomination = []
for filename in os.listdir(path):
    # Read from the same directory that was listed, and close each handle
    # as soon as the document has been read.
    with open(os.path.join(path, filename), 'r') as f:
        democrats_nomination.append(f.read())
predicted = text_clf.predict(democrats_nomination)
result = {'buddhism': 0, 'christianity': 0,
         'hinduism': 0, 'islam': 0, 'non-religious': 0}
for category in predicted:
    result[rel_train.target_names[category]] += 1
# One prediction per document
count = len(predicted)
# The report label is the last listed file name minus its final three characters
label = filename[:-3]
print("Number of instances in " + label + " " + str(count))
print("Result of " + label)
for key, value in result.items():
    # Replace the raw tally with its percentage of all documents
    result[key] = value / float(count) * 100
    print((key, result[key]))

Number of instances in democrats_nomination_speeches 93
Result of democrats_nomination_speeches
('hinduism', 0.0)
('buddhism', 2.1505376344086025)
('islam', 12.903225806451612)
('non-religious', 40.86021505376344)
('christianity', 44.086021505376344)


In [28]:
# Testing republicans: classify every document in the directory and print the
# percentage of documents assigned to each category.
path = 'input/nomination_republican//'
republican_nomination = []
for filename in os.listdir(path):
    # Read from the same directory that was listed, and close each handle
    # as soon as the document has been read.
    with open(os.path.join(path, filename), 'r') as f:
        republican_nomination.append(f.read())
predicted = text_clf.predict(republican_nomination)
result = {'buddhism': 0, 'christianity': 0,
         'hinduism': 0, 'islam': 0, 'non-religious': 0}
for category in predicted:
    result[rel_train.target_names[category]] += 1
# One prediction per document
count = len(predicted)
# The report label is the last listed file name minus its final three characters
label = filename[:-3]
print("Number of instances in " + label + " " + str(count))
print("Result of " + label)
for key, value in result.items():
    # Replace the raw tally with its percentage of all documents
    result[key] = value / float(count) * 100
    print((key, result[key]))

Number of instances in nomi_repub 121
Result of nomi_repub
('hinduism', 0.0)
('buddhism', 0.0)
('islam', 28.92561983471074)
('non-religious', 35.53719008264463)
('christianity', 35.53719008264463)


In [29]:
# Testing trump: classify every document in the directory and print the
# percentage of documents assigned to each category.
path = 'input/trump_speeches/'
trump_speeches = []
for filename in os.listdir(path):
    # Read from the same directory that was listed, and close each handle
    # as soon as the document has been read.
    with open(os.path.join(path, filename), 'r') as f:
        trump_speeches.append(f.read())
predicted = text_clf.predict(trump_speeches)
result = {'buddhism': 0, 'christianity': 0,
         'hinduism': 0, 'islam': 0, 'non-religious': 0}
for category in predicted:
    result[rel_train.target_names[category]] += 1
# One prediction per document
count = len(predicted)
# The report label is the last listed file name minus its final three characters
label = filename[:-3]
print("Number of instances in " + label + " " + str(count))
print("Result of " + label)
for key, value in result.items():
    # Replace the raw tally with its percentage of all documents
    result[key] = value / float(count) * 100
    print((key, result[key]))

Number of instances in trump 101
Result of trump
('hinduism', 0.0)
('buddhism', 4.9504950495049505)
('islam', 30.693069306930692)
('non-religious', 5.9405940594059405)
('christianity', 58.415841584158414)


In [30]:
# Testing presidents: for each sub-directory, classify its documents and
# append the per-category percentages to sou_rel_result.json.
path = '/Users/Sheon/nlp/input/presidents/'
# Entries in the top-level directory that are not per-president folders
skipped = ('.DS_Store', 'sou_rel_result.json', 'sou_result.json')

# NOTE(review): the file receives Python dict reprs separated by ', ', which
# is not valid JSON despite the extension -- confirm what the consumer expects.
# The `with` block guarantees the results are flushed and the file is closed.
with open(os.path.join(path, 'sou_rel_result.json'), 'w') as target:
    for dirname in os.listdir(path):
        pathPres = path + dirname
        # Skip the bookkeeping files and anything else that is not a directory
        if dirname in skipped or not os.path.isdir(pathPres):
            continue
        # NOTE(review): every entry in the folder is read as a document,
        # including hidden files if any are present.
        president = []
        for filename in os.listdir(pathPres):
            with open(os.path.join(pathPres, filename), 'r') as f:
                president.append(f.read())
        predicted = text_clf.predict(president)

        result = {"buddhism": 0, "christianity": 0, "hinduism": 0, 'islam': 0, 'non-religious': 0}
        for category in predicted:
            result[rel_train.target_names[category]] += 1
        # One prediction per document
        count = len(predicted)
        print(str(dirname))
        for key, value in result.items():
            # Replace the raw tally with its percentage of all documents
            result[key] = value / float(count) * 100
        target.write(str(result))
        target.write(', ')


Carter
Clinton
Coolidge
Eisenhower
FDR
Ford
Harding
Hbush
Hoover
Johnson
Kennedy
Nixon
Obama
Reagan
Taft
TDR
Truman
Wbush
WWS


In [31]:
# Testing old presidents: for each sub-directory, classify its documents and
# append the per-category percentages to sou_rel_result.json.
path = '/Users/Sheon/nlp/input/old_presidents/'
# Entries in the top-level directory that are not per-president folders
skipped = ('.DS_Store', 'sou_rel_result.json', 'sou_result.json')

# NOTE(review): the file receives Python dict reprs separated by ', ', which
# is not valid JSON despite the extension -- confirm what the consumer expects.
# The `with` block guarantees the results are flushed and the file is closed.
with open(os.path.join(path, 'sou_rel_result.json'), 'w') as target:
    for dirname in os.listdir(path):
        pathPres = path + dirname
        # Skip the bookkeeping files and anything else that is not a directory
        if dirname in skipped or not os.path.isdir(pathPres):
            continue
        # NOTE(review): every entry in the folder is read as a document,
        # including hidden files if any are present.
        president = []
        for filename in os.listdir(pathPres):
            with open(os.path.join(pathPres, filename), 'r') as f:
                president.append(f.read())
        predicted = text_clf.predict(president)
        result = {'buddhism': 0, 'christianity': 0, 'hinduism': 0, 'islam': 0, 'non-religious': 0}
        for category in predicted:
            result[rel_train.target_names[category]] += 1
        # One prediction per document
        count = len(predicted)
        print(str(dirname))
        for key, value in result.items():
            # Replace the raw tally with its percentage of all documents
            result[key] = value / float(count) * 100
        target.write(str(result))
        target.write(', ')


Adams
J.Q.Adams
Jackson
Jefferson
Madison
Monroe
Washington


In [32]:
# Testing trump speeches: classify every document, then print the raw tallies,
# the document count and the percentage per category.
path = 'input/trump_speeches/'
trump_speeches = []
for filename in os.listdir(path):
    # Read from the same directory that was listed, and close each handle
    # as soon as the document has been read.
    with open(os.path.join(path, filename), 'r') as f:
        trump_speeches.append(f.read())
predicted = text_clf.predict(trump_speeches)

result = {'buddhism': 0, 'christianity': 0, 'hinduism': 0, 'islam': 0, 'non-religious': 0}

for category in predicted:
    result[rel_train.target_names[category]] += 1
# One prediction per document
count = len(predicted)
print(result)
print("Count: " + str(count))
for key, value in result.items():
    # Replace the raw tally with its percentage of all documents
    result[key] = value / float(count) * 100
    print((key, result[key]))

{'hinduism': 0, 'buddhism': 5, 'islam': 31, 'non-religious': 6, 'christianity': 59}
Count: 101
('hinduism', 0.0)
('buddhism', 4.9504950495049505)
('islam', 30.693069306930692)
('non-religious', 5.9405940594059405)
('christianity', 58.415841584158414)


In [14]:
# Testing hillary: classify every document, then print the raw tallies,
# the document count and the percentage per category.
path = 'input/hillary/'
hillary = []
for filename in os.listdir(path):
    # Read from the same directory that was listed, and close each handle
    # as soon as the document has been read.
    with open(os.path.join(path, filename), 'r') as f:
        hillary.append(f.read())
predicted = text_clf.predict(hillary)

result = {'buddhism': 0, 'christianity': 0, 'hinduism': 0, 'islam': 0, 'non-religious': 0}

for category in predicted:
    result[rel_train.target_names[category]] += 1
# One prediction per document
count = len(predicted)
print(result)
print("Count: " + str(count))
for key, value in result.items():
    # Replace the raw tally with its percentage of all documents
    result[key] = value / float(count) * 100
    print((key, result[key]))

{'hinduism': 0, 'buddhism': 1, 'islam': 1, 'non-religious': 8, 'christianity': 91}
Count: 101
('hinduism', 0.0)
('buddhism', 0.9900990099009901)
('islam', 0.9900990099009901)
('non-religious', 7.920792079207921)
('christianity', 90.0990099009901)


In [15]:
# Testing democrats nomination: classify every document, then print the raw
# tallies, the document count and the percentage per category.
path = 'input/nomination_democrats/'
democrats_nomination = []
for filename in os.listdir(path):
    # Read from the same directory that was listed, and close each handle
    # as soon as the document has been read.
    with open(os.path.join(path, filename), 'r') as f:
        democrats_nomination.append(f.read())
predicted = text_clf.predict(democrats_nomination)

result = {'buddhism': 0, 'christianity': 0, 'hinduism': 0, 'islam': 0, 'non-religious': 0}

for category in predicted:
    result[rel_train.target_names[category]] += 1
# One prediction per document
count = len(predicted)
print(result)
print("Count: " + str(count))
for key, value in result.items():
    # Replace the raw tally with its percentage of all documents
    result[key] = value / float(count) * 100
    print((key, result[key]))

{'hinduism': 0, 'buddhism': 2, 'islam': 12, 'non-religious': 38, 'christianity': 41}
Count: 93
('hinduism', 0.0)
('buddhism', 2.1505376344086025)
('islam', 12.903225806451612)
('non-religious', 40.86021505376344)
('christianity', 44.086021505376344)


In [16]:
# Testing republican nomination: classify every document, then print the raw
# tallies, the document count and the percentage per category.
path = 'input/nomination_republican//'
republican_nomination = []
for filename in os.listdir(path):
    # Read from the same directory that was listed, and close each handle
    # as soon as the document has been read.
    with open(os.path.join(path, filename), 'r') as f:
        republican_nomination.append(f.read())
predicted = text_clf.predict(republican_nomination)

result = {'buddhism': 0, 'christianity': 0, 'hinduism': 0, 'islam': 0, 'non-religious': 0}

for category in predicted:
    result[rel_train.target_names[category]] += 1
# One prediction per document
count = len(predicted)
print(result)
print("Count: " + str(count))
for key, value in result.items():
    # Replace the raw tally with its percentage of all documents
    result[key] = value / float(count) * 100
    print((key, result[key]))

{'hinduism': 0, 'buddhism': 0, 'islam': 35, 'non-religious': 43, 'christianity': 43}
Count: 121
('hinduism', 0.0)
('buddhism', 0.0)
('islam', 28.92561983471074)
('non-religious', 35.53719008264463)
('christianity', 35.53719008264463)
