In [10]:
import sys
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
import coremltools
import numpy as np
import json

## Importing the data

In [11]:
# Sentiment labels used throughout the notebook.
classes = ['pos', 'neg']

# Parallel containers: the i-th label describes the i-th document.
training_data, training_labels = [], []
testing_data, testing_labels = [], []

In [12]:
# Start from empty lists so that re-running this cell does not append every
# review a second time.
training_data, training_labels = [], []
testing_data, testing_labels = [], []

for label in classes:
    review_dir = os.path.join('review_polarity', 'txt_sentoken', label)
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # train/test lists identical from run to run.
    for review_name in sorted(os.listdir(review_dir)):
        with open(os.path.join(review_dir, review_name), 'r') as review_file:
            content = review_file.read()
        # Files whose name starts with 'cv4' are held out for testing;
        # every other file is used for training.
        if review_name.startswith('cv4'):
            testing_data.append(content)
            testing_labels.append(label)
        else:
            training_data.append(content)
            training_labels.append(label)

## Creating Feature Vectors

In [13]:
# Vectorizer settings, kept together so they are easy to tune:
#   min_df=5          -> drop terms found in fewer than 5 documents
#   max_df=0.8        -> drop terms found in more than 80% of documents
#   sublinear_tf=True -> scale term frequency as 1 + log(tf)
#   use_idf=True      -> weight terms by inverse document frequency
tfidf_options = {
    'min_df': 5,
    'max_df': 0.8,
    'sublinear_tf': True,
    'use_idf': True,
}
v = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary/IDF weights from the training documents only, then
# reuse that fitted vectorizer for the test documents.
training_feature_vectors = v.fit_transform(training_data)
testing_feature_vectors = v.transform(testing_data)

In [14]:
# Inspect the term -> column-index mapping learned by the vectorizer.
v.vocabulary_

{u'askew': 726,
 u'woods': 12354,
 u'hanging': 5077,
 u'woody': 12355,
 u'comically': 2197,
 u'originality': 7793,
 u'rickman': 9332,
 u'bringing': 1429,
 u'four': 4489,
 u'wooden': 12353,
 u'crotch': 2666,
 u'stereotypical': 10592,
 u'frederick': 4529,
 u'cooking': 2476,
 u'joely': 6039,
 u'designing': 3023,
 u'succumb': 10791,
 u'shocks': 9969,
 u'china': 1904,
 u'confronts': 2337,
 u'wiseguy': 12308,
 u'natured': 7407,
 u'kids': 6183,
 u'uplifting': 11809,
 u'controversy': 2446,
 u'appropriately': 638,
 u'stern': 10594,
 u'dna': 3287,
 u'catchy': 1727,
 u'insecurity': 5758,
 u'music': 7337,
 u'therefore': 11204,
 u'violently': 11975,
 u'boorman': 1301,
 u'circumstances': 1981,
 u'morally': 7237,
 u'locked': 6584,
 u'locker': 6585,
 u'gershon': 4718,
 u'deputy': 2994,
 u'wang': 12076,
 u'want': 12080,
 u'absolute': 157,
 u'travel': 11471,
 u'copious': 2489,
 u'dared': 2785,
 u'dinosaurs': 3148,
 u'wrong': 12410,
 u'subplots': 10764,
 u'sickening': 10031,
 u'18th': 21,
 u'concoction':

In [15]:
# Inspect the array of IDF weights held by the fitted vectorizer.
v.idf_

array([ 6.29887277,  4.11407071,  6.29887277, ...,  6.0111907 ,
        6.4166558 ,  6.4166558 ])

In [16]:
# Print the first test document's vector; the output lists
# (row, column) positions with their weights.
print(testing_feature_vectors[0])

  (0, 12453)	0.0469975971133
  (0, 12448)	0.0864520637292
  (0, 12427)	0.0451526429498
  (0, 12340)	0.0608668341686
  (0, 12267)	0.0336913066196
  (0, 12218)	0.0358243156147
  (0, 12202)	0.0280810353492
  (0, 12183)	0.0335623550247
  (0, 11996)	0.0740112739841
  (0, 11743)	0.132527555865
  (0, 11601)	0.0982790437449
  (0, 11511)	0.0584431123578
  (0, 11265)	0.105449737048
  (0, 11237)	0.0428791550429
  (0, 11213)	0.0267758232077
  (0, 11196)	0.0383648211069
  (0, 11187)	0.0295977392763
  (0, 11182)	0.210209656211
  (0, 11092)	0.0820841016605
  (0, 11014)	0.0700191120222
  (0, 10835)	0.066996901277
  (0, 10644)	0.033264562512
  (0, 10632)	0.0660459245036
  (0, 10516)	0.0747849104956
  (0, 10471)	0.125950314397
  :	:
  (0, 2420)	0.0871467824783
  (0, 2412)	0.0886333001106
  (0, 2370)	0.119544473746
  (0, 2195)	0.0616865322227
  (0, 1714)	0.115778862648
  (0, 1608)	0.0286811844997
  (0, 1438)	0.105228433661
  (0, 1185)	0.163988224419
  (0, 1136)	0.0429128203805
  (0, 1128)	0.0998096460898

## Training & Testing the model

In [17]:
# LinearSVC's solver uses a pseudo-random number generator; pinning
# random_state makes the fitted model (and the report below) reproducible
# across kernel restarts.
classifier = svm.LinearSVC(random_state=0)
classifier.fit(training_feature_vectors, training_labels)

# Score the model on the held-out test vectors.
prediction = classifier.predict(testing_feature_vectors)
print(classification_report(testing_labels, prediction))

             precision    recall  f1-score   support

        neg       0.89      0.86      0.87       100
        pos       0.86      0.89      0.88       100

avg / total       0.88      0.88      0.87       200



## Exporting the model

In [18]:
# Convert the trained scikit-learn classifier to a Core ML model.
coreml_model = coremltools.converters.sklearn.convert(classifier)

# Attach descriptive metadata to the converted model.
model_metadata = (
    ('author', "Sam Davies"),
    ('license', "MIT"),
    ('short_description', "SVM for sentiment analysis of movie reviews."),
)
for field_name, field_value in model_metadata:
    setattr(coreml_model, field_name, field_value)

# Write the model to disk.
coreml_model.save('MovieReviewSentiment')

## Exporting the feature weights

In [19]:
# Collect the word, column index and IDF weight of every vocabulary term.
vocab = v.vocabulary_
idf_weights = v.idf_

# Entries are sorted by column index so the exported list has a stable order
# (plain dict iteration order is not guaranteed on every Python version).
# Values are cast to built-in int/float because, depending on the
# scikit-learn / Python version, they can be NumPy scalars, and NumPy integers
# are not JSON-serialisable.
feature_list = [
    {'word': word, 'index': int(index), 'idf': float(idf_weights[index])}
    for word, index in sorted(vocab.items(), key=lambda item: item[1])
]

# Serialise only a small slice: this checks JSON-compatibility and previews
# the record layout without dumping the whole vocabulary into the output.
json.dumps(feature_list[:3])



In [20]:
# Persist the feature records as indented JSON.
with open("features.json", "w") as features_file:
    json.dump(feature_list, features_file, indent=2)

## Appendix: Computing a TF-IDF Feature Vector by Hand

In [21]:
# Vectorize one hand-written review with the already-fitted vectorizer.
sample_review = "the single worst movie i've ever seen"
test = v.transform([sample_review])

# print() shows the non-zero entries of the resulting vector.
print(test)

  (0, 12378)	0.518288536482
  (0, 11890)	0.376546750519
  (0, 10092)	0.53459075347
  (0, 9776)	0.354964292634
  (0, 7290)	0.206622202884
  (0, 3873)	0.367572958962


In [22]:
# Classify the hand-written review vectorized in the previous cell.
classifier.predict(test)

array(['neg'],
      dtype='|S3')

In [23]:
# Hand-entered term counts for the example.
# NOTE(review): five counts are listed here, while the vector printed for
# `test` has six non-zero entries -- confirm which sentence these belong to.
t = [1, 1, 1, 2, 1]

# Sublinear term-frequency scaling: 1 + ln(count).
sublin = 1 + np.log(t)
sublin

array([ 1.        ,  1.        ,  1.        ,  1.69314718,  1.        ])

In [24]:
# Column indices of the example's terms in the fitted vocabulary.
# NOTE(review): these hard-coded indices do not match the indices printed for
# `test` above (12378, 11890, ...); they look like leftovers from a different
# fit -- verify before relying on the numbers below.
idf_indices = [11924, 8959, 5268, 1127, 884]

# Unnormalised weight of each term: IDF times the sublinear term frequency.
tfidf = v.idf_[idf_indices] * sublin

# Scale the weights to unit Euclidean length.
l2_norm = np.sqrt(np.sum(np.power(tfidf, 2)))
tfidf / l2_norm

array([ 0.18708995,  0.59565906,  0.41403879,  0.34203943,  0.56724646])