In [1]:
import sys
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
import coremltools
import numpy as np
import json

In [2]:
# Sentiment labels, followed by the (initially empty) containers that will
# hold the raw review texts and their labels for each split.
classes = ['pos', 'neg']
training_data, training_labels = [], []
testing_data, testing_labels = [], []

In [3]:
# Read every review under review_polarity/txt_sentoken/<class>/ and route it
# to the test split (file names starting with 'cv4') or to the training
# split, labelling each review with the name of its class directory.

# Start from empty lists so that re-running this cell does not append every
# review a second time.
for split in (training_data, training_labels, testing_data, testing_labels):
    del split[:]

for c in classes:
    dirname = os.path.join('review_polarity', 'txt_sentoken', c)
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # order of the samples reproducible across machines and re-runs.
    for fname in sorted(os.listdir(dirname)):
        with open(os.path.join(dirname, fname), 'r') as f:
            content = f.read()
        # The file is already closed here; only the text is needed below.
        if fname.startswith('cv4'):
            testing_data.append(content)
            testing_labels.append(c)
        else:
            training_data.append(content)
            training_labels.append(c)

In [4]:
# Build the TF-IDF vocabulary from the training reviews only, then project
# the test reviews onto that same vocabulary.
#   min_df=5          -> drop terms found in fewer than 5 documents
#   max_df=0.8        -> drop terms found in more than 80% of documents
#   sublinear_tf=True -> use 1 + log(tf) instead of the raw term count
v = TfidfVectorizer(min_df=5, max_df=0.8, sublinear_tf=True, use_idf=True)
training_feature_vectors = v.fit_transform(training_data)
testing_feature_vectors = v.transform(testing_data)

In [20]:
# Train a linear SVM on the TF-IDF training vectors and report per-class
# precision / recall / F1 on the held-out test vectors.
# The solver behind LinearSVC uses a random number generator while fitting,
# so the seed is pinned to make the fitted weights (and the report printed
# below) repeatable from one run to the next.
classifier = svm.LinearSVC(random_state=0)
classifier.fit(training_feature_vectors, training_labels)
prediction = classifier.predict(testing_feature_vectors)
print(classification_report(testing_labels, prediction))

             precision    recall  f1-score   support

        neg       0.89      0.86      0.87       100
        pos       0.86      0.89      0.88       100

avg / total       0.88      0.88      0.87       200



In [21]:
# Convert the fitted scikit-learn classifier into a Core ML model, stamp the
# descriptive metadata onto it, and write it out as 'MovieReviewSentiment'.
coreml_model = coremltools.converters.sklearn.convert(classifier)
for meta_field, meta_value in (
    ('author', "Sam Davies"),
    ('license', "MIT"),
    ('short_description', "SVM for sentiment analysis of movie reviews."),
):
    setattr(coreml_model, meta_field, meta_value)
coreml_model.save('MovieReviewSentiment')

In [22]:
# Vectorise a single hand-written review with the already-fitted vectoriser
# and print the resulting sparse row (column index -> tf-idf weight).
sample_reviews = ["the single worst movie i've ever seen"]
test = v.transform(sample_reviews)
print(test)

  (0, 12378)	0.518288536482
  (0, 11890)	0.376546750519
  (0, 10092)	0.53459075347
  (0, 9776)	0.354964292634
  (0, 7290)	0.206622202884
  (0, 3873)	0.367572958962


In [23]:
# Classify the vectorised sample review; the value of this expression is the
# cell's output (the recorded run shows array(['neg'])).
classifier.predict(test)

array(['neg'],
      dtype='|S3')

In [None]:
# Hand-computed sublinear term-frequency scaling: each raw count tf in `t`
# becomes 1 + ln(tf), so a count of 1 stays at exactly 1.0.
t = [1, 1, 1, 2, 1]
sublin = 1 + np.log(t)
sublin

In [None]:
# Reproduce the vectoriser's weighting by hand: multiply the idf of five
# vocabulary columns by the sublinear tf values, then divide by the L2 norm.
# NOTE(review): these column indices are hard-coded and differ from the ones
# printed for the sample review above -- presumably they belong to another
# sentence or an earlier fit; confirm before relying on the numbers.
idf_columns = [11924, 8959, 5268, 1127, 884]
tfidf = v.idf_[idf_columns] * sublin
l2_norm = np.sqrt(np.sum(np.power(tfidf, 2)))
tfidf / l2_norm

In [None]:
# Build one record per vocabulary term -- the word, its column index in the
# feature matrix, and its idf weight -- so the vectoriser can be re-created
# outside Python from the exported JSON.
vocab = v.vocabulary_
# Cast to built-in int / float: after min_df / max_df pruning the vocabulary
# may hold NumPy integer scalars, which json refuses to serialise on
# Python 3 ("Object of type int64 is not JSON serializable").
feature_list = [
    {'word': word, 'index': int(index), 'idf': float(v.idf_[index])}
    for word, index in vocab.items()
]

# Show only the first few records; serialising the whole vocabulary into the
# cell output would flood the notebook.
json.dumps(feature_list[:5])

In [None]:
# Persist the per-term records as pretty-printed JSON (2-space indent).
serialized_features = json.dumps(feature_list, indent=2)
with open("features.json", "w") as features_file:
    features_file.write(serialized_features)