In [32]:
import sys
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
import coremltools

In [3]:
# The two review classes; the same strings are used as labels.
classes = ['pos', 'neg']

# Parallel containers: *_data holds review text, *_labels the matching class.
training_data, training_labels = [], []
testing_data, testing_labels = [], []

In [5]:
# Load every review under review_polarity/txt_sentoken/<class>/ and split it
# into the training or testing lists.

# Start from empty lists so that re-running this cell does not append every
# review a second time.
training_data, training_labels = [], []
testing_data, testing_labels = [], []

for c in classes:
    dirname = os.path.join('review_polarity', 'txt_sentoken', c)
    # os.listdir returns entries in arbitrary order; sort them so the row
    # order of the resulting data is the same on every run and machine.
    for fname in sorted(os.listdir(dirname)):
        with open(os.path.join(dirname, fname), 'r') as f:
            content = f.read()
        # Files whose name starts with 'cv4' are held out for testing;
        # everything else is used for training.
        if fname.startswith('cv4'):
            testing_data.append(content)
            testing_labels.append(c)
        else:
            training_data.append(content)
            training_labels.append(c)

In [13]:
# TF-IDF vectorizer: ignore terms that appear in fewer than 5 documents or in
# more than 80% of them, with sublinear tf scaling and idf weighting enabled.
v = TfidfVectorizer(
    use_idf=True,
    sublinear_tf=True,
    min_df=5,
    max_df=0.8,
)

# Fit on the training reviews only, then apply the same fitted vectorizer to
# the test reviews.
training_feature_vectors = v.fit_transform(training_data)
testing_feature_vectors = v.transform(testing_data)

In [14]:
# Show the repr of the TF-IDF training matrix (shape, dtype, number of stored
# elements and sparse storage format).
training_feature_vectors

<1800x12482 sparse matrix of type '<type 'numpy.float64'>'
	with 508736 stored elements in Compressed Sparse Row format>

In [15]:
# Vocabulary learned by the vectorizer: a dict mapping each term to an integer
# index.
# NOTE(review): this displays the entire mapping; consider showing only a
# small sample to keep the notebook output short.
v.vocabulary_

{u'askew': 726,
 u'woods': 12354,
 u'hanging': 5077,
 u'woody': 12355,
 u'comically': 2197,
 u'originality': 7793,
 u'rickman': 9332,
 u'bringing': 1429,
 u'four': 4489,
 u'wooden': 12353,
 u'crotch': 2666,
 u'stereotypical': 10592,
 u'frederick': 4529,
 u'cooking': 2476,
 u'joely': 6039,
 u'designing': 3023,
 u'succumb': 10791,
 u'shocks': 9969,
 u'china': 1904,
 u'confronts': 2337,
 u'wiseguy': 12308,
 u'natured': 7407,
 u'kids': 6183,
 u'uplifting': 11809,
 u'controversy': 2446,
 u'appropriately': 638,
 u'stern': 10594,
 u'dna': 3287,
 u'catchy': 1727,
 u'insecurity': 5758,
 u'music': 7337,
 u'therefore': 11204,
 u'violently': 11975,
 u'boorman': 1301,
 u'circumstances': 1981,
 u'morally': 7237,
 u'locked': 6584,
 u'locker': 6585,
 u'gershon': 4718,
 u'deputy': 2994,
 u'wang': 12076,
 u'want': 12080,
 u'absolute': 157,
 u'travel': 11471,
 u'copious': 2489,
 u'dared': 2785,
 u'dinosaurs': 3148,
 u'wrong': 12410,
 u'subplots': 10764,
 u'sickening': 10031,
 u'18th': 21,
 u'concoction':

In [16]:
# Array of idf weights learned when the vectorizer was fitted.
v.idf_

array([ 6.29887277,  4.11407071,  6.29887277, ...,  6.0111907 ,
        6.4166558 ,  6.4166558 ])

In [22]:
# List the feature names (terms) known to the fitted vectorizer.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out(); confirm against the installed version before
# upgrading. This also displays the full list; a slice would keep output short.
v.get_feature_names()

[u'00',
 u'000',
 u'007',
 u'10',
 u'100',
 u'1000',
 u'101',
 u'105',
 u'11',
 u'12',
 u'13',
 u'137',
 u'13th',
 u'14',
 u'15',
 u'16',
 u'16x9',
 u'17',
 u'17th',
 u'18',
 u'180',
 u'18th',
 u'19',
 u'1900',
 u'1939',
 u'1940',
 u'1947',
 u'1950',
 u'1950s',
 u'1960',
 u'1960s',
 u'1961',
 u'1962',
 u'1963',
 u'1964',
 u'1966',
 u'1967',
 u'1968',
 u'1969',
 u'1970',
 u'1970s',
 u'1971',
 u'1972',
 u'1973',
 u'1974',
 u'1975',
 u'1976',
 u'1977',
 u'1978',
 u'1979',
 u'1980',
 u'1980s',
 u'1981',
 u'1982',
 u'1983',
 u'1984',
 u'1985',
 u'1986',
 u'1987',
 u'1988',
 u'1989',
 u'1990',
 u'1990s',
 u'1991',
 u'1992',
 u'1993',
 u'1994',
 u'1995',
 u'1996',
 u'1997',
 u'1998',
 u'1999',
 u'19th',
 u'20',
 u'200',
 u'2000',
 u'2001',
 u'20th',
 u'21',
 u'22',
 u'23',
 u'24',
 u'25',
 u'25th',
 u'26',
 u'27',
 u'28',
 u'29',
 u'30',
 u'300',
 u'3000',
 u'31',
 u'33',
 u'35',
 u'36',
 u'37',
 u'40',
 u'400',
 u'45',
 u'48',
 u'4th',
 u'50',
 u'500',
 u'50s',
 u'54',
 u'57',
 u'60',
 u'600

In [20]:
# Number of idf weights; the recorded value equals the column count of the
# training matrix, i.e. one weight per feature.
len(v.idf_)

12482

In [31]:
# Train a linear SVM on the TF-IDF training vectors.
# random_state is fixed because LinearSVC's solver uses a random number
# generator while fitting, so an unseeded fit can give slightly different
# results from one run to the next.
classifier = svm.LinearSVC(random_state=0)
classifier.fit(training_feature_vectors, training_labels)

# Predict on the held-out vectors and print per-class precision/recall/F1.
prediction = classifier.predict(testing_feature_vectors)
print(classification_report(testing_labels, prediction))

             precision    recall  f1-score   support

        neg       0.89      0.86      0.87       100
        pos       0.86      0.89      0.88       100

avg / total       0.88      0.88      0.87       200



In [36]:
# Convert the trained classifier to a Core ML model. Only `classifier` is
# passed to the converter, so the TF-IDF vectorizer `v` is not part of the
# exported model.
coreml_model = coremltools.converters.sklearn.convert(classifier)

# Metadata stored alongside the model.
coreml_model.author = "Sam Davies"
coreml_model.license = "MIT"
coreml_model.short_description = "SVM for sentiment analysis of movie reviews."

# Save with the .mlmodel extension, which is the file type Core ML tooling
# expects for a model file.
coreml_model.save('MovieReviewSentiment.mlmodel')