In [3]:
import json
# Load the positive and negative text samples. Each file is read inside a
# `with` block so its handle is closed as soon as parsing finishes instead of
# being left open until garbage collection.
with open("POSITIVE_list.json") as positive_file:
    POSITIVE_TEXT = json.load(positive_file)
with open("NEGATIVE_list.json") as negative_file:
    NEGATIVE_TEXT = json.load(negative_file)

In [4]:
# Peek at the first positive sample. The single-argument parenthesized form
# prints the same text under Python 2 and is also valid Python 3.
print(POSITIVE_TEXT[0])

My husband is a BMW mechanic and he drives cars all day.  I wish I could get him one for every car



In [5]:
# Peek at the first negative sample. The single-argument parenthesized form
# prints the same text under Python 2 and is also valid Python 3.
print(NEGATIVE_TEXT[0])

Far better models for similair price range - dissapointed with the quality of the finish etc etc. works fine though.



In [12]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer with library defaults; it is fitted in a later cell.
vectorizer = CountVectorizer()
# Show the default values of the settings that are tuned later on. A single
# parenthesized argument keeps this valid on both Python 2 and Python 3.
print({
    'min_df': vectorizer.min_df,
    'max_df': vectorizer.max_df,
    'ngram_range': vectorizer.ngram_range,
})

{'ngram_range': (1, 1), 'max_df': 1.0, 'min_df': 1}


In [82]:
# Fit the vectorizer on every document (positives first, then negatives) and
# keep the resulting feature matrix; the label list is built in the same order.
all_documents = POSITIVE_TEXT + NEGATIVE_TEXT
text_features = vectorizer.fit_transform(all_documents)


In [27]:
# Labels aligned with the feature rows: +1 for each positive document followed
# by -1 for each negative document.
positive_labels = [1] * len(POSITIVE_TEXT)
negative_labels = [-1] * len(NEGATIVE_TEXT)
target = positive_labels + negative_labels

In [20]:
from sklearn.linear_model import SGDClassifier
# Log-loss linear classifier trained by SGD with an elastic-net penalty.
# random_state is pinned so the estimator's internal shuffling uses the same
# random stream on every run, matching the fixed seed used for the CV split.
classifier = SGDClassifier(loss='log', penalty='elasticnet', random_state=0)
# Show the default regularisation settings that are tuned later on. A single
# parenthesized argument keeps this valid on both Python 2 and Python 3.
print({
    'alpha': classifier.alpha,
    'l1_ratio': classifier.l1_ratio,
})

{'alpha': 0.0001, 'l1_ratio': 0.15}


In [60]:
from sklearn import cross_validation
# Five shuffled train/test splits over all rows of the feature matrix, each
# holding out 30% of the rows; the seed is fixed so the splits are repeatable.
n_documents = text_features.shape[0]
cv_split = cross_validation.ShuffleSplit(
    n_documents, n_iter=5, test_size=0.3, random_state=0
)

In [32]:
# Score the baseline classifier once per split defined by cv_split.
cv_scores = cross_validation.cross_val_score(
    classifier,
    text_features,
    target,
    cv=cv_split,
)

In [34]:
import numpy
# Baseline: mean score across the splits. The single-argument parenthesized
# form prints the same under Python 2 and is also valid Python 3.
print(numpy.mean(cv_scores))

0.833289670659


In [61]:
import math
def sentiment_metric(positive, negative, assignments, random_state=0):
    """Return the mean cross-validation score for one set of hyperparameters.

    The documents are vectorized with a ``CountVectorizer`` and classified
    with a log-loss, elastic-net ``SGDClassifier``; both are configured from
    ``assignments``. Scores come from five shuffled splits that each hold out
    30% of the documents.

    Args:
        positive: list of documents labelled ``1``.
        negative: list of documents labelled ``-1``.
        assignments: mapping providing the keys
            ``min_ngram`` and ``ngram_offset`` (the n-gram range is
            ``(min_ngram, min_ngram + ngram_offset)``),
            ``log_min_df`` and ``df_offset`` (``min_df = exp(log_min_df)``,
            ``max_df = min_df + df_offset``),
            ``log_reg_coef`` (``alpha = exp(log_reg_coef)``) and
            ``l1_coef`` (passed through as ``l1_ratio``).
        random_state: seed handed to both the split generator and the
            classifier. The default of 0 keeps the split seed this function
            always used and additionally pins the classifier, which was
            previously unseeded, so repeated calls with the same assignments
            draw from the same random streams.

    Returns:
        The mean of the per-split scores, as computed by ``numpy.mean``.
    """
    # Vectorizer settings: the offsets are added to the minimums so the upper
    # bounds can never fall below the lower bounds.
    min_ngram = assignments['min_ngram']
    max_ngram = min_ngram + assignments['ngram_offset']
    min_doc_frequency = math.exp(assignments['log_min_df'])
    max_doc_frequency = min_doc_frequency + assignments['df_offset']
    vectorizer = CountVectorizer(
        min_df=min_doc_frequency,
        max_df=max_doc_frequency,
        ngram_range=(min_ngram, max_ngram),
    )
    # Positives first, then negatives; the labels below follow the same order.
    text_features = vectorizer.fit_transform(positive + negative)
    target = [1] * len(positive) + [-1] * len(negative)

    # The regularisation strength arrives in log space; the L1 ratio is used
    # as given.
    alpha = math.exp(assignments['log_reg_coef'])
    l1_ratio = assignments['l1_coef']
    classifier = SGDClassifier(
        loss='log',
        penalty='elasticnet',
        alpha=alpha,
        l1_ratio=l1_ratio,
        random_state=random_state,
    )
    cv = cross_validation.ShuffleSplit(
        text_features.shape[0],
        n_iter=5,
        test_size=0.3,
        random_state=random_state,
    )
    cv_scores = cross_validation.cross_val_score(classifier, text_features, target, cv=cv)
    return numpy.mean(cv_scores)

In [94]:
# insert your client token into sigopt_creds.py
# find your client token at sigopt.com/user/profile
from sigopt_creds import client_token
# Fail fast with an actionable message when the token is missing or empty.
# An explicit truthiness test replaces the former `assert`, because assertions
# are stripped when Python runs with -O and the check would silently vanish.
if not client_token:
    raise Exception("Insert your client token into sigopt_creds.py. "
                    "Find it at sigopt.com/user/profile")

In [46]:
import sigopt.interface
# Build the SigOpt connection object from the client token; the cells below
# use `conn` for every experiment, suggestion and observation call.
conn = sigopt.interface.Connection(client_token=client_token)

In [54]:
# Define the search space. Parameter names must match the keys that
# sentiment_metric reads from `assignments`. The regularisation strength and
# the minimum document frequency are searched in log space, hence the
# math.log() bounds.
experiment = conn.experiments().create(
    name='Sentiment LR Classifier',
    parameters=[{
        'name': 'l1_coef',
        'type': 'double',
        'bounds': {'min': 0, 'max': 1.0}
    }, {
        'name': 'log_reg_coef',
        'type': 'double',
        'bounds': {'min': math.log(0.000001), 'max': math.log(100.0)}
    }, {
        'name': 'min_ngram',
        'type': 'int',
        'bounds': {'min': 1, 'max': 2}
    }, {
        'name': 'ngram_offset',
        'type': 'int',
        'bounds': {'min': 0, 'max': 2}
    }, {
        'name': 'log_min_df',
        'type': 'double',
        'bounds': {'min': math.log(0.00000001), 'max': math.log(0.1)}
    }, {
        'name': 'df_offset',
        'type': 'double',
        'bounds': {'min': 0.01, 'max': 0.25}
    }],
)
# A single parenthesized argument keeps this valid on both Python 2 and 3.
print("View your experiment details at https://sigopt.com/experiment/{0}".format(experiment.id))

View your experiment details at https://sigopt.com/experiment/2294


In [79]:
# Optimisation loop, 60 rounds: request a suggestion, evaluate it locally,
# then report the resulting metric back against that suggestion.
for _ in range(60):
    suggestion = conn.experiments(experiment.id).suggestions().create()
    opt_metric = sentiment_metric(
        POSITIVE_TEXT,
        NEGATIVE_TEXT,
        suggestion.assignments,
    )
    conn.experiments(experiment.id).observations().create(
        value=opt_metric,
        suggestion=suggestion.id,
    )

In [80]:
# Re-fetch the experiment by id and rebind `experiment` to the result; the
# next cell reads `experiment.progress` from this fetched object.
experiment = conn.experiments(experiment.id).fetch()

In [81]:
# Best observation as reported by the fetched experiment's progress.
best_observation = experiment.progress.best_observation
# Render its assignments as sorted, indented JSON for readable output.
best_assignments_json = json.dumps(
    best_observation.assignments.to_json(),
    sort_keys=True,
    indent=4,
    separators=(',', ': ')
)
# A single parenthesized argument keeps this valid on both Python 2 and 3.
print("Best value: {value}, found at:\n{assignments}".format(
    value=best_observation.value,
    assignments=best_assignments_json
))

Best value: 0.862836826347, found at:
{
    "df_offset": 0.1090952897880146,
    "l1_coef": 0.01912058464824784,
    "log_min_df": -15.45840961635736,
    "log_reg_coef": -11.526470082037305,
    "min_ngram": 1,
    "ngram_offset": 1
}
