In [6]:
import numpy as np
import re

In [7]:
import pickle
def save_obj(obj, name):
    """Pickle ``obj`` to ``data/<name>.pkl`` using the highest pickle protocol.

    Parameters
    ----------
    obj : any picklable object
        The object to serialise.
    name : str
        File stem; the file is written to ``'data/' + name + '.pkl'``
        relative to the current working directory.
    """
    target_path = ''.join(('data/', name, '.pkl'))
    with open(target_path, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Load and return the object pickled at ``data/<name>.pkl``.

    Parameters
    ----------
    name : str
        File stem; the file is read from ``'data/' + name + '.pkl'``
        relative to the current working directory.

    Returns
    -------
    The unpickled object.
    """
    source_path = ''.join(('data/', name, '.pkl'))
    # SECURITY: unpickling can execute arbitrary code; only load files you trust.
    with open(source_path, 'rb') as source:
        loaded = pickle.load(source)
    return loaded

In [8]:
# Load the pickled training set from data/posts_training_data.pkl.
data = load_obj('posts_training_data')

In [9]:
# Labels, plus an array holding one row of feature values per entry of data['feature'].
target = data['sentiment']
features = np.array([list(row.values()) for row in data['feature']])
print(features.shape)

(338, 11)


In [10]:
# List the feature names, taken from the keys of the first feature entry.
for key in data['feature'][0]:
    print(key)

num_slang
question_mark
neg
pos
number_number
pos_strength
neg_strength
pos_vs_neg_strength
pos_vs_neg
exclamation
avg_word_len


In [43]:
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
import csv
import os.path
from sklearn import preprocessing
def _write_grid_scores(output_file, grid_scores, clf_name, feature_names, feature_scaled, cv):
    """Log one CSV row per grid point to ``output_file``; write the header first if the file is new."""
    is_new_file = not os.path.isfile(output_file)
    # newline='' is what the csv module expects, so it controls line endings itself.
    with open(output_file, 'w' if is_new_file else 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if is_new_file:
            writer.writerow(["classifier", "parameters", "features", "feature_scaled",
                             "cv", "avg_accuracy", "std_accuracy"])
        for score in grid_scores:
            params, avg_accuracy, fold_scores = score[0], score[1], score[2]
            writer.writerow([clf_name, params, feature_names,
                             feature_scaled, cv, avg_accuracy, np.std(fold_scores)])


def grid_search(clf, features_dict_list, target, param_grid, cv=None,
                scoring=None, feature_scaled=False, output=True, output_file=None, minmax=False):
    """Run a cross-validated grid search and optionally log every grid point to a CSV file.

    Parameters
    ----------
    clf : estimator
        Estimator handed to ``GridSearchCV``.
    features_dict_list : sequence of dict
        One ``{feature_name: value}`` dict per sample. The keys of the first
        dict fix the column order used for every sample.
    target : sequence
        Labels, aligned with ``features_dict_list``.
    param_grid : dict
        Parameter grid handed to ``GridSearchCV``.
    cv : optional
        Cross-validation setting; any falsy value means 10.
    scoring : str, optional
        Scoring name; any falsy value means ``'accuracy'``. The CSV columns
        are named ``avg_accuracy`` / ``std_accuracy`` whatever is passed here.
    feature_scaled : bool
        If true, standardise the features with ``preprocessing.scale``.
    output : bool
        If true, write the results to ``output_file`` (created with a header
        if missing, appended to otherwise).
    output_file : str, optional
        CSV path; any falsy value means ``'data/classifier_selection.csv'``.
    minmax : bool
        If true, rescale the features with ``MinMaxScaler`` (applied after the
        standardisation when both flags are set). Not recorded in the CSV.

    Returns
    -------
    The fitted ``GridSearchCV`` object. The best score and best parameters are
    also printed.

    Raises
    ------
    ValueError
        If ``features_dict_list`` is empty.
    KeyError
        If a sample lacks one of the keys of the first sample.
    """
    if not cv:
        cv = 10
    if not scoring:
        scoring = 'accuracy'
    if not output_file:
        output_file = 'data/classifier_selection.csv'
    if len(features_dict_list) == 0:
        raise ValueError("features_dict_list must contain at least one sample")

    # Read every sample through the same key list so that columns stay aligned
    # (and match the logged feature names) even if dicts differ in key order.
    feature_keys = list(features_dict_list[0].keys())
    features = np.array([[feature[key] for key in feature_keys]
                         for feature in features_dict_list])

    # NOTE(review): both scalers are fit on the full data set before
    # cross-validation, so validation folds influence the scaling. Wrapping the
    # scaler and clf in a Pipeline would avoid that but changes param_grid keys.
    if feature_scaled:
        features = preprocessing.scale(features)
    if minmax:
        scaler = MinMaxScaler()
        features = scaler.fit_transform(features)
    feature_names = ', '.join(feature_keys)

    gs_clf = GridSearchCV(estimator=clf, param_grid=param_grid,
                          n_jobs=-1, cv=cv, scoring=scoring)
    gs_clf.fit(features, target)

    if output:
        # Relies on the legacy ``grid_scores_`` attribute of the GridSearchCV
        # imported from sklearn.grid_search: (params, mean score, per-fold scores).
        _write_grid_scores(output_file, gs_clf.grid_scores_, type(clf).__name__,
                           feature_names, feature_scaled, cv)

    print(gs_clf.best_score_)
    print(gs_clf.best_params_)
    return gs_clf

### Get subsets of feature dictionary

In [12]:
from itertools import chain, combinations
def all_subsets(ss):
    """Return an iterator over every subset of ``ss``, as tuples, smallest first.

    The empty tuple comes first, then all 1-element combinations, then all
    2-element combinations, and so on up to the full set. Within a size,
    combinations follow the iteration order of ``ss``.

    Parameters
    ----------
    ss : iterable
        Any iterable (list, dict keys view, generator, ...). It is copied into
        a list up front, so one-shot iterators are supported too.

    Returns
    -------
    itertools.chain
        Lazy iterator yielding ``2 ** len(ss)`` tuples.
    """
    items = list(ss)
    return chain.from_iterable(
        combinations(items, size) for size in range(len(items) + 1)
    )

In [15]:
# Preview every subset of the feature names (the empty one included), one list per line.
for subset in all_subsets(data['feature'][0].keys()):
    print(list(subset))

[]
['num_slang']
['question_mark']
['neg']
['pos']
['number_number']
['pos_strength']
['neg_strength']
['pos_vs_neg_strength']
['pos_vs_neg']
['exclamation']
['avg_word_len']
['num_slang', 'question_mark']
['num_slang', 'neg']
['num_slang', 'pos']
['num_slang', 'number_number']
['num_slang', 'pos_strength']
['num_slang', 'neg_strength']
['num_slang', 'pos_vs_neg_strength']
['num_slang', 'pos_vs_neg']
['num_slang', 'exclamation']
['num_slang', 'avg_word_len']
['question_mark', 'neg']
['question_mark', 'pos']
['question_mark', 'number_number']
['question_mark', 'pos_strength']
['question_mark', 'neg_strength']
['question_mark', 'pos_vs_neg_strength']
['question_mark', 'pos_vs_neg']
['question_mark', 'exclamation']
['question_mark', 'avg_word_len']
['neg', 'pos']
['neg', 'number_number']
['neg', 'pos_strength']
['neg', 'neg_strength']
['neg', 'pos_vs_neg_strength']
['neg', 'pos_vs_neg']
['neg', 'exclamation']
['neg', 'avg_word_len']
['pos', 'number_number']
['pos', 'pos_strength']
['pos',

In [16]:
# Materialise every subset of the feature names into a list.
subsets_tuple = all_subsets(data['feature'][0].keys())
subsets = list(subsets_tuple)
# all_subsets yields the empty subset first; pop(0) removes it from the list
# and, as the cell's last expression, displays it.
subsets.pop(0)

()

In [17]:
# Display the list of feature-name subsets.
subsets

[('num_slang',),
 ('question_mark',),
 ('neg',),
 ('pos',),
 ('number_number',),
 ('pos_strength',),
 ('neg_strength',),
 ('pos_vs_neg_strength',),
 ('pos_vs_neg',),
 ('exclamation',),
 ('avg_word_len',),
 ('num_slang', 'question_mark'),
 ('num_slang', 'neg'),
 ('num_slang', 'pos'),
 ('num_slang', 'number_number'),
 ('num_slang', 'pos_strength'),
 ('num_slang', 'neg_strength'),
 ('num_slang', 'pos_vs_neg_strength'),
 ('num_slang', 'pos_vs_neg'),
 ('num_slang', 'exclamation'),
 ('num_slang', 'avg_word_len'),
 ('question_mark', 'neg'),
 ('question_mark', 'pos'),
 ('question_mark', 'number_number'),
 ('question_mark', 'pos_strength'),
 ('question_mark', 'neg_strength'),
 ('question_mark', 'pos_vs_neg_strength'),
 ('question_mark', 'pos_vs_neg'),
 ('question_mark', 'exclamation'),
 ('question_mark', 'avg_word_len'),
 ('neg', 'pos'),
 ('neg', 'number_number'),
 ('neg', 'pos_strength'),
 ('neg', 'neg_strength'),
 ('neg', 'pos_vs_neg_strength'),
 ('neg', 'pos_vs_neg'),
 ('neg', 'exclamation')

### SVM

In [62]:
from sklearn import svm
# Candidate values for the SVM's C parameter: 10 points log-spaced from 10**-1 to 10**2.
Cs = np.logspace(-1, 2, 10)
# NOTE(review): gammas is not passed to any grid_search call visible in this notebook - confirm it is still needed.
gammas = [0.001, 0.0001]
# probability=True asks SVC to enable probability estimates.
svc = svm.SVC(probability=True)
# Disabled call that would use grid_search's defaults (10-fold CV, unscaled features, CSV logging on).
#clf = grid_search(svc,data['feature'],data['sentiment'],dict(C=Cs))

In [63]:
# 5-fold search over C on all features, standardised; output=False skips the CSV log.
clf = grid_search(
    svc,
    data['feature'],
    data['sentiment'],
    {'C': Cs},
    cv=5,
    feature_scaled=True,
    output=False,
)

0.772189349112


In [58]:
# For every feature subset, grid-search C under each combination of
# (unscaled, standardised) features x (default 10-fold, 5-fold) cross-validation.
# Each grid_search call also appends its scores to the default CSV log.
for subset in subsets:
    # Restrict every sample's feature dict to the keys of this subset.
    subset_feature = [{k: feature[k] for k in subset} for feature in data['feature']]
    print(subset)
    # Run order: unscaled/10-fold, unscaled/5-fold, scaled/10-fold, scaled/5-fold.
    for scaled in (False, True):
        for folds in (None, 5):  # None falls back to grid_search's default of 10
            grid_search(svc, subset_feature, data['sentiment'], dict(C=Cs),
                        cv=folds, feature_scaled=scaled)

('exclamation',)
0.621301775148
0.621301775148




0.621301775148




0.621301775148
('question_mark',)
0.674556213018
0.674556213018




0.674556213018




0.674556213018
('pos_vs_neg',)
0.730769230769
0.730769230769
0.730769230769
0.736686390533
('pos_strength',)
0.650887573964
0.633136094675




0.621301775148




0.621301775148
('pos_vs_neg_strength',)
0.751479289941
0.751479289941
0.718934911243
0.727810650888
('pos',)
0.621301775148
0.621301775148
0.647928994083
0.650887573964
('num_slang',)
0.621301775148
0.621301775148




0.630177514793




0.621301775148
('avg_word_len',)
0.621301775148
0.621301775148
0.621301775148
0.621301775148
('neg',)
0.698224852071
0.695266272189
0.698224852071
0.710059171598
('neg_strength',)
0.671597633136
0.656804733728




0.686390532544




0.686390532544
('number_number',)
0.621301775148
0.621301775148




0.62426035503




0.621301775148
('exclamation', 'question_mark')
0.686390532544
0.683431952663




0.692307692308




0.692307692308
('exclamation', 'pos_vs_neg')
0.739644970414
0.733727810651
0.733727810651
0.727810650888
('exclamation', 'pos_strength')
0.636094674556
0.644970414201




0.633136094675




0.627218934911
('exclamation', 'pos_vs_neg_strength')
0.733727810651
0.739644970414
0.721893491124
0.727810650888
('exclamation', 'pos')
0.621301775148
0.621301775148
0.621301775148
0.621301775148
('exclamation', 'num_slang')
0.621301775148
0.621301775148




0.621301775148




0.621301775148
('exclamation', 'avg_word_len')
0.621301775148
0.621301775148
0.621301775148
0.621301775148
('exclamation', 'neg')
0.689349112426
0.680473372781
0.707100591716
0.718934911243
('exclamation', 'neg_strength')
0.662721893491
0.656804733728




0.695266272189




0.704142011834
('exclamation', 'number_number')
0.621301775148
0.621301775148




0.627218934911




0.630177514793
('question_mark', 'pos_vs_neg')
0.751479289941
0.739644970414
0.745562130178
0.736686390533
('question_mark', 'pos_strength')
0.662721893491
0.633136094675




0.683431952663




0.683431952663
('question_mark', 'pos_vs_neg_strength')
0.766272189349
0.760355029586
0.713017751479
0.721893491124
('question_mark', 'pos')
0.677514792899
0.674556213018
0.674556213018
0.677514792899
('question_mark', 'num_slang')
0.686390532544
0.665680473373




0.692307692308




0.668639053254
('question_mark', 'avg_word_len')
0.677514792899
0.677514792899
0.686390532544
0.677514792899
('question_mark', 'neg')
0.692307692308
0.692307692308
0.745562130178
0.742603550296
('question_mark', 'neg_strength')
0.701183431953
0.695266272189




0.713017751479




0.701183431953
('question_mark', 'number_number')
0.668639053254
0.656804733728




0.686390532544




0.686390532544
('pos_vs_neg', 'pos_strength')
0.727810650888
0.724852071006
0.724852071006
0.724852071006
('pos_vs_neg', 'pos_vs_neg_strength')
0.745562130178
0.745562130178
0.736686390533
0.727810650888
('pos_vs_neg', 'pos')
0.730769230769
0.733727810651
0.718934911243
0.730769230769
('pos_vs_neg', 'num_slang')
0.730769230769
0.733727810651
0.730769230769
0.730769230769
('pos_vs_neg', 'avg_word_len')
0.736686390533
0.733727810651
0.715976331361
0.724852071006
('pos_vs_neg', 'neg')
0.730769230769
0.733727810651
0.718934911243
0.730769230769
('pos_vs_neg', 'neg_strength')
0.692307692308
0.707100591716
0.742603550296
0.739644970414
('pos_vs_neg', 'number_number')
0.650887573964
0.621301775148
0.724852071006
0.724852071006
('pos_strength', 'pos_vs_neg_strength')
0.727810650888
0.739644970414
0.742603550296
0.745562130178
('pos_strength', 'pos')
0.647928994083
0.62426035503
0.633136094675
0.644970414201
('pos_strength', 'num_slang')
0.644970414201
0.630177514793




0.621301775148




0.621301775148
('pos_strength', 'avg_word_len')
0.647928994083
0.636094674556
0.621301775148
0.621301775148
('pos_strength', 'neg')
0.647928994083
0.62426035503
0.727810650888
0.733727810651
('pos_strength', 'neg_strength')
0.683431952663
0.683431952663




0.766272189349




0.760355029586
('pos_strength', 'number_number')
0.621301775148
0.621301775148




0.627218934911




0.627218934911
('pos_vs_neg_strength', 'pos')
0.727810650888
0.721893491124
0.751479289941
0.733727810651
('pos_vs_neg_strength', 'num_slang')
0.713017751479
0.715976331361
0.713017751479
0.727810650888
('pos_vs_neg_strength', 'avg_word_len')
0.727810650888
0.733727810651
0.710059171598
0.736686390533
('pos_vs_neg_strength', 'neg')
0.775147928994
0.778106508876
0.760355029586
0.763313609467
('pos_vs_neg_strength', 'neg_strength')
0.686390532544
0.686390532544
0.754437869822
0.760355029586
('pos_vs_neg_strength', 'number_number')
0.665680473373
0.633136094675
0.710059171598
0.713017751479
('pos', 'num_slang')
0.621301775148
0.621301775148
0.621301775148
0.621301775148
('pos', 'avg_word_len')
0.621301775148
0.621301775148
0.64201183432
0.656804733728
('pos', 'neg')
0.707100591716
0.707100591716
0.715976331361
0.736686390533
('pos', 'neg_strength')
0.680473372781
0.665680473373
0.698224852071
0.713017751479
('pos', 'number_number')
0.621301775148
0.621301775148
0.627218934911
0.6213017751



0.677514792899




0.677514792899
('num_slang', 'number_number')
0.621301775148
0.621301775148




0.636094674556




0.639053254438
('avg_word_len', 'neg')
0.695266272189
0.695266272189
0.698224852071
0.704142011834
('avg_word_len', 'neg_strength')
0.662721893491
0.656804733728
0.656804733728
0.665680473373
('avg_word_len', 'number_number')
0.627218934911
0.621301775148
0.621301775148
0.621301775148
('neg', 'neg_strength')
0.680473372781
0.665680473373
0.727810650888
0.721893491124
('neg', 'number_number')
0.639053254438
0.621301775148
0.701183431953
0.704142011834
('neg_strength', 'number_number')
0.639053254438
0.621301775148




0.680473372781




0.680473372781
('exclamation', 'question_mark', 'pos_vs_neg')
0.742603550296
0.736686390533
0.736686390533
0.736686390533
('exclamation', 'question_mark', 'pos_strength')
0.647928994083
0.639053254438




0.686390532544




0.686390532544
('exclamation', 'question_mark', 'pos_vs_neg_strength')
0.718934911243
0.715976331361
0.704142011834
0.695266272189
('exclamation', 'question_mark', 'pos')
0.689349112426
0.689349112426
0.689349112426
0.680473372781
('exclamation', 'question_mark', 'num_slang')
0.686390532544
0.683431952663




0.683431952663




0.671597633136
('exclamation', 'question_mark', 'avg_word_len')
0.689349112426
0.689349112426
0.677514792899
0.680473372781
('exclamation', 'question_mark', 'neg')
0.701183431953
0.689349112426
0.730769230769
0.724852071006
('exclamation', 'question_mark', 'neg_strength')
0.713017751479
0.701183431953




0.730769230769




0.724852071006
('exclamation', 'question_mark', 'number_number')
0.644970414201
0.656804733728




0.698224852071




0.698224852071
('exclamation', 'pos_vs_neg', 'pos_strength')
0.718934911243
0.736686390533
0.713017751479
0.730769230769
('exclamation', 'pos_vs_neg', 'pos_vs_neg_strength')
0.748520710059
0.748520710059
0.724852071006
0.715976331361
('exclamation', 'pos_vs_neg', 'pos')
0.733727810651
0.730769230769
0.727810650888
0.724852071006
('exclamation', 'pos_vs_neg', 'num_slang')
0.724852071006
0.721893491124
0.724852071006
0.721893491124
('exclamation', 'pos_vs_neg', 'avg_word_len')
0.721893491124
0.724852071006
0.707100591716
0.710059171598
('exclamation', 'pos_vs_neg', 'neg')
0.733727810651
0.730769230769
0.707100591716
0.730769230769
('exclamation', 'pos_vs_neg', 'neg_strength')
0.692307692308
0.713017751479
0.724852071006
0.733727810651
('exclamation', 'pos_vs_neg', 'number_number')
0.653846153846
0.621301775148
0.718934911243
0.713017751479
('exclamation', 'pos_strength', 'pos_vs_neg_strength')
0.745562130178
0.757396449704
0.766272189349
0.778106508876
('exclamation', 'pos_strength', 'po



0.621301775148




0.621301775148
('exclamation', 'pos_strength', 'avg_word_len')
0.636094674556
0.644970414201
0.621301775148
0.621301775148
('exclamation', 'pos_strength', 'neg')
0.64201183432
0.64201183432
0.733727810651
0.742603550296
('exclamation', 'pos_strength', 'neg_strength')
0.692307692308
0.701183431953




0.754437869822




0.760355029586
('exclamation', 'pos_strength', 'number_number')
0.621301775148
0.621301775148




0.627218934911




0.627218934911
('exclamation', 'pos_vs_neg_strength', 'pos')
0.718934911243
0.721893491124
0.721893491124
0.710059171598
('exclamation', 'pos_vs_neg_strength', 'num_slang')
0.707100591716
0.715976331361
0.701183431953
0.707100591716
('exclamation', 'pos_vs_neg_strength', 'avg_word_len')
0.745562130178
0.754437869822
0.715976331361
0.727810650888
('exclamation', 'pos_vs_neg_strength', 'neg')
0.754437869822
0.757396449704
0.763313609467
0.760355029586
('exclamation', 'pos_vs_neg_strength', 'neg_strength')
0.698224852071
0.713017751479
0.748520710059
0.751479289941
('exclamation', 'pos_vs_neg_strength', 'number_number')
0.668639053254
0.674556213018
0.715976331361
0.710059171598
('exclamation', 'pos', 'num_slang')
0.621301775148
0.621301775148
0.621301775148
0.621301775148
('exclamation', 'pos', 'avg_word_len')
0.621301775148
0.621301775148
0.633136094675
0.636094674556
('exclamation', 'pos', 'neg')
0.689349112426
0.689349112426
0.721893491124
0.736686390533
('exclamation', 'pos', 'neg_st



0.686390532544




0.692307692308
('exclamation', 'num_slang', 'number_number')
0.621301775148
0.621301775148




0.62426035503




0.636094674556
('exclamation', 'avg_word_len', 'neg')
0.689349112426
0.671597633136
0.715976331361
0.715976331361
('exclamation', 'avg_word_len', 'neg_strength')
0.659763313609
0.665680473373
0.668639053254
0.683431952663
('exclamation', 'avg_word_len', 'number_number')
0.621301775148
0.621301775148
0.621301775148
0.627218934911
('exclamation', 'neg', 'neg_strength')
0.677514792899
0.668639053254
0.715976331361
0.721893491124
('exclamation', 'neg', 'number_number')
0.621301775148
0.621301775148
0.707100591716
0.721893491124
('exclamation', 'neg_strength', 'number_number')
0.674556213018
0.683431952663




0.689349112426




0.701183431953
('question_mark', 'pos_vs_neg', 'pos_strength')
0.698224852071
0.692307692308
0.739644970414
0.742603550296
('question_mark', 'pos_vs_neg', 'pos_vs_neg_strength')
0.772189349112
0.757396449704
0.763313609467
0.748520710059
('question_mark', 'pos_vs_neg', 'pos')
0.748520710059
0.736686390533
0.754437869822
0.754437869822
('question_mark', 'pos_vs_neg', 'num_slang')
0.751479289941
0.739644970414
0.748520710059
0.724852071006
('question_mark', 'pos_vs_neg', 'avg_word_len')
0.739644970414
0.730769230769
0.730769230769
0.736686390533
('question_mark', 'pos_vs_neg', 'neg')
0.745562130178
0.736686390533
0.742603550296
0.739644970414
('question_mark', 'pos_vs_neg', 'neg_strength')
0.748520710059
0.751479289941
0.766272189349
0.769230769231
('question_mark', 'pos_vs_neg', 'number_number')
0.656804733728
0.644970414201
0.733727810651
0.739644970414
('question_mark', 'pos_strength', 'pos_vs_neg_strength')
0.748520710059
0.733727810651
0.736686390533
0.757396449704
('question_mark',



0.680473372781




0.674556213018
('question_mark', 'pos_strength', 'avg_word_len')
0.710059171598
0.683431952663
0.689349112426
0.698224852071
('question_mark', 'pos_strength', 'neg')
0.656804733728
0.636094674556
0.727810650888
0.721893491124
('question_mark', 'pos_strength', 'neg_strength')
0.707100591716
0.677514792899




0.784023668639




0.781065088757
('question_mark', 'pos_strength', 'number_number')
0.627218934911
0.621301775148




0.665680473373




0.671597633136
('question_mark', 'pos_vs_neg_strength', 'pos')
0.733727810651
0.742603550296
0.781065088757
0.778106508876
('question_mark', 'pos_vs_neg_strength', 'num_slang')
0.736686390533
0.742603550296
0.710059171598
0.695266272189
('question_mark', 'pos_vs_neg_strength', 'avg_word_len')
0.742603550296
0.754437869822
0.698224852071
0.721893491124
('question_mark', 'pos_vs_neg_strength', 'neg')
0.766272189349
0.778106508876
0.789940828402
0.781065088757
('question_mark', 'pos_vs_neg_strength', 'neg_strength')
0.724852071006
0.707100591716
0.789940828402
0.789940828402
('question_mark', 'pos_vs_neg_strength', 'number_number')
0.671597633136
0.665680473373
0.713017751479
0.710059171598
('question_mark', 'pos', 'num_slang')
0.683431952663
0.665680473373
0.671597633136
0.656804733728
('question_mark', 'pos', 'avg_word_len')
0.677514792899
0.674556213018
0.665680473373
0.668639053254
('question_mark', 'pos', 'neg')
0.724852071006
0.710059171598
0.754437869822
0.742603550296
('question_m



0.724852071006




0.707100591716
('question_mark', 'num_slang', 'number_number')
0.656804733728
0.647928994083




0.701183431953




0.689349112426
('question_mark', 'avg_word_len', 'neg')
0.698224852071
0.704142011834
0.736686390533
0.742603550296
('question_mark', 'avg_word_len', 'neg_strength')
0.710059171598
0.689349112426
0.718934911243
0.715976331361
('question_mark', 'avg_word_len', 'number_number')
0.665680473373
0.653846153846
0.686390532544
0.683431952663
('question_mark', 'neg', 'neg_strength')
0.707100591716
0.695266272189
0.757396449704
0.745562130178
('question_mark', 'neg', 'number_number')
0.662721893491
0.662721893491
0.736686390533
0.745562130178
('question_mark', 'neg_strength', 'number_number')
0.647928994083
0.647928994083




0.704142011834




0.692307692308
('pos_vs_neg', 'pos_strength', 'pos_vs_neg_strength')
0.760355029586
0.766272189349
0.754437869822
0.763313609467
('pos_vs_neg', 'pos_strength', 'pos')
0.724852071006
0.724852071006
0.742603550296
0.754437869822
('pos_vs_neg', 'pos_strength', 'num_slang')
0.707100591716
0.701183431953
0.713017751479
0.721893491124
('pos_vs_neg', 'pos_strength', 'avg_word_len')
0.730769230769
0.730769230769
0.733727810651
0.730769230769
('pos_vs_neg', 'pos_strength', 'neg')
0.718934911243
0.724852071006
0.736686390533
0.751479289941
('pos_vs_neg', 'pos_strength', 'neg_strength')
0.707100591716
0.710059171598
0.763313609467
0.763313609467
('pos_vs_neg', 'pos_strength', 'number_number')
0.621301775148
0.621301775148
0.710059171598
0.715976331361
('pos_vs_neg', 'pos_vs_neg_strength', 'pos')
0.742603550296
0.742603550296
0.766272189349
0.766272189349
('pos_vs_neg', 'pos_vs_neg_strength', 'num_slang')
0.736686390533
0.736686390533
0.718934911243
0.718934911243
('pos_vs_neg', 'pos_vs_neg_streng



0.754437869822




0.757396449704
('pos_strength', 'num_slang', 'number_number')
0.621301775148
0.621301775148




0.633136094675




0.62426035503
('pos_strength', 'avg_word_len', 'neg')
0.647928994083
0.630177514793
0.715976331361
0.724852071006
('pos_strength', 'avg_word_len', 'neg_strength')
0.698224852071
0.701183431953
0.760355029586
0.760355029586
('pos_strength', 'avg_word_len', 'number_number')
0.621301775148
0.621301775148
0.621301775148
0.621301775148
('pos_strength', 'neg', 'neg_strength')
0.701183431953
0.704142011834
0.786982248521
0.781065088757
('pos_strength', 'neg', 'number_number')
0.621301775148
0.621301775148
0.710059171598
0.715976331361
('pos_strength', 'neg_strength', 'number_number')
0.653846153846
0.627218934911




0.748520710059




0.748520710059
('pos_vs_neg_strength', 'pos', 'num_slang')
0.715976331361
0.721893491124
0.727810650888
0.724852071006
('pos_vs_neg_strength', 'pos', 'avg_word_len')
0.727810650888
0.730769230769
0.727810650888
0.736686390533
('pos_vs_neg_strength', 'pos', 'neg')
0.763313609467
0.769230769231
0.748520710059
0.769230769231
('pos_vs_neg_strength', 'pos', 'neg_strength')
0.701183431953
0.698224852071
0.757396449704
0.778106508876
('pos_vs_neg_strength', 'pos', 'number_number')
0.668639053254
0.633136094675
0.736686390533
0.715976331361
('pos_vs_neg_strength', 'num_slang', 'avg_word_len')
0.721893491124
0.736686390533
0.721893491124
0.721893491124
('pos_vs_neg_strength', 'num_slang', 'neg')
0.757396449704
0.754437869822
0.739644970414
0.733727810651
('pos_vs_neg_strength', 'num_slang', 'neg_strength')
0.692307692308
0.674556213018
0.730769230769
0.727810650888
('pos_vs_neg_strength', 'num_slang', 'number_number')
0.695266272189
0.668639053254
0.695266272189
0.695266272189
('pos_vs_neg_stre



0.674556213018




0.680473372781
('avg_word_len', 'neg', 'neg_strength')
0.674556213018
0.665680473373
0.724852071006
0.730769230769
('avg_word_len', 'neg', 'number_number')
0.647928994083
0.621301775148
0.701183431953
0.713017751479
('avg_word_len', 'neg_strength', 'number_number')
0.650887573964
0.630177514793
0.653846153846
0.653846153846
('neg', 'neg_strength', 'number_number')
0.668639053254
0.64201183432
0.707100591716
0.707100591716
('exclamation', 'question_mark', 'pos_vs_neg', 'pos_strength')
0.704142011834
0.704142011834
0.730769230769
0.739644970414
('exclamation', 'question_mark', 'pos_vs_neg', 'pos_vs_neg_strength')
0.742603550296
0.748520710059
0.754437869822
0.754437869822
('exclamation', 'question_mark', 'pos_vs_neg', 'pos')
0.745562130178
0.739644970414
0.742603550296
0.751479289941
('exclamation', 'question_mark', 'pos_vs_neg', 'num_slang')
0.745562130178
0.742603550296
0.745562130178
0.733727810651
('exclamation', 'question_mark', 'pos_vs_neg', 'avg_word_len')
0.736686390533
0.7455621



0.689349112426




0.674556213018
('exclamation', 'question_mark', 'pos_strength', 'avg_word_len')
0.695266272189
0.680473372781
0.680473372781
0.698224852071
('exclamation', 'question_mark', 'pos_strength', 'neg')
0.653846153846
0.64201183432
0.739644970414
0.751479289941
('exclamation', 'question_mark', 'pos_strength', 'neg_strength')
0.707100591716
0.698224852071




0.778106508876




0.804733727811
('exclamation', 'question_mark', 'pos_strength', 'number_number')
0.633136094675
0.621301775148




0.686390532544




0.689349112426
('exclamation', 'question_mark', 'pos_vs_neg_strength', 'pos')
0.707100591716
0.715976331361
0.751479289941
0.760355029586
('exclamation', 'question_mark', 'pos_vs_neg_strength', 'num_slang')
0.718934911243
0.718934911243
0.695266272189
0.701183431953
('exclamation', 'question_mark', 'pos_vs_neg_strength', 'avg_word_len')
0.733727810651
0.736686390533
0.704142011834
0.713017751479
('exclamation', 'question_mark', 'pos_vs_neg_strength', 'neg')
0.748520710059
0.757396449704
0.784023668639
0.769230769231
('exclamation', 'question_mark', 'pos_vs_neg_strength', 'neg_strength')
0.715976331361
0.718934911243
0.792899408284
0.798816568047
('exclamation', 'question_mark', 'pos_vs_neg_strength', 'number_number')
0.683431952663
0.671597633136
0.698224852071
0.692307692308
('exclamation', 'question_mark', 'pos', 'num_slang')
0.683431952663
0.680473372781
0.674556213018
0.659763313609
('exclamation', 'question_mark', 'pos', 'avg_word_len')
0.692307692308
0.686390532544
0.668639053254



0.713017751479




0.718934911243
('exclamation', 'question_mark', 'num_slang', 'number_number')
0.647928994083
0.647928994083




0.692307692308




0.683431952663
('exclamation', 'question_mark', 'avg_word_len', 'neg')
0.695266272189
0.686390532544
0.733727810651
0.736686390533
('exclamation', 'question_mark', 'avg_word_len', 'neg_strength')
0.710059171598
0.704142011834
0.730769230769
0.733727810651
('exclamation', 'question_mark', 'avg_word_len', 'number_number')
0.659763313609
0.665680473373
0.674556213018
0.686390532544
('exclamation', 'question_mark', 'neg', 'neg_strength')
0.710059171598
0.715976331361
0.751479289941
0.751479289941
('exclamation', 'question_mark', 'neg', 'number_number')
0.650887573964
0.659763313609
0.718934911243
0.733727810651
('exclamation', 'question_mark', 'neg_strength', 'number_number')
0.647928994083
0.659763313609




0.710059171598




0.710059171598
('exclamation', 'pos_vs_neg', 'pos_strength', 'pos_vs_neg_strength')
0.745562130178
0.748520710059
0.754437869822
0.766272189349
('exclamation', 'pos_vs_neg', 'pos_strength', 'pos')
0.713017751479
0.730769230769
0.733727810651
0.742603550296
('exclamation', 'pos_vs_neg', 'pos_strength', 'num_slang')
0.718934911243
0.710059171598
0.707100591716
0.713017751479
('exclamation', 'pos_vs_neg', 'pos_strength', 'avg_word_len')
0.733727810651
0.727810650888
0.724852071006
0.727810650888
('exclamation', 'pos_vs_neg', 'pos_strength', 'neg')
0.713017751479
0.733727810651
0.739644970414
0.751479289941
('exclamation', 'pos_vs_neg', 'pos_strength', 'neg_strength')
0.710059171598
0.713017751479
0.757396449704
0.760355029586
('exclamation', 'pos_vs_neg', 'pos_strength', 'number_number')
0.621301775148
0.621301775148
0.713017751479
0.710059171598
('exclamation', 'pos_vs_neg', 'pos_vs_neg_strength', 'pos')
0.733727810651
0.745562130178
0.742603550296
0.757396449704
('exclamation', 'pos_vs_



0.745562130178




0.748520710059
('exclamation', 'pos_strength', 'num_slang', 'number_number')
0.621301775148
0.621301775148




0.621301775148




0.621301775148
('exclamation', 'pos_strength', 'avg_word_len', 'neg')
0.653846153846
0.662721893491
0.718934911243
0.739644970414
('exclamation', 'pos_strength', 'avg_word_len', 'neg_strength')
0.704142011834
0.710059171598
0.754437869822
0.760355029586
('exclamation', 'pos_strength', 'avg_word_len', 'number_number')
0.621301775148
0.621301775148
0.621301775148
0.621301775148
('exclamation', 'pos_strength', 'neg', 'neg_strength')
0.698224852071
0.715976331361
0.757396449704
0.778106508876
('exclamation', 'pos_strength', 'neg', 'number_number')
0.621301775148
0.621301775148
0.721893491124
0.727810650888
('exclamation', 'pos_strength', 'neg_strength', 'number_number')
0.668639053254
0.659763313609




0.739644970414




0.754437869822
('exclamation', 'pos_vs_neg_strength', 'pos', 'num_slang')
0.715976331361
0.721893491124
0.715976331361
0.713017751479
('exclamation', 'pos_vs_neg_strength', 'pos', 'avg_word_len')
0.748520710059
0.748520710059
0.724852071006
0.730769230769
('exclamation', 'pos_vs_neg_strength', 'pos', 'neg')
0.751479289941
0.748520710059
0.742603550296
0.760355029586
('exclamation', 'pos_vs_neg_strength', 'pos', 'neg_strength')
0.718934911243
0.727810650888
0.742603550296
0.769230769231
('exclamation', 'pos_vs_neg_strength', 'pos', 'number_number')
0.680473372781
0.674556213018
0.707100591716
0.715976331361
('exclamation', 'pos_vs_neg_strength', 'num_slang', 'avg_word_len')
0.718934911243
0.724852071006
0.715976331361
0.710059171598
('exclamation', 'pos_vs_neg_strength', 'num_slang', 'neg')
0.718934911243
0.721893491124
0.733727810651
0.736686390533
('exclamation', 'pos_vs_neg_strength', 'num_slang', 'neg_strength')
0.707100591716
0.710059171598
0.704142011834
0.710059171598
('exclamati



0.686390532544




0.689349112426
('exclamation', 'avg_word_len', 'neg', 'neg_strength')
0.659763313609
0.668639053254
0.718934911243
0.721893491124
('exclamation', 'avg_word_len', 'neg', 'number_number')
0.627218934911
0.62426035503
0.698224852071
0.710059171598
('exclamation', 'avg_word_len', 'neg_strength', 'number_number')
0.689349112426
0.689349112426
0.668639053254
0.674556213018
('exclamation', 'neg', 'neg_strength', 'number_number')
0.683431952663
0.680473372781
0.698224852071
0.695266272189
('question_mark', 'pos_vs_neg', 'pos_strength', 'pos_vs_neg_strength')
0.739644970414
0.730769230769
0.748520710059
0.754437869822
('question_mark', 'pos_vs_neg', 'pos_strength', 'pos')
0.704142011834
0.701183431953
0.751479289941
0.763313609467
('question_mark', 'pos_vs_neg', 'pos_strength', 'num_slang')
0.713017751479
0.721893491124
0.742603550296
0.739644970414
('question_mark', 'pos_vs_neg', 'pos_strength', 'avg_word_len')
0.745562130178
0.724852071006
0.733727810651
0.745562130178
('question_mark', 'pos_



0.775147928994




0.784023668639
('question_mark', 'pos_strength', 'num_slang', 'number_number')
0.633136094675
0.621301775148




0.686390532544




0.674556213018
('question_mark', 'pos_strength', 'avg_word_len', 'neg')
0.710059171598
0.695266272189
0.745562130178
0.754437869822
('question_mark', 'pos_strength', 'avg_word_len', 'neg_strength')
0.727810650888
0.707100591716
0.772189349112
0.784023668639
('question_mark', 'pos_strength', 'avg_word_len', 'number_number')
0.630177514793
0.621301775148
0.668639053254
0.680473372781
('question_mark', 'pos_strength', 'neg', 'neg_strength')
0.727810650888
0.701183431953
0.784023668639
0.786982248521
('question_mark', 'pos_strength', 'neg', 'number_number')
0.627218934911
0.621301775148
0.724852071006
0.721893491124
('question_mark', 'pos_strength', 'neg_strength', 'number_number')
0.662721893491
0.656804733728




0.760355029586




0.766272189349
('question_mark', 'pos_vs_neg_strength', 'pos', 'num_slang')
0.715976331361
0.736686390533
0.757396449704
0.766272189349
('question_mark', 'pos_vs_neg_strength', 'pos', 'avg_word_len')
0.745562130178
0.748520710059
0.742603550296
0.745562130178
('question_mark', 'pos_vs_neg_strength', 'pos', 'neg')
0.766272189349
0.775147928994
0.792899408284
0.789940828402
('question_mark', 'pos_vs_neg_strength', 'pos', 'neg_strength')
0.724852071006
0.715976331361
0.786982248521
0.786982248521
('question_mark', 'pos_vs_neg_strength', 'pos', 'number_number')
0.677514792899
0.668639053254
0.754437869822
0.748520710059
('question_mark', 'pos_vs_neg_strength', 'num_slang', 'avg_word_len')
0.742603550296
0.736686390533
0.713017751479
0.739644970414
('question_mark', 'pos_vs_neg_strength', 'num_slang', 'neg')
0.751479289941
0.757396449704
0.772189349112
0.766272189349
('question_mark', 'pos_vs_neg_strength', 'num_slang', 'neg_strength')
0.721893491124
0.698224852071
0.751479289941
0.73668639



0.713017751479




0.715976331361
('question_mark', 'avg_word_len', 'neg', 'neg_strength')
0.715976331361
0.692307692308
0.751479289941
0.754437869822
('question_mark', 'avg_word_len', 'neg', 'number_number')
0.665680473373
0.659763313609
0.730769230769
0.745562130178
('question_mark', 'avg_word_len', 'neg_strength', 'number_number')
0.647928994083
0.662721893491
0.695266272189
0.707100591716
('question_mark', 'neg', 'neg_strength', 'number_number')
0.653846153846
0.656804733728
0.745562130178
0.745562130178
('pos_vs_neg', 'pos_strength', 'pos_vs_neg_strength', 'pos')
0.763313609467
0.775147928994
0.760355029586
0.775147928994
('pos_vs_neg', 'pos_strength', 'pos_vs_neg_strength', 'num_slang')
0.742603550296
0.751479289941
0.730769230769
0.748520710059
('pos_vs_neg', 'pos_strength', 'pos_vs_neg_strength', 'avg_word_len')
0.748520710059
0.751479289941
0.739644970414
0.736686390533
('pos_vs_neg', 'pos_strength', 'pos_vs_neg_strength', 'neg')
0.763313609467
0.778106508876
0.775147928994
0.769230769231
('pos_



0.751479289941




0.745562130178
('pos_strength', 'avg_word_len', 'neg', 'neg_strength')
0.710059171598
0.713017751479
0.754437869822
0.775147928994
('pos_strength', 'avg_word_len', 'neg', 'number_number')
0.621301775148
0.621301775148
0.713017751479
0.707100591716
('pos_strength', 'avg_word_len', 'neg_strength', 'number_number')
0.665680473373
0.64201183432
0.748520710059
0.742603550296
('pos_strength', 'neg', 'neg_strength', 'number_number')
0.665680473373
0.639053254438
0.754437869822
0.751479289941
('pos_vs_neg_strength', 'pos', 'num_slang', 'avg_word_len')
0.718934911243
0.727810650888
0.724852071006
0.724852071006
('pos_vs_neg_strength', 'pos', 'num_slang', 'neg')
0.754437869822
0.757396449704
0.742603550296
0.751479289941
('pos_vs_neg_strength', 'pos', 'num_slang', 'neg_strength')
0.704142011834
0.686390532544
0.730769230769
0.733727810651
('pos_vs_neg_strength', 'pos', 'num_slang', 'number_number')
0.701183431953
0.668639053254
0.713017751479
0.727810650888
('pos_vs_neg_strength', 'pos', 'avg_wo



0.775147928994




0.789940828402
('exclamation', 'question_mark', 'pos_strength', 'num_slang', 'number_number')
0.639053254438
0.636094674556




0.677514792899




0.680473372781
('exclamation', 'question_mark', 'pos_strength', 'avg_word_len', 'neg')
0.683431952663
0.671597633136
0.754437869822
0.757396449704
('exclamation', 'question_mark', 'pos_strength', 'avg_word_len', 'neg_strength')
0.713017751479
0.710059171598
0.781065088757
0.789940828402
('exclamation', 'question_mark', 'pos_strength', 'avg_word_len', 'number_number')
0.650887573964
0.621301775148
0.677514792899
0.680473372781
('exclamation', 'question_mark', 'pos_strength', 'neg', 'neg_strength')
0.715976331361
0.713017751479
0.778106508876
0.778106508876
('exclamation', 'question_mark', 'pos_strength', 'neg', 'number_number')
0.647928994083
0.621301775148
0.721893491124
0.713017751479
('exclamation', 'question_mark', 'pos_strength', 'neg_strength', 'number_number')
0.674556213018
0.668639053254




0.769230769231




0.786982248521
('exclamation', 'question_mark', 'pos_vs_neg_strength', 'pos', 'num_slang')
0.715976331361
0.718934911243
0.742603550296
0.730769230769
('exclamation', 'question_mark', 'pos_vs_neg_strength', 'pos', 'avg_word_len')
0.739644970414
0.739644970414
0.727810650888
0.736686390533
('exclamation', 'question_mark', 'pos_vs_neg_strength', 'pos', 'neg')
0.739644970414
0.754437869822
0.789940828402
0.795857988166
('exclamation', 'question_mark', 'pos_vs_neg_strength', 'pos', 'neg_strength')
0.710059171598
0.727810650888
0.784023668639
0.792899408284
('exclamation', 'question_mark', 'pos_vs_neg_strength', 'pos', 'number_number')
0.689349112426
0.668639053254
0.736686390533
0.736686390533
('exclamation', 'question_mark', 'pos_vs_neg_strength', 'num_slang', 'avg_word_len')
0.739644970414
0.733727810651
0.707100591716
0.727810650888
('exclamation', 'question_mark', 'pos_vs_neg_strength', 'num_slang', 'neg')
0.748520710059
0.727810650888
0.760355029586
0.763313609467
('exclamation', 'que



0.701183431953




0.707100591716
('exclamation', 'question_mark', 'avg_word_len', 'neg', 'neg_strength')
0.710059171598
0.707100591716
0.754437869822
0.760355029586
('exclamation', 'question_mark', 'avg_word_len', 'neg', 'number_number')
0.665680473373
0.668639053254
0.727810650888
0.742603550296
('exclamation', 'question_mark', 'avg_word_len', 'neg_strength', 'number_number')
0.650887573964
0.650887573964
0.707100591716
0.704142011834
('exclamation', 'question_mark', 'neg', 'neg_strength', 'number_number')
0.653846153846
0.656804733728
0.745562130178
0.751479289941
('exclamation', 'pos_vs_neg', 'pos_strength', 'pos_vs_neg_strength', 'pos')
0.742603550296
0.745562130178
0.751479289941
0.769230769231
('exclamation', 'pos_vs_neg', 'pos_strength', 'pos_vs_neg_strength', 'num_slang')
0.751479289941
0.739644970414
0.713017751479
0.733727810651
('exclamation', 'pos_vs_neg', 'pos_strength', 'pos_vs_neg_strength', 'avg_word_len')
0.760355029586
0.766272189349
0.730769230769
0.733727810651
('exclamation', 'pos_v



0.742603550296




0.739644970414
('exclamation', 'pos_strength', 'avg_word_len', 'neg', 'neg_strength')
0.713017751479
0.715976331361
0.754437869822
0.760355029586
('exclamation', 'pos_strength', 'avg_word_len', 'neg', 'number_number')
0.621301775148
0.621301775148
0.718934911243
0.733727810651
('exclamation', 'pos_strength', 'avg_word_len', 'neg_strength', 'number_number')
0.677514792899
0.662721893491
0.751479289941
0.760355029586
('exclamation', 'pos_strength', 'neg', 'neg_strength', 'number_number')
0.677514792899
0.668639053254
0.748520710059
0.757396449704
('exclamation', 'pos_vs_neg_strength', 'pos', 'num_slang', 'avg_word_len')
0.713017751479
0.721893491124
0.713017751479
0.718934911243
('exclamation', 'pos_vs_neg_strength', 'pos', 'num_slang', 'neg')
0.715976331361
0.730769230769
0.733727810651
0.748520710059
('exclamation', 'pos_vs_neg_strength', 'pos', 'num_slang', 'neg_strength')
0.710059171598
0.710059171598
0.721893491124
0.730769230769
('exclamation', 'pos_vs_neg_strength', 'pos', 'num_sl



0.748520710059




0.757396449704
('question_mark', 'pos_strength', 'avg_word_len', 'neg', 'neg_strength')
0.736686390533
0.704142011834
0.789940828402
0.789940828402
('question_mark', 'pos_strength', 'avg_word_len', 'neg', 'number_number')
0.62426035503
0.621301775148
0.742603550296
0.745562130178
('question_mark', 'pos_strength', 'avg_word_len', 'neg_strength', 'number_number')
0.668639053254
0.659763313609
0.757396449704
0.766272189349
('question_mark', 'pos_strength', 'neg', 'neg_strength', 'number_number')
0.671597633136
0.662721893491
0.760355029586
0.763313609467
('question_mark', 'pos_vs_neg_strength', 'pos', 'num_slang', 'avg_word_len')
0.742603550296
0.736686390533
0.730769230769
0.742603550296
('question_mark', 'pos_vs_neg_strength', 'pos', 'num_slang', 'neg')
0.742603550296
0.739644970414
0.769230769231
0.781065088757
('question_mark', 'pos_vs_neg_strength', 'pos', 'num_slang', 'neg_strength')
0.715976331361
0.710059171598
0.760355029586
0.772189349112
('question_mark', 'pos_vs_neg_strength',



0.757396449704




0.760355029586
('exclamation', 'question_mark', 'pos_strength', 'avg_word_len', 'neg', 'neg_strength')
0.727810650888
0.715976331361
0.786982248521
0.792899408284
('exclamation', 'question_mark', 'pos_strength', 'avg_word_len', 'neg', 'number_number')
0.639053254438
0.621301775148
0.745562130178
0.754437869822
('exclamation', 'question_mark', 'pos_strength', 'avg_word_len', 'neg_strength', 'number_number')
0.677514792899
0.677514792899
0.757396449704
0.778106508876
('exclamation', 'question_mark', 'pos_strength', 'neg', 'neg_strength', 'number_number')
0.683431952663
0.677514792899
0.751479289941
0.763313609467
('exclamation', 'question_mark', 'pos_vs_neg_strength', 'pos', 'num_slang', 'avg_word_len')
0.730769230769
0.724852071006
0.733727810651
0.742603550296
('exclamation', 'question_mark', 'pos_vs_neg_strength', 'pos', 'num_slang', 'neg')
0.733727810651
0.727810650888
0.769230769231
0.778106508876
('exclamation', 'question_mark', 'pos_vs_neg_strength', 'pos', 'num_slang', 'neg_stren

In [70]:
# Class-probability estimate for the first training record.
# predict_proba expects a 2-D (n_samples, n_features) input, so the single
# feature vector is wrapped in a list to form one row instead of being passed
# as a flat 1-D sequence (deprecated in scikit-learn 0.17, rejected later).
clf.predict_proba([list(data['feature'][0].values())])



array([[ 0.26056572,  0.73943428]])

### AdaBoost

In [77]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Hyper-parameter candidates for the AdaBoost grid search.
# Weak learners: decision trees capped at depth 1, 2 and 3.
base_estimators = [DecisionTreeClassifier(max_depth=depth) for depth in (1, 2, 3)]
# Ten ensemble sizes spread evenly over [1, 20], truncated to integers.
n_estimators = np.linspace(start=1, stop=20, num=10).astype(int)
# Boosted classifier to be tuned (only constructed here, not fitted).
bdt = AdaBoostClassifier(learning_rate=0.1)

In [None]:
# Grid-search AdaBoost on every feature subset, first on the raw features and
# then with feature_scaled=True; both runs use the same output file.
for subset in subsets:
    # Keep only the keys of the current subset in each feature dict.
    subset_feature = np.array([
        {key: record[key] for key in subset}
        for record in data['feature']
    ])
    print(subset)
    for use_scaling in (False, True):
        grid_search(bdt, subset_feature, data['sentiment'],
                    param_grid=dict(base_estimator=base_estimators, n_estimators=n_estimators),
                    feature_scaled=use_scaling,
                    output_file='data/adaboost_classifier_selection.csv')

### Logistic Regression

In [18]:
from sklearn import linear_model
# Logistic-regression estimator built with library defaults.
lr = linear_model.LogisticRegression()
# 20 candidate values for C, log-spaced between 10**-1 and 10**2.
Cs = np.logspace(start=-1, stop=2, num=20)

In [26]:
# Search C for logistic regression on the complete feature set.
# output=False is passed; it appears to disable grid_search's CSV logging.
grid_search(lr, data['feature'], data['sentiment'],
            param_grid={'C': Cs}, output=False)

0.778106508876
{'C': 0.10000000000000001}


GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'C': array([   0.1    ,    0.14384,    0.20691,    0.29764,    0.42813,
          0.61585,    0.88587,    1.27427,    1.83298,    2.63665,
          3.79269,    5.45559,    7.8476 ,   11.28838,   16.23777,
         23.35721,   33.59818,   48.3293 ,   69.51928,  100.     ])},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [None]:
# Grid-search logistic regression over C on every feature subset, first on the
# raw features and then with feature_scaled=True; both runs use the same file.
for subset in subsets:
    # Keep only the keys of the current subset in each feature dict.
    subset_feature = np.array([
        {key: record[key] for key in subset}
        for record in data['feature']
    ])
    print(subset)
    for use_scaling in (False, True):
        grid_search(lr, subset_feature, data['sentiment'],
                    param_grid={'C': Cs},
                    feature_scaled=use_scaling,
                    output_file='data/logistic_classifier_selection.csv')

### Naive Bayes

In [29]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes estimator built with the library's default settings.
GNB = GaussianNB()

In [30]:
# Cross-validate Gaussian Naive Bayes on the complete feature set. The
# parameter grid is empty, so only the default configuration is evaluated.
grid_search(GNB, data['feature'], data['sentiment'],
            param_grid={}, output=False)

0.689349112426
{}


GridSearchCV(cv=10, error_score='raise', estimator=GaussianNB(),
       fit_params={}, iid=True, n_jobs=-1, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [None]:
# Evaluate Gaussian Naive Bayes (empty parameter grid) on every feature subset,
# first on the raw features and then with feature_scaled=True.
for subset in subsets:
    # Keep only the keys of the current subset in each feature dict.
    subset_feature = np.array([
        {key: record[key] for key in subset}
        for record in data['feature']
    ])
    print(subset)
    for use_scaling in (False, True):
        grid_search(GNB, subset_feature, data['sentiment'],
                    param_grid={},
                    feature_scaled=use_scaling,
                    output_file='data/naive_bayes_classifier_selection.csv')

In [55]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes estimator built with library defaults.
MNB = MultinomialNB()
# 20 candidate smoothing values, log-spaced between 10**-3 and 10**1.
# NOTE(review): the recorded output of the next cell lists alphas up to 1e+02,
# which does not match this range - that output looks stale; re-run to confirm.
alphas = np.logspace(start=-3, stop=1, num=20)

In [56]:
# Search alpha for Multinomial Naive Bayes on the complete feature set.
# With both flags set, grid_search standardises the features first and then
# applies MinMaxScaler, so the estimator receives the min-max rescaled values.
grid_search(MNB, data['feature'], data['sentiment'],
            param_grid={'alpha': alphas},
            feature_scaled=True, minmax=True, output=False)

0.647928994083
{'alpha': 0.001}


GridSearchCV(cv=10, error_score='raise',
       estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'alpha': array([  1.00000e-03,   1.83298e-03,   3.35982e-03,   6.15848e-03,
         1.12884e-02,   2.06914e-02,   3.79269e-02,   6.95193e-02,
         1.27427e-01,   2.33572e-01,   4.28133e-01,   7.84760e-01,
         1.43845e+00,   2.63665e+00,   4.83293e+00,   8.85867e+00,
         1.62378e+01,   2.97635e+01,   5.45559e+01,   1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [None]:
# Grid-search Multinomial Naive Bayes over alpha on every feature subset.
# minmax=True is always set; the search runs once without and once with
# feature_scaled=True. Results go to the same file as the Gaussian NB runs.
for subset in subsets:
    # Keep only the keys of the current subset in each feature dict.
    subset_feature = np.array([
        {key: record[key] for key in subset}
        for record in data['feature']
    ])
    print(subset)
    for use_scaling in (False, True):
        grid_search(MNB, subset_feature, data['sentiment'],
                    param_grid={'alpha': alphas},
                    minmax=True,
                    feature_scaled=use_scaling,
                    output_file='data/naive_bayes_classifier_selection.csv')

### Decision Tree

In [66]:
from sklearn import tree
# Decision-tree estimator built with library defaults.
dt = tree.DecisionTreeClassifier()
# Candidate tree depths: 1 up to (number of keys in the first feature dict) - 1.
max_depths = np.arange(
    1,
    len(data['feature'][0]),
)

In [67]:
# Search max_depth for the decision tree on the complete feature set.
# output=False is passed; it appears to disable grid_search's CSV logging.
grid_search(dt, data['feature'], data['sentiment'],
            param_grid={'max_depth': max_depths}, output=False)

0.751479289941
{'max_depth': 3}


GridSearchCV(cv=10, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [68]:
# Exploratory pass: search max_depth for the decision tree on every feature
# subset, with output=False on every call.
for subset in subsets:
    # Keep only the keys of the current subset in each feature dict.
    subset_feature = np.array([
        {key: record[key] for key in subset}
        for record in data['feature']
    ])
    print(subset)
    grid_search(dt, subset_feature, data['sentiment'],
                param_grid={'max_depth': max_depths}, output=False)

('num_slang',)
0.621301775148
{'max_depth': 5}
('question_mark',)
0.677514792899
{'max_depth': 1}
('neg',)
0.689349112426
{'max_depth': 2}
('pos',)
0.621301775148
{'max_depth': 1}
('number_number',)
0.618343195266
{'max_depth': 2}
('pos_strength',)
0.630177514793
{'max_depth': 4}
('neg_strength',)
0.653846153846
{'max_depth': 1}
('pos_vs_neg_strength',)
0.742603550296
{'max_depth': 2}
('pos_vs_neg',)
0.721893491124
{'max_depth': 1}
('exclamation',)
0.621301775148
{'max_depth': 1}
('avg_word_len',)
0.621301775148
{'max_depth': 1}
('num_slang', 'question_mark')
0.692307692308
{'max_depth': 3}
('num_slang', 'neg')
0.689349112426
{'max_depth': 2}
('num_slang', 'pos')
0.621301775148
{'max_depth': 1}
('num_slang', 'number_number')
0.615384615385
{'max_depth': 1}
('num_slang', 'pos_strength')
0.633136094675
{'max_depth': 5}
('num_slang', 'neg_strength')
0.653846153846
{'max_depth': 1}
('num_slang', 'pos_vs_neg_strength')
0.742603550296
{'max_depth': 2}
('num_slang', 'pos_vs_neg')
0.7278106508

Process ForkPoolWorker-1116:
Process ForkPoolWorker-1113:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/Cellar/python3/3.5.1/Frameworks/Python.framework/Versions/3.5/lib/python3.5/multiprocessing/process.py", line 254, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python3/3.5.1/Frameworks/Python.framework/Versions/3.5/lib/python3.5/multiprocessing/process.py", line 254, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python3/3.5.1/Frameworks/Python.framework/Versions/3.5/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/Cellar/python3/3.5.1/Frameworks/Python.framework/Versions/3.5/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/Cellar/python3/3.5.1/Frameworks/Python.framework/Versions/3.5/lib/python3.5/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/usr/loc

KeyboardInterrupt: 

In [None]:
# Evaluate the decision tree on every feature subset, first on the raw
# features and then with feature_scaled=True; both runs use the same file.
# NOTE(review): the parameter grid is empty, so max_depth stays at the
# estimator's default here, unlike the exploratory search that tunes it over
# max_depths - confirm this is intended.
for subset in subsets:
    # Keep only the keys of the current subset in each feature dict.
    subset_feature = np.array([
        {key: record[key] for key in subset}
        for record in data['feature']
    ])
    print(subset)
    for use_scaling in (False, True):
        grid_search(dt, subset_feature, data['sentiment'],
                    param_grid={},
                    feature_scaled=use_scaling,
                    output_file='data/decision_tree_classifier_selection.csv')