In [6]:
from sklearn import feature_extraction
from sklearn import svm
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [2]:
def load_file(filename):
    """Read a ``score,text`` file and return the texts with binary tags.

    Every non-blank line is split at its first comma. The part before the
    comma is parsed as a float score: a score >= 0.5 is tagged 'pos', any
    other score 'neg'. The part after the comma is the text; further commas
    stay part of the text. A line without any comma is treated as a score
    with empty text.

    Args:
        filename: Path of the file to read.

    Returns:
        A tuple ``(X, y)`` of two parallel lists: ``X`` holds the text of
        each line (without the separating comma or the line terminator) and
        ``y`` holds the matching 'pos' / 'neg' tag.

    Raises:
        ValueError: If the part before the first comma is not a valid float.
    """
    X = []
    y = []

    with open(filename) as f:
        for line in f:
            # Blank lines (e.g. a stray newline at the end of the file) carry
            # no sample; skip them instead of failing in float().
            if not line.strip():
                continue

            # Split on the first comma only, so commas inside the text survive.
            value, _, text = line.partition(',')

            y.append('pos' if float(value) >= 0.5 else 'neg')
            # Keep only the text itself: no delimiter, no line terminator.
            X.append(text.rstrip('\r\n'))

    return X, y


In [18]:
# Load the train, dev and test splits (in that order) as (texts, tags) pairs.
(X_train, y_train), (X_dev, y_dev), (X_test, y_test) = map(
    load_file,
    ('sst_train_phrases.csv', 'sst_dev.csv', 'sst_test.csv'),
)

In [19]:
# Unigram vectorizers with English stop words removed:
#   vectorizer      -> binary bag of words (1 if the word occurs, else 0)
#   vectorizer_freq -> TF-IDF weighted bag of words
vectorizer = feature_extraction.text.CountVectorizer(ngram_range = (1,1), stop_words = 'english', binary = True)
vectorizer_freq = TfidfVectorizer(ngram_range = (1,1), stop_words = 'english')

# Fit on the training texts only. Learning the vocabulary (and, for TF-IDF,
# the document frequencies) from the dev and test texts would leak held-out
# data into the features.
vectorizer.fit(X_train)
vectorizer_freq.fit(X_train)

#bag of words vectors
# NOTE: this rebinds X_train / X_dev / X_test from lists of raw text to sparse
# matrices, so the loading cell has to be re-run before running this cell again.
X_train = (vectorizer.transform(X_train))
X_dev = (vectorizer.transform(X_dev))
X_test = (vectorizer.transform(X_test))

# frequency bag of words vectors
# (disabled; these need the raw texts, i.e. they must run before the rebinding above)
#X_train_fbow = (vectorizer_freq.transform(X_train)).toarray()
#X_dev_fbow = (vectorizer_freq.transform(X_dev)).toarray()
#X_test_fbow = (vectorizer_freq.transform(X_test)).toarray()


In [32]:
# Sweep the regularisation parameter C of an L1-penalised logistic regression
# over 1000 evenly spaced values in [1e-5, 1] and remember the value that
# gives the highest accuracy on the dev split.
alpha_list = np.linspace(1e-5, 1, 1000)
num_iterations = len(alpha_list)  # countdown shown in the progress line
best_alpha, best_score = 0, 0

for alpha in alpha_list:
    clf = LogisticRegression(
        penalty='l1',
        dual=False,
        C=alpha,
        solver='liblinear',
        max_iter=1000,
        random_state=0,
    )
    y_pred = clf.fit(X_train, y_train).predict(X_dev)
    score = metrics.accuracy_score(y_dev, y_pred)

    # Strict comparison: on a tie the earlier (smaller) C is kept.
    if score > best_score:
        best_alpha, best_score = alpha, score

    print('Score: ', score, ' Current_Best: ', best_score, ' Left: ', num_iterations, sep='')
    num_iterations -= 1

Score: 0.4890909090909091 Current_Best: 0.4890909090909091 Left: 1000
Score: 0.5109090909090909 Current_Best: 0.5109090909090909 Left: 999
Score: 0.5118181818181818 Current_Best: 0.5118181818181818 Left: 998
Score: 0.5272727272727272 Current_Best: 0.5272727272727272 Left: 997
Score: 0.5327272727272727 Current_Best: 0.5327272727272727 Left: 996
Score: 0.5345454545454545 Current_Best: 0.5345454545454545 Left: 995
Score: 0.5409090909090909 Current_Best: 0.5409090909090909 Left: 994
Score: 0.5445454545454546 Current_Best: 0.5445454545454546 Left: 993
Score: 0.5527272727272727 Current_Best: 0.5527272727272727 Left: 992
Score: 0.5554545454545454 Current_Best: 0.5554545454545454 Left: 991
Score: 0.5627272727272727 Current_Best: 0.5627272727272727 Left: 990
Score: 0.5727272727272728 Current_Best: 0.5727272727272728 Left: 989
Score: 0.5754545454545454 Current_Best: 0.5754545454545454 Left: 988
Score: 0.5872727272727273 Current_Best: 0.5872727272727273 Left: 987
Score: 0.5954545454545455 Current

Score: 0.7181818181818181 Current_Best: 0.7181818181818181 Left: 879
Score: 0.7181818181818181 Current_Best: 0.7181818181818181 Left: 878
Score: 0.7181818181818181 Current_Best: 0.7181818181818181 Left: 877
Score: 0.7181818181818181 Current_Best: 0.7181818181818181 Left: 876
Score: 0.72 Current_Best: 0.72 Left: 875
Score: 0.72 Current_Best: 0.72 Left: 874
Score: 0.7209090909090909 Current_Best: 0.7209090909090909 Left: 873
Score: 0.7209090909090909 Current_Best: 0.7209090909090909 Left: 872
Score: 0.7190909090909091 Current_Best: 0.7209090909090909 Left: 871
Score: 0.7190909090909091 Current_Best: 0.7209090909090909 Left: 870
Score: 0.7190909090909091 Current_Best: 0.7209090909090909 Left: 869
Score: 0.7190909090909091 Current_Best: 0.7209090909090909 Left: 868
Score: 0.7190909090909091 Current_Best: 0.7209090909090909 Left: 867
Score: 0.7190909090909091 Current_Best: 0.7209090909090909 Left: 866
Score: 0.7190909090909091 Current_Best: 0.7209090909090909 Left: 865
Score: 0.719090909090

Score: 0.7209090909090909 Current_Best: 0.7236363636363636 Left: 756
Score: 0.7209090909090909 Current_Best: 0.7236363636363636 Left: 755
Score: 0.7209090909090909 Current_Best: 0.7236363636363636 Left: 754
Score: 0.72 Current_Best: 0.7236363636363636 Left: 753
Score: 0.72 Current_Best: 0.7236363636363636 Left: 752
Score: 0.72 Current_Best: 0.7236363636363636 Left: 751
Score: 0.7209090909090909 Current_Best: 0.7236363636363636 Left: 750
Score: 0.7209090909090909 Current_Best: 0.7236363636363636 Left: 749
Score: 0.7209090909090909 Current_Best: 0.7236363636363636 Left: 748
Score: 0.72 Current_Best: 0.7236363636363636 Left: 747
Score: 0.72 Current_Best: 0.7236363636363636 Left: 746
Score: 0.72 Current_Best: 0.7236363636363636 Left: 745
Score: 0.72 Current_Best: 0.7236363636363636 Left: 744
Score: 0.7209090909090909 Current_Best: 0.7236363636363636 Left: 743
Score: 0.7209090909090909 Current_Best: 0.7236363636363636 Left: 742
Score: 0.72 Current_Best: 0.7236363636363636 Left: 741
Score: 0

Score: 0.7281818181818182 Current_Best: 0.730909090909091 Left: 631
Score: 0.7281818181818182 Current_Best: 0.730909090909091 Left: 630
Score: 0.7281818181818182 Current_Best: 0.730909090909091 Left: 629
Score: 0.7281818181818182 Current_Best: 0.730909090909091 Left: 628
Score: 0.7272727272727273 Current_Best: 0.730909090909091 Left: 627
Score: 0.7272727272727273 Current_Best: 0.730909090909091 Left: 626
Score: 0.7263636363636363 Current_Best: 0.730909090909091 Left: 625
Score: 0.7263636363636363 Current_Best: 0.730909090909091 Left: 624
Score: 0.7263636363636363 Current_Best: 0.730909090909091 Left: 623
Score: 0.7263636363636363 Current_Best: 0.730909090909091 Left: 622
Score: 0.7263636363636363 Current_Best: 0.730909090909091 Left: 621
Score: 0.7263636363636363 Current_Best: 0.730909090909091 Left: 620
Score: 0.7272727272727273 Current_Best: 0.730909090909091 Left: 619
Score: 0.7272727272727273 Current_Best: 0.730909090909091 Left: 618
Score: 0.7272727272727273 Current_Best: 0.730909

Score: 0.7318181818181818 Current_Best: 0.7345454545454545 Left: 507
Score: 0.7318181818181818 Current_Best: 0.7345454545454545 Left: 506
Score: 0.7318181818181818 Current_Best: 0.7345454545454545 Left: 505
Score: 0.7318181818181818 Current_Best: 0.7345454545454545 Left: 504
Score: 0.7318181818181818 Current_Best: 0.7345454545454545 Left: 503
Score: 0.7318181818181818 Current_Best: 0.7345454545454545 Left: 502
Score: 0.7336363636363636 Current_Best: 0.7345454545454545 Left: 501
Score: 0.7336363636363636 Current_Best: 0.7345454545454545 Left: 500
Score: 0.7336363636363636 Current_Best: 0.7345454545454545 Left: 499
Score: 0.7336363636363636 Current_Best: 0.7345454545454545 Left: 498
Score: 0.7336363636363636 Current_Best: 0.7345454545454545 Left: 497
Score: 0.7345454545454545 Current_Best: 0.7345454545454545 Left: 496
Score: 0.7345454545454545 Current_Best: 0.7345454545454545 Left: 495
Score: 0.7354545454545455 Current_Best: 0.7354545454545455 Left: 494
Score: 0.7354545454545455 Current_

Score: 0.7354545454545455 Current_Best: 0.7372727272727273 Left: 388
Score: 0.7354545454545455 Current_Best: 0.7372727272727273 Left: 387
Score: 0.7354545454545455 Current_Best: 0.7372727272727273 Left: 386
Score: 0.7354545454545455 Current_Best: 0.7372727272727273 Left: 385
Score: 0.7354545454545455 Current_Best: 0.7372727272727273 Left: 384
Score: 0.7354545454545455 Current_Best: 0.7372727272727273 Left: 383
Score: 0.7354545454545455 Current_Best: 0.7372727272727273 Left: 382
Score: 0.7354545454545455 Current_Best: 0.7372727272727273 Left: 381
Score: 0.7354545454545455 Current_Best: 0.7372727272727273 Left: 380
Score: 0.7354545454545455 Current_Best: 0.7372727272727273 Left: 379
Score: 0.7354545454545455 Current_Best: 0.7372727272727273 Left: 378
Score: 0.7363636363636363 Current_Best: 0.7372727272727273 Left: 377
Score: 0.7363636363636363 Current_Best: 0.7372727272727273 Left: 376
Score: 0.7363636363636363 Current_Best: 0.7372727272727273 Left: 375
Score: 0.7363636363636363 Current_

Score: 0.7390909090909091 Current_Best: 0.7390909090909091 Left: 269
Score: 0.7390909090909091 Current_Best: 0.7390909090909091 Left: 268
Score: 0.7390909090909091 Current_Best: 0.7390909090909091 Left: 267
Score: 0.7390909090909091 Current_Best: 0.7390909090909091 Left: 266
Score: 0.7390909090909091 Current_Best: 0.7390909090909091 Left: 265
Score: 0.7390909090909091 Current_Best: 0.7390909090909091 Left: 264
Score: 0.7390909090909091 Current_Best: 0.7390909090909091 Left: 263
Score: 0.7390909090909091 Current_Best: 0.7390909090909091 Left: 262
Score: 0.7390909090909091 Current_Best: 0.7390909090909091 Left: 261
Score: 0.7390909090909091 Current_Best: 0.7390909090909091 Left: 260
Score: 0.7390909090909091 Current_Best: 0.7390909090909091 Left: 259
Score: 0.7390909090909091 Current_Best: 0.7390909090909091 Left: 258
Score: 0.7390909090909091 Current_Best: 0.7390909090909091 Left: 257
Score: 0.7390909090909091 Current_Best: 0.7390909090909091 Left: 256
Score: 0.7390909090909091 Current_

Score: 0.74 Current_Best: 0.7427272727272727 Left: 143
Score: 0.74 Current_Best: 0.7427272727272727 Left: 142
Score: 0.74 Current_Best: 0.7427272727272727 Left: 141
Score: 0.74 Current_Best: 0.7427272727272727 Left: 140
Score: 0.74 Current_Best: 0.7427272727272727 Left: 139
Score: 0.74 Current_Best: 0.7427272727272727 Left: 138
Score: 0.74 Current_Best: 0.7427272727272727 Left: 137
Score: 0.74 Current_Best: 0.7427272727272727 Left: 136
Score: 0.74 Current_Best: 0.7427272727272727 Left: 135
Score: 0.74 Current_Best: 0.7427272727272727 Left: 134
Score: 0.74 Current_Best: 0.7427272727272727 Left: 133
Score: 0.74 Current_Best: 0.7427272727272727 Left: 132
Score: 0.740909090909091 Current_Best: 0.7427272727272727 Left: 131
Score: 0.740909090909091 Current_Best: 0.7427272727272727 Left: 130
Score: 0.740909090909091 Current_Best: 0.7427272727272727 Left: 129
Score: 0.740909090909091 Current_Best: 0.7427272727272727 Left: 128
Score: 0.740909090909091 Current_Best: 0.7427272727272727 Left: 127


Score: 0.74 Current_Best: 0.7436363636363637 Left: 14
Score: 0.74 Current_Best: 0.7436363636363637 Left: 13
Score: 0.74 Current_Best: 0.7436363636363637 Left: 12
Score: 0.74 Current_Best: 0.7436363636363637 Left: 11
Score: 0.74 Current_Best: 0.7436363636363637 Left: 10
Score: 0.74 Current_Best: 0.7436363636363637 Left: 9
Score: 0.74 Current_Best: 0.7436363636363637 Left: 8
Score: 0.74 Current_Best: 0.7436363636363637 Left: 7
Score: 0.74 Current_Best: 0.7436363636363637 Left: 6
Score: 0.74 Current_Best: 0.7436363636363637 Left: 5
Score: 0.74 Current_Best: 0.7436363636363637 Left: 4
Score: 0.7390909090909091 Current_Best: 0.7436363636363637 Left: 3
Score: 0.7390909090909091 Current_Best: 0.7436363636363637 Left: 2
Score: 0.7390909090909091 Current_Best: 0.7436363636363637 Left: 1


In [33]:
# Report the C value selected on the dev split and the dev accuracy it reached.
print('Best alpha: ' + str(best_alpha) + '     best_score: ' + str(best_score))

# Refit with exactly the settings used during the search (including
# max_iter = 1000), so the model scored on the test split is the same model
# that was selected on the dev split.
clf = LogisticRegression(random_state=0, solver='liblinear', C = best_alpha, max_iter = 1000, dual = False, penalty = 'l1')
clf.fit(X_train, y_train)

# Accuracy on the held-out test split.
y_pred = clf.predict(X_test)
score = metrics.accuracy_score(y_test, y_pred)
print('Score: ' + str(score))

Best alpha: 0.9189197297297298     best_score: 0.7436363636363637
Score: 0.7447963800904978


In [None]:
# valid_score: 0.738, test_score: 0.747, solver = 'lbfgs', dual = False, best_alpha = 0.4747527
# valid_score: 0.738, test_score: 0.748, solver = 'newton-cg', dual = False, best_alpha = 0.4747527
# valid_score: 0.738, test_score: 0.748, solver = 'sag', dual = False, best_alpha = 0.4747527