In [1]:
import string
# Read the training set: each line is "<sentence>\t<integer label>".
# sent_list collects the tokens kept for every sentence, labels the parsed labels.
sent_list, labels = [], []
with open('balance_train.tsv', 'r') as train_file:
    for raw_line in train_file:
        text, tag = raw_line.strip('\n').split('\t')
        labels.append(int(tag))
        # `in` on a str is a substring test: a token is dropped when it occurs
        # inside string.punctuation (single punctuation marks, and also '').
        sent_list.append(
            [tok for tok in text.split(' ') if tok not in string.punctuation]
        )

In [2]:
from collections import defaultdict
# Count how often every token occurs across all training sentences.
token_counts = defaultdict(int)
for tok in (t for tokens in sent_list for t in tokens):
    token_counts[tok] += 1

# `unigram` ends up as a list of (token, count) pairs, most frequent first.
unigram = list(token_counts.items())
unigram.sort(key=lambda pair: pair[1], reverse=True)

In [3]:
# Vocabulary: the first 1000 entries of the frequency-sorted unigram list
# (fewer if the list is shorter).
top1000_unigram = unigram[:1000]
uniwords = [token for token, _count in top1000_unigram]
# Position of each vocabulary word inside `uniwords`; used as its feature column.
uniwordId = {token: col for col, token in enumerate(uniwords)}

In [4]:
# Bag-of-words matrix for the training sentences: one row per sentence, one
# column per vocabulary word, each entry the number of times the word occurs.
train_X = []

for sent in sent_list:
    feat = [0] * len(uniwords)
    for word in sent:
        # Look the column up in the dict instead of scanning the `uniwords`
        # list for every token; out-of-vocabulary words are skipped.
        col = uniwordId.get(word)
        if col is not None:
            feat[col] += 1
    train_X.append(feat)

In [5]:
# Read the test set in the same "<sentence>\t<integer label>" layout.
# test_sent_list collects the tokens kept per sentence, test_labels the labels.
test_sent_list, test_labels = [], []
with open('balance_test.tsv', 'r') as test_file:
    for raw_line in test_file:
        text, tag = raw_line.strip('\n').split('\t')
        test_labels.append(int(tag))
        # Substring test against string.punctuation: drops tokens that occur
        # inside that string (single punctuation marks, and also '').
        kept = [tok for tok in text.split(' ') if tok not in string.punctuation]
        test_sent_list.append(kept)

In [6]:
# Bag-of-words matrix for the test sentences, using the vocabulary columns
# defined by `uniwords` / `uniwordId`.
test_X = []

for sent in test_sent_list:
    feat = [0] * len(uniwords)
    for word in sent:
        # Dict lookup instead of a linear scan of the `uniwords` list per token;
        # words outside the vocabulary contribute nothing.
        col = uniwordId.get(word)
        if col is not None:
            feat[col] += 1
    test_X.append(feat)

In [7]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier (constructor defaults) on the
# bag-of-words counts; fit() returns the fitted estimator.
bow_model = LogisticRegression()
clf = bow_model.fit(train_X, labels)



In [8]:
# Predict a label for every row of the bag-of-words test matrix.
pred = clf.predict(test_X)

In [9]:
# Accuracy: share of test rows whose predicted label equals the true label.
correct = sum(1 for truth, guess in zip(test_labels, list(pred)) if truth == guess)
acc = correct / len(test_labels)
print(acc)

0.8271887725421757


### Add sentiment-word features (counts of the top-10 positive and negative words)

In [10]:
# First row of the fitted coefficient matrix: one weight per feature column.
# (Assumes a binary problem, where coef_ has a single row — TODO confirm.)
# NOTE: `clf` is rebound by later cells, so this must run before they do.
theta = clf.coef_[0]

In [11]:
# Pair every learned coefficient with the vocabulary word of the same column.
# No extra 'constant_feat' label is appended: the feature rows contain no
# constant column (the intercept is kept in clf.intercept_, not in coef_), so
# such a label could never line up with a real coefficient.
weights = list(zip(theta, uniwords))

In [12]:
# Order the (coefficient, word) pairs ascending by coefficient
# (ties fall back to comparing the words).
weights = sorted(weights)

In [13]:
# First ten pairs of the ascending-sorted list: the most negative coefficients.
print(weights[:10])

[(-2.336391963979517, 'poor'), (-2.2532475675410786, 'worst'), (-2.2213996861710847, 'unus'), (-2.1981681125821213, 'unfortun'), (-2.1750164675562154, 'cheapli'), (-2.166248433666351, 'useless'), (-2.0791677960710104, 'horribl'), (-2.03688669795009, 'disappoint'), (-2.032843007799819, 'return'), (-1.8454287599999812, 'terribl')]


In [14]:
# Last ten pairs of the ascending-sorted list: the largest coefficients.
print(weights[-10:])

[(1.427200113665973, 'great'), (1.4401345515465582, 'glad'), (1.5381626415730898, 'perfectli'), (1.5466560087928187, 'amaz'), (1.6884581208756244, 'beat'), (1.7362669972592484, 'fantast'), (1.8027898034985426, 'perfect'), (1.8748171641454414, 'awesom'), (1.8820758390998003, 'highli'), (1.88518610059358, 'excel')]


In [15]:
# Words carrying the ten lowest and the ten highest coefficients in the
# ascending-sorted `weights` list.
negativeWords = [word for _coef, word in weights[:10]]
positiveWords = [word for _coef, word in weights[-10:]]

In [16]:
# Slot of each selected word inside its own count vector.
posId = {word: slot for slot, word in enumerate(positiveWords)}
negId = {word: slot for slot, word in enumerate(negativeWords)}

In [17]:
# Extend every training row with sentiment-word features, laid out as:
#   bag-of-words | positive counts | positive coverage | negative counts | negative coverage
# where "coverage" is the fraction of the listed words present in the sentence.
new_train_X = []
for tokens, base_vec in zip(sent_list, train_X):
    pos_counts = [0] * len(positiveWords)
    neg_counts = [0] * len(negativeWords)
    for tok in tokens:
        # Two independent checks (not elif): each list is consulted on its own.
        if tok in posId:
            pos_counts[posId[tok]] += 1
        if tok in negId:
            neg_counts[negId[tok]] += 1
    # A listed word appeared at least once exactly when its count is non-zero.
    pos_frac = sum(1 for c in pos_counts if c > 0) / len(positiveWords)
    neg_frac = sum(1 for c in neg_counts if c > 0) / len(negativeWords)
    new_train_X.append(base_vec + pos_counts + [pos_frac] + neg_counts + [neg_frac])

In [18]:
# Extend every test row with the same sentiment-word features as the training rows:
#   bag-of-words | positive counts | positive coverage | negative counts | negative coverage
new_test_X = []
for tokens, base_vec in zip(test_sent_list, test_X):
    pos_counts = [0] * len(positiveWords)
    neg_counts = [0] * len(negativeWords)
    for tok in tokens:
        # Two independent checks (not elif): each list is consulted on its own.
        if tok in posId:
            pos_counts[posId[tok]] += 1
        if tok in negId:
            neg_counts[negId[tok]] += 1
    # Coverage = number of listed words seen at least once / size of the list.
    pos_frac = sum(1 for c in pos_counts if c > 0) / len(positiveWords)
    neg_frac = sum(1 for c in neg_counts if c > 0) / len(negativeWords)
    new_test_X.append(base_vec + pos_counts + [pos_frac] + neg_counts + [neg_frac])

In [19]:
# Refit the classifier on the rows extended with sentiment-word features.
# This rebinds `clf`, replacing the bag-of-words-only model.
sentiment_model = LogisticRegression()
clf = sentiment_model.fit(new_train_X, labels)



In [20]:
# Predict a label for every test row of the sentiment-extended feature matrix.
pred = clf.predict(new_test_X)

In [21]:
# Accuracy of the sentiment-extended model: matching predictions / test rows.
correct = sum(1 for truth, guess in zip(test_labels, list(pred)) if truth == guess)
acc = correct / len(test_labels)
print(acc)

0.8273342059336823


### Add average number of words per sentence

In [22]:
# One value per training line: tokens that are not '.', '?', '!' or ','
# divided by the number of tokens that are (treated as sentence boundaries).
# Lines with no boundary token count as a single sentence; an empty text gives 0.
train_avgword_count = []
with open('balance_train.tsv', 'r') as train_file:
    for raw_line in train_file:
        sent, _ = raw_line.strip('\n').split('\t')
        if sent == "":
            train_avgword_count.append(0)
            continue
        tokens = sent.split(' ')
        boundary_total = sum(1 for tok in tokens if tok in ('.', '?', '!', ','))
        word_total = len(tokens) - boundary_total
        # max(..., 1) supplies the "at least one sentence" fallback.
        train_avgword_count.append(word_total / max(boundary_total, 1))

In [23]:
# Append the words-per-sentence value as one more column on every training row.
new_train_X2 = [
    row + [avg_words]
    for avg_words, row in zip(train_avgword_count, new_train_X)
]

In [24]:
# One value per test line: tokens that are not '.', '?', '!' or ','
# divided by the number of tokens that are (treated as sentence boundaries).
# Lines with no boundary token count as a single sentence; an empty text gives 0.
test_avgword_count = []
with open('balance_test.tsv', 'r') as test_file:
    for raw_line in test_file:
        sent, _ = raw_line.strip('\n').split('\t')
        if sent == "":
            test_avgword_count.append(0)
            continue
        tokens = sent.split(' ')
        boundary_total = sum(1 for tok in tokens if tok in ('.', '?', '!', ','))
        word_total = len(tokens) - boundary_total
        # max(..., 1) supplies the "at least one sentence" fallback.
        test_avgword_count.append(word_total / max(boundary_total, 1))

In [25]:
# Append the words-per-sentence value as one more column on every test row.
new_test_X2 = [
    row + [avg_words]
    for avg_words, row in zip(test_avgword_count, new_test_X)
]

In [26]:
# Refit on the rows that also carry the words-per-sentence column.
# This rebinds `clf` once more.
length_model = LogisticRegression()
clf = length_model.fit(new_train_X2, labels)



In [27]:
# Predict a label for every test row of the matrix with the extra
# words-per-sentence column.
pred = clf.predict(new_test_X2)

In [28]:
# Accuracy of the model with the words-per-sentence column added.
correct = sum(1 for truth, guess in zip(test_labels, list(pred)) if truth == guess)
acc = correct / len(test_labels)
print(acc)

0.8263161721931356
