### Build training data

In [14]:
import random
# Seed the module-level RNG so the shuffle below -- and therefore the
# train/dev/test split derived from it -- is the same on every fresh run.
random.seed(42)

rating = [1, 2, 3, 4, 5]
total_data = []
for r in rating:
    # One input file per star rating; every line becomes one (text, rating) pair.
    filename = 'data/Office_Products_5.json.gz_' + str(r) + '.0.txt'
    with open(filename, 'r') as f:
        for line in f:
            total_data.append((line.strip('\n'), r))
random.shuffle(total_data)

In [15]:
# Cut the shuffled examples into 70% train / 15% dev / 15% test.
n_total = len(total_data)
train_end = int(0.7 * n_total)
dev_end = int(0.85 * n_total)

train_data = total_data[:train_end]
dev_data = total_data[train_end:dev_end]
test_data = total_data[dev_end:]

In [16]:
# Show the number of examples in the train, dev and test splits, in that order.
for split in (train_data, dev_data, test_data):
    print(len(split))

503099
107807
107808


In [17]:
# Write each split as "<text>\t<rating>" lines, one example per line.
for path, split in (
    ('rating_train.tsv', train_data),
    ('rating_dev.tsv', dev_data),
    ('rating_test.tsv', test_data),
):
    with open(path, 'w') as f:
        for text, stars in split:
            f.write('\t'.join((text, str(stars))) + '\n')

### Generate train, dev and test data with a more balanced label count

In [22]:
# Bucket the training examples: 4-star, 5-star, and everything else.
r4 = [d for d in train_data if d[1] == 4]
r5 = [d for d in train_data if d[1] == 5]
r_other = [d for d in train_data if d[1] != 4 and d[1] != 5]

# Keep every other-rated example, but at most 50000 each of 4- and 5-star,
# then shuffle so the kept examples are not grouped by bucket.
b_train_data = r_other + r4[:50000] + r5[:50000]
random.shuffle(b_train_data)

In [23]:
# Number of examples in the balanced training set.
print(len(b_train_data))

164533


In [25]:
# Bucket the test examples: 4-star, 5-star, and everything else.
tr4 = [d for d in test_data if d[1] == 4]
tr5 = [d for d in test_data if d[1] == 5]
tr_other = [d for d in test_data if d[1] != 4 and d[1] != 5]

# Keep every other-rated example, but at most 10000 each of 4- and 5-star,
# then shuffle so the kept examples are not grouped by bucket.
b_test_data = tr_other + tr4[:10000] + tr5[:10000]
random.shuffle(b_test_data)

In [26]:
# Number of examples in the balanced test set.
print(len(b_test_data))

33746


In [27]:
# Bucket the dev examples: 4-star, 5-star, and everything else.
dr4 = [d for d in dev_data if d[1] == 4]
dr5 = [d for d in dev_data if d[1] == 5]
dr_other = [d for d in dev_data if d[1] != 4 and d[1] != 5]

# Keep every other-rated example, but at most 10000 each of 4- and 5-star,
# then shuffle so the kept examples are not grouped by bucket.
b_dev_data = dr_other + dr4[:10000] + dr5[:10000]
random.shuffle(b_dev_data)

In [28]:
# Write each balanced split as "<text>\t<rating>" lines, one example per line.
for path, split in (
    ('balanced_rating_train.tsv', b_train_data),
    ('balanced_rating_dev.tsv', b_dev_data),
    ('balanced_rating_test.tsv', b_test_data),
):
    with open(path, 'w') as f:
        for text, stars in split:
            f.write('\t'.join((text, str(stars))) + '\n')

### Fit a linear regression model

In [29]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer

def _read_labeled_tsv(path):
    """Read a "<text>\\t<rating>" file into parallel lists.

    The line is split on its LAST tab only, so a tab inside the text does
    not break the two-field unpacking; the rating is always the final field.

    Returns:
        (sentences, ratings): list of str and list of int, in file order.

    Raises:
        ValueError: if a line has no tab or its last field is not an integer.
    """
    sentences = []
    ratings = []
    with open(path, 'r') as f:
        for line in f:
            sent, label = line.strip('\n').rsplit('\t', 1)
            sentences.append(sent)
            ratings.append(int(label))
    return sentences, ratings


corpus, labels = _read_labeled_tsv('balanced_rating_train.tsv')
test_corpus, test_labels = _read_labeled_tsv('balanced_rating_test.tsv')

In [30]:
# Learn the bag-of-words vocabulary from the TRAINING text only, so nothing
# about the test set leaks into feature extraction. Test-only words are
# simply ignored by transform().
vectorizer = CountVectorizer()
vectorizer.fit(corpus)

# Vectorize train and test together so X keeps the layout the following
# cell slices: the first len(corpus) rows are train, the rest are test.
X = vectorizer.transform(corpus + test_corpus)

In [31]:
# X stacks the training rows first, then the test rows; split at that boundary.
train_X, test_X = X[:len(corpus)], X[len(corpus):]

In [32]:
# Fit ordinary least squares from the count features to the integer ratings.
reg = LinearRegression()
reg.fit(train_X, labels)

In [33]:
# Predict a rating for every row of the test feature matrix.
pred = reg.predict(test_X)

In [12]:
def MSE(predictions, labels):
    """Return the mean squared error between predictions and labels.

    The two sequences are paired element-wise with zip(), so if their
    lengths differ the extra items of the longer one are ignored.
    Raises ZeroDivisionError when there are no pairs to average.
    """
    squared_errors = []
    for predicted, actual in zip(predictions, labels):
        squared_errors.append((predicted - actual) ** 2)
    return sum(squared_errors) / len(squared_errors)

In [34]:
# Mean squared error of the predicted ratings against the test labels.
print(MSE(pred, test_labels))

1.5040725130698294


### Proportion of each rating

In [21]:
# Tally how many training examples carry each rating from 1 to 5.
rating_counts = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
for d in train_data:
    if d[1] in rating_counts:
        rating_counts[d[1]] += 1
c1, c2, c3, c4, c5 = (rating_counts[star] for star in (1, 2, 3, 4, 5))

# Print each rating's share of the training split, rating 1 first.
for count in (c1, c2, c3, c4, c5):
    print(count / len(train_data))

0.037861335442924755
0.028793537653622844
0.06161610339118146
0.14883551746276577
0.7228935060495052


In [24]:
# Tally how many test examples carry each rating from 1 to 5.
rating_counts = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
for d in test_data:
    if d[1] in rating_counts:
        rating_counts[d[1]] += 1
c1, c2, c3, c4, c5 = (rating_counts[star] for star in (1, 2, 3, 4, 5))

# Print each rating's share of the test split, rating 1 first.
for count in (c1, c2, c3, c4, c5):
    print(count / len(test_data))

0.03831812110418522
0.028393069159988127
0.060793262095577325
0.1492653606411398
0.7232301869991096
