In [30]:
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
import json

In [3]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)

In [4]:
def read_data_json(filename):
    with open(filename, 'r') as f:
        return json.load(f)

In [11]:
dataset = read_data_json('../data/final_combined_replicate.json')
ids = list(dataset.keys())
print(len(ids))

32532


In [13]:
valid_ids = read_data_json('../data/valid_ids.json')
print(len(valid_ids))

6506


In [15]:
test_ids = []
with open('../data/id_ans_test', 'r') as f:
    for line in f:
        test_id = line.strip().split('\t')[0]
        test_ids.append(test_id)
print(len(test_ids))

6506


In [16]:
train_ids = []
for i in ids:
    if i not in test_ids and i not in valid_ids:
        train_ids.append(i)
print(len(train_ids))

21848


In [10]:
dataset['0']

{'text': '镇海 雅乐 学校 二年级 的 小朋友 到 一条 小路 的 一边 植树 ． 小朋友 们 每隔 temp_a 米 种 一棵树 （ 马路 两头 都 种 了 树 ） ， 最后 发现 一共 种 了 temp_b 棵 ， 这 条 小路 长 多少 米 ．',
 'target_template': ['x', '=', '(', 'temp_b', '-', '1', ')', '*', 'temp_a'],
 'gen_template': [],
 'num_list': ['2', '11'],
 'ans': '20',
 'numtemp_order': ['temp_a', 'temp_b'],
 'num_position': [16, 34],
 'post_template': ['temp_b', '1', '-', 'temp_a', '*']}

In [19]:
def get_x_y(ids):
    x = []
    y = []
    for id in ids:
        x.append(dataset[id]['text'])
        y.append(dataset[id]['ans'])
    return x,y

In [28]:
x_train, y_train = get_x_y(train_ids)

In [29]:
x_test, y_test = get_x_y(test_ids)
x_val, y_val = get_x_y(valid_ids)

In [None]:
def get_accuracy(y1, y2):
    acc = 0
    for i in range(len(y1)):
        gold_ans = y1[i]
        pred_ans = y2[i]
        if ';' in gold_ans:
            anss = gold_ans.split(';')
            ans1 = anss[0]
            ans2 = anss[1]
            if abs(float(pred_ans)-float(ans1)) < 1e-5 or abs(float(pred_ans)-float(ans2)) < 1e-5:
                acc += 1
        else:
            if abs(float(pred_ans)-float(gold_ans)) < 1e-5:
                acc += 1
return acc/len(y1)

In [30]:
vectorizer = TfidfVectorizer(input='content',
                             analyzer='word',
                             strip_accents='ascii',
                             ngram_range=(1,3),
                           #  stop_words='english',
                            # min_df=2
                            )
x = vectorizer.fit_transform(x_train)

lr = LinearRegression().fit(x, y_train)

y_pred = lr.predict(x)
print(get_accuracy(y_train, y_pred))
print("---------------Val metrics------------------------")
x = vectorizer.transform(x_val)
y_pred = lr.predict(x)
print(get_accuracy(y_val, y_pred))
print("---------------Test metrics------------------------")
x = vectorizer.transform(x_test)
y_pred = lr.predict(x)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.79      0.86     40159
           1       0.56      0.83      0.67     10964
           2       0.91      0.87      0.89     13639
           3       0.47      0.91      0.62      2061

    accuracy                           0.82     66823
   macro avg       0.72      0.85      0.76     66823
weighted avg       0.86      0.82      0.83     66823

---------------Test metrics------------------------
              precision    recall  f1-score   support

           0       0.90      0.75      0.82     10128
           1       0.46      0.70      0.56      2644
           2       0.89      0.83      0.86      3450
           3       0.32      0.63      0.42       484

    accuracy                           0.76     16706
   macro avg       0.64      0.73      0.66     16706
weighted avg       0.81      0.76      0.77     16706



In [32]:
vectorizer = TfidfVectorizer(input='content',
                             analyzer='word',
                             strip_accents='ascii',
                             ngram_range=(1,1),
                             stop_words='english',
                             min_df=2)
x = vectorizer.fit_transform(x_train)
classifier = OneVsOneClassifier(LogisticRegression(class_weight='balanced', C=0.1, max_iter=2000), n_jobs=-1)
classifier.fit(x, y_train)
y_pred = classifier.predict(x)
print(classification_report(y_train, y_pred))
print("---------------Test metrics------------------------")
x = vectorizer.transform(x_test)
y_pred = classifier.predict(x)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.76      0.82     40159
           1       0.49      0.77      0.60     10964
           2       0.91      0.81      0.86     13639
           3       0.37      0.70      0.48      2061

    accuracy                           0.77     66823
   macro avg       0.67      0.76      0.69     66823
weighted avg       0.82      0.77      0.78     66823

---------------Test metrics------------------------
              precision    recall  f1-score   support

           0       0.89      0.74      0.81     10128
           1       0.45      0.72      0.55      2644
           2       0.90      0.79      0.84      3450
           3       0.30      0.62      0.40       484

    accuracy                           0.74     16706
   macro avg       0.63      0.72      0.65     16706
weighted avg       0.80      0.74      0.76     16706

