In [1]:
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import json

In [84]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)

In [2]:
def read_data_json(filename):
    with open(filename, 'r') as f:
        return json.load(f)

In [3]:
dataset = read_data_json('../data/final_combined_replicate.json')
ids = list(dataset.keys())
print(len(ids))

32532


In [4]:
valid_ids = read_data_json('../data/valid_ids.json')
print(len(valid_ids))

6506


In [5]:
test_ids = []
with open('../data/id_ans_test', 'r') as f:
    for line in f:
        test_id = line.strip().split('\t')[0]
        test_ids.append(test_id)
print(len(test_ids))

6506


In [6]:
train_ids = []
for i in ids:
    if i not in test_ids and i not in valid_ids:
        train_ids.append(i)
print(len(train_ids))

21848


In [91]:
dataset['0']

{'text': '镇海 雅乐 学校 二年级 的 小朋友 到 一条 小路 的 一边 植树 ． 小朋友 们 每隔 temp_a 米 种 一棵树 （ 马路 两头 都 种 了 树 ） ， 最后 发现 一共 种 了 temp_b 棵 ， 这 条 小路 长 多少 米 ．',
 'target_template': ['x', '=', '(', 'temp_b', '-', '1', ')', '*', 'temp_a'],
 'gen_template': [],
 'num_list': ['2', '11'],
 'ans': '20',
 'numtemp_order': ['temp_a', 'temp_b'],
 'num_position': [16, 34],
 'post_template': ['temp_b', '1', '-', 'temp_a', '*']}

In [7]:
def get_x_y(ids):
    x = []
    y = []
    for id in ids:
        x.append(dataset[id]['text'])
        if dataset[id]['ans'][-1] == 'e':
            y.append(dataset[id]['ans'][:-1])
        else:
            y.append(dataset[id]['ans'].split(';')[0])
    return x,y

In [43]:
x_train, y_train = get_x_y(train_ids)
y_train = np.array(y_train)

In [44]:
x_test, y_test = get_x_y(test_ids)
y_test = np.array(y_test)
x_val, y_val = get_x_y(valid_ids)
y_val = np.array(y_val)

In [10]:
def get_accuracy(y1, y2):
    acc = 0
    for i in range(len(y1)):
        gold_ans = y1[i]
        pred_ans = y2[i]
        if ';' in gold_ans:
            anss = gold_ans.split(';')
            ans1 = anss[0]
            ans2 = anss[1]
            if abs(float(pred_ans)-float(ans1)) < 1e-5 or abs(float(pred_ans)-float(ans2)) < 1e-5:
                acc += 1
        else:
            if abs(float(pred_ans)-float(gold_ans)) < 1e-5:
                acc += 1
    return acc/len(y1)

In [45]:
vectorizer = TfidfVectorizer(input='content', #TfidfVectorizer
                             analyzer='word',
                             strip_accents='ascii',
                             ngram_range=(1,1),
                           #  stop_words='english',
                            # min_df=2
                            )
x_train = vectorizer.fit_transform(x_train)


In [15]:
# scale_X = StandardScaler(with_mean = False)
scale_y = StandardScaler(with_mean = False)

# x = scale_X.fit_transform(x)
y_train = scale_y.fit_transform(y_train.reshape(-1, 1))

  sqr = np.multiply(arr, arr, out=arr)


In [16]:

lr = LinearRegression().fit(x, y_train)

The exact solution is  x = 0                              


In [17]:
y_pred = lr.predict(x)
print(get_accuracy(y_train, y_pred))


1.0


  


In [18]:
print("---------------Val metrics------------------------")
x = vectorizer.transform(x_val)
# x = scale_X.transform(x)
y_val = scale_y.transform(np.array(y_val).reshape(-1,1))
y_pred = lr.predict(x)
print(get_accuracy(y_val, y_pred))

---------------Val metrics------------------------
1.0


  


In [None]:




print("---------------Test metrics------------------------")
x = vectorizer.transform(x_test)
x = scale_X.transform(x)
y_pred = lr.predict(x)
y_test = scale_y.transform(np.array(y_test).reshape(-1,1))
print(get_accuracy(y_test, y_pred))

In [22]:
from sklearn.svm import SVR
svr = SVR(kernel='rbf', gamma='auto')
svr.fit(x, y_train)


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [23]:
y_pred = svr.predict(x)
print(get_accuracy(y_train, y_pred))

0.0


In [24]:
print("---------------Val metrics------------------------")
x = vectorizer.transform(x_val)
y_pred = lr.predict(x)
print(get_accuracy(y_val, y_pred))

---------------Val metrics------------------------
0.006148170919151552


In [28]:
from sklearn.tree import DecisionTreeRegressor

tree_regressor = DecisionTreeRegressor(random_state = 0)
tree_regressor.fit(x, y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=0, splitter='best')

In [29]:
y_pred = tree_regressor.predict(x)
print(get_accuracy(y_train, y_pred))

0.00027462467960454045


In [30]:
print("---------------Val metrics------------------------")
x = vectorizer.transform(x_val)
y_pred = tree_regressor.predict(x)
print(get_accuracy(y_val, y_pred))

---------------Val metrics------------------------
0.0


In [31]:
print("---------------Test metrics------------------------")
x = vectorizer.transform(x_test)
# x = scale_X.transform(x)
y_pred = lr.predict(x)
# y_test = scale_y.transform(np.array(y_test).reshape(-1,1))
print(get_accuracy(y_test, y_pred))

---------------Test metrics------------------------
0.007070396557024286


In [36]:
from sklearn.ensemble import RandomForestRegressor

In [41]:
forest_regressor = RandomForestRegressor(n_estimators = 300, random_state = 0)
forest_regressor.fit(x, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=300, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [42]:
y_pred = forest_regressor.predict(x)
print(get_accuracy(y_train, y_pred))

0.0


In [None]:
print("---------------Val metrics------------------------")
x_val = vectorizer.transform(x_val)
y_pred = tree_regressor.predict(x_val)
print(get_accuracy(y_val, y_pred))

In [None]:
print("---------------Test metrics------------------------")
x_test = vectorizer.transform(x_test)
# x = scale_X.transform(x)
y_pred = lr.predict(x_test)
# y_test = scale_y.transform(np.array(y_test).reshape(-1,1))
print(get_accuracy(y_test, y_pred))

In [46]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,AdaBoostRegressor

In [None]:
# import xgboost as xgb
# rfr = RandomForestRegressor(n_estimators=200, random_state=r)
#  xgbr = xgb.XGBRegressor(objective="reg:linear", random_state=r)

In [51]:
gbr = AdaBoostRegressor(random_state=0, n_estimators= 100)
gbr.fit(x_train, y_train)

UFuncTypeError: ufunc 'subtract' did not contain a loop with signature matching types (dtype('<U32'), dtype('<U32')) -> dtype('<U32')