In [1]:
import psycopg2
import sys, os
import numpy as np
import pandas as pd
import pandas.io.sql as psql
from sqlalchemy import create_engine
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.svm import SVR
from IPython.display import display
from sklearn.metrics import mean_squared_error
from math import sqrt


svr_linear = SVR(kernel='linear', C=100, gamma='auto', degree=3, epsilon=.1,
               coef0=1, verbose=True)

In [2]:
url = 'postgresql://{user}:{passwd}@{host}:{port}/{db}'.format(
         user="cse6242", passwd="cse6242", host="localhost", port=5432, db="cse6242")

engine = create_engine(url, pool_size=50)

engine.connect()
connection = engine.connect() 
query = "SELECT county_id, home_type_id, year_month, index_value from county_timeseries LIMIT 100000"

df = pd.read_sql(query, con=connection)

df.rename(columns = {'year_month':'year'}, inplace = True)
df[['year','month']] = df['year'].str.split('-',expand=True)

cols = df.columns.tolist() 
new_cols = [x for x in cols if x != cols[-2]] + [cols[-2]]
df = df[new_cols]
df.fillna(method="ffill", inplace=True)
df.fillna(method="bfill", inplace=True)
random_state = 100
x_data = df.loc[:, df.columns != "index_value"]
y_data = df.loc[:, "index_value"]

In [3]:
# df, x_vals, y_vals

random_state = 100

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25, shuffle=True, random_state=random_state)

In [4]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((75000, 4), (25000, 4), (75000,), (25000,))

In [5]:
y_train.to_frame()

Unnamed: 0,index_value
17853,60400.0
55890,60600.0
89462,159200.0
1921,179100.0
27367,83700.0
...,...
65615,89900.0
77655,151900.0
79683,67200.0
56088,105000.0


In [6]:
# x_train.iloc("131066"), y_train.iloc("131066")



def standardize(data):   
    scaler = StandardScaler()
    scaler.fit(data)
    return scaler.transform(data)


# y_train = y_train.to_frame()
# y_test = y_test.to_frame()
x_train_norm = standardize(x_train)
# y_train_norm = standardize(y_train.values.reshape(-1, 1))
x_test_norm = standardize(x_test)
# y_test_norm = standardize(y_test.values.reshape(-1, 1))

# y_train = y_train.to_frame()
# y_train.

# y_train.fit_transform(y_train.reshape(-1, 1))
print(x_train_norm)
print(x_test_norm)
print(y_train)
print(y_test)
# print(y_train_norm)
# x_train_norm = x_train / np.linalg.norm(x_train)
# x_test_norm = x_test / np.linalg.norm(x_test)
# y_train_norm = y_train / np.linalg.norm(y_train)
# y_test_norm = y_test / np.linalg.norm(y_test)

[[-0.61104215  0.          1.10479762 -1.60842962]
 [-0.57662213  0.          0.66301523 -0.73247617]
 [ 1.78545196  0.          0.81027602 -0.1485072 ]
 ...
 [ 2.12346734  0.         -1.54589671  0.14347728]
 [ 1.57355369  0.         -0.36781035 -0.73247617]
 [ 1.87338436  0.          0.66301523 -0.1485072 ]]
[[-0.61063865  0.         -1.69574308 -0.72294063]
 [-0.60795995  0.          0.95196164  1.3225664 ]
 [-0.62001408  0.         -0.37189072  1.3225664 ]
 ...
 [-0.57019034  0.         -0.96026955  0.15370524]
 [-0.61921047  0.         -0.81317484  1.61478169]
 [ 0.61566838  0.          1.24615105 -1.3073712 ]]
17853     60400.0
55890     60600.0
89462    159200.0
1921     179100.0
27367     83700.0
           ...   
65615     89900.0
77655    151900.0
79683     67200.0
56088    105000.0
38408     73400.0
Name: index_value, Length: 75000, dtype: float64
22422     85200.0
25747    145400.0
13795    215200.0
68724    106300.0
38923    138300.0
           ...   
40043    108900.0
359

In [7]:
svr_linear.fit(x_train_norm, y_train)

[LibSVM]

SVR(C=100, cache_size=200, coef0=1, degree=3, epsilon=0.1, gamma='auto',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=True)

In [8]:
y_pred_train = svr_linear.predict(x_train_norm)

In [9]:
sqrt(mean_squared_error(y_train, y_pred_train))

75280.38055516942

In [10]:
# svr_linear.fit(x_test_norm, y_test)
y_pred_test = svr_linear.predict(x_test_norm)
print(sqrt(mean_squared_error(y_test, y_pred_test)))

76261.88583913233


In [11]:
display(y_pred_train)
display(y_pred_test)

array([130381.69456879, 123305.31107076, 101798.76712647, ...,
        60356.48895215,  84622.09099784,  98516.12708314])

array([ 85491.57460808, 129296.61655904, 107999.05781597, ...,
        97402.71110374, 100992.04750518, 120261.22920713])

In [12]:
y_hats_df = pd.DataFrame(data = y_pred_train, columns = ['y_pred'], index = x_train.index.copy())
df_out_svr_train = pd.merge(df, y_hats_df, how = 'left', left_index = True, right_index = True)

display(df_out_svr_train)

Unnamed: 0,county_id,home_type_id,year,month,index_value,y_pred
0,101,3,2007,10,164500.0,112779.163875
1,101,3,2007,11,163100.0,112920.570132
2,101,3,2007,12,161600.0,113061.976390
3,101,3,2008,01,159900.0,113889.296557
4,101,3,2008,02,158000.0,114030.702814
...,...,...,...,...,...,...
99995,435,3,1998,03,58300.0,
99996,435,3,1998,04,58600.0,89566.511420
99997,435,3,1998,05,58900.0,89707.917678
99998,435,3,1998,06,59000.0,89849.323935


In [13]:
y_hats_df = pd.DataFrame(data = y_pred_test, columns = ['y_pred'], index = x_test.index.copy())
df_out_svr_test = pd.merge(df, y_hats_df, how = 'left', left_index = True, right_index = True)

display(df_out_svr_test)

Unnamed: 0,county_id,home_type_id,year,month,index_value,y_pred
0,101,3,2007,10,164500.0,
1,101,3,2007,11,163100.0,
2,101,3,2007,12,161600.0,
3,101,3,2008,01,159900.0,
4,101,3,2008,02,158000.0,
...,...,...,...,...,...,...
99995,435,3,1998,03,58300.0,89441.400122
99996,435,3,1998,04,58600.0,
99997,435,3,1998,05,58900.0,
99998,435,3,1998,06,59000.0,


In [14]:
final_df_out = df_out_svr_train['y_pred'].combine_first(df_out_svr_test['y_pred'])

# display(final_df_out)
# 

y_hats_df = pd.DataFrame(data = final_df_out, columns = ['y_pred'], index = df.index.copy())
# display(y_hats_df)
final_df_out = pd.merge(df, y_hats_df, how = 'left', left_index = True, right_index = True)

display(final_df_out)

# y_hats_df = pd.DataFrame(data = y_pred, columns = ['y_pred'], index = df.index.copy())
# df_out_test_svr = pd.merge(df, y_hats_df, how = 'left', left_index = True, right_index = True)

# display(df_out_test_svr)


# final_df_out = df_out_train_svr['y_pred'].combine_first(df_out_test_svr['y_pred'])

# display(final_df_out)

Unnamed: 0,county_id,home_type_id,year,month,index_value,y_pred
0,101,3,2007,10,164500.0,112779.163875
1,101,3,2007,11,163100.0,112920.570132
2,101,3,2007,12,161600.0,113061.976390
3,101,3,2008,01,159900.0,113889.296557
4,101,3,2008,02,158000.0,114030.702814
...,...,...,...,...,...,...
99995,435,3,1998,03,58300.0,89441.400122
99996,435,3,1998,04,58600.0,89566.511420
99997,435,3,1998,05,58900.0,89707.917678
99998,435,3,1998,06,59000.0,89849.323935


In [15]:
# final_df_out = df_out_random_forest_train['y_pred'].combine_first(df_out_random_forest_test['y_pred'])

# display(final_df_out)

# y_hats_df = pd.DataFrame(data = final_df_out, columns = ['y_pred'], index = df.index.copy())
# display(y_hats_df)
# final_df_out = pd.merge(df, y_hats_df, how = 'left', left_index = True, right_index = True)

# display(final_df_out)