# Data Transformation
In this notebook we transform the cleaned data and build a linear regression model to predict price.

## Importing libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

%matplotlib inline

## Loading the data

In [None]:
# Read the cleaned dataset from the project's data directory into a DataFrame.
cleaned_csv_path = 'data/cleaned.csv'
df = pd.read_csv(cleaned_csv_path)

## Data transformation

In [None]:
# Derive the transformed columns used for modelling. The NumPy ufuncs are
# applied to whole columns at once rather than routed through Series.apply;
# the resulting values are the same.
# log10 is only finite for strictly positive inputs and sqrt only for
# non-negative ones -- assumes price and sqft_lot are > 0 and sqft_living
# is >= 0 in the cleaned data; TODO confirm.
df['log_price'] = np.log10(df['price'])
# NOTE: despite its name, 'ft_living' holds the square root of sqft_living.
df['ft_living'] = np.sqrt(df['sqft_living'])
df['log_sqft_lot'] = np.log10(df['sqft_lot'])

## Selecting variables for regression

In [None]:
# Response (log10 of price) and the predictor columns used by the regression.
output_variable = 'log_price'
input_variables = ['bathrooms', 'ft_living', 'log_sqft_lot']

# y is the response Series; X is the predictor DataFrame.
y = df[output_variable]
X = df.loc[:, input_variables]

## Making test-train split

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Show the row count of each of the four pieces.
print(*(len(part) for part in (X_train, X_test, y_train, y_test)))

## Visualizing selected variables

In [None]:
# Put the training predictors and the response side by side so the pairplot
# shows every pairwise relationship, including each predictor against the target.
train_frame = pd.concat([X_train, y_train], axis='columns')
sns.pairplot(train_frame)

## Building a model

In [None]:
# sm.OLS does not add an intercept on its own, so a constant column is
# prepended explicitly. Without it the fit is forced through the origin and
# the summary reports an uncentered R-squared, which also disagrees with the
# scikit-learn LinearRegression used for validation (that estimator fits an
# intercept by default).
X_train_const = sm.add_constant(X_train)
model = sm.OLS(y_train, X_train_const).fit()
# Predicting with `model` on new data requires the same constant column,
# e.g. model.predict(sm.add_constant(X_test)).
model.summary()

## Model validation

In [None]:
# Fit a scikit-learn linear regression on the training split.
linreg = LinearRegression().fit(X_train, y_train)

# Predictions for the rows the model was fitted on and for the held-out rows,
# so the two errors can be compared.
y_hat_train = linreg.predict(X_train)
y_hat_test = linreg.predict(X_test)

In [None]:
# Mean squared error on each split. The target is log_price (log10 of price),
# so both values are in squared log10(price) units, not in currency.
train_mse = mean_squared_error(y_train, y_hat_train)
test_mse = mean_squared_error(y_test, y_hat_test)
print('Train Mean Squared Error:', train_mse)
print('Test Mean Squared Error:', test_mse)