In [None]:
# Linear algebra
import numpy as np

# Data
import pandas as pd

# Visualization
import matplotlib.pyplot as plt

# Model building and helper libraries
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score
from sklearn import metrics
from sklearn import model_selection, tree, linear_model
from scipy.stats import pearsonr
import math

In [None]:
# Load the house-sales CSV into a DataFrame
csv_path = '../input/housesalesprediction/kc_house_data.csv'
data = pd.read_csv(csv_path)

In [None]:
# Dataset overview: row count, column count, then the distinct column dtypes
n_rows, n_cols = data.shape
print(n_rows)
print(n_cols)
print(data.dtypes.unique())

In [None]:
# Predictors (features): the labels of every column from position 3 onward
features = data.columns[3:].tolist()
# Response (target): the label of the column at position 2
target = data.columns[2]

In [None]:
# Pearson correlation coefficient of each feature against the target,
# stored under the key '<feature> vs <target>'
target_values = data[target].values
correlations = {
    name + ' vs ' + target: pearsonr(data[name].values, target_values)[0]
    for name in features
}

In [None]:
# One row per feature/target pair with a single 'Value' column, displayed
# from the largest to the smallest absolute correlation
data_correlations = pd.Series(correlations, name='Value').to_frame()
strongest_first = data_correlations['Value'].abs().sort_values(ascending=False).index
data_correlations.loc[strongest_first]

In [None]:
# Living area, grade and target as an array whose rows are ordered by
# ascending target; x is each row's position in that ordering
plot_columns = ['sqft_living', 'grade', target]
ranked = data[plot_columns].sort_values(by=target)
y = ranked.to_numpy()
x = np.arange(len(y))

In [None]:
%matplotlib inline
plt.subplot(3,1,1)
plt.plot(x,y[:,0])
plt.title('Sqft and Grade vs Price')
plt.ylabel('Sqft')

plt.subplot(3,1,2)
plt.plot(x,y[:,1])
plt.ylabel('Grade')

plt.subplot(3,1,3)
plt.plot(x,y[:,2],'r')
plt.ylabel("Price")

plt.show()

In [None]:
# Columns used as model inputs
model_columns = ['sqft_living', 'grade', 'sqft_above', 'sqft_living15', 'bathrooms', 'view',
                 'sqft_basement', 'lat', 'waterfront', 'yr_built', 'bedrooms']
new_data = data.loc[:, model_columns]

# Unfitted linear regression estimator; this cell only creates it
regr = linear_model.LinearRegression()

In [None]:
# X: array of the selected input columns (independent variables)
# y: array of the price column (dependent variable)
X = new_data.to_numpy()
y = data['price'].to_numpy()

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs
split = train_test_split(X, y, random_state=4, test_size=0.2)
X_train, X_test, y_train, y_test = split

In [None]:
# Fit the regression on the training split
regr.fit(X=X_train, y=y_train)

In [None]:
# Run the fitted model on the held-out rows and print the predicted values
predictions = regr.predict(X=X_test)
print(predictions)

In [None]:
# Error metrics on the held-out set. scikit-learn metric functions take
# (y_true, y_pred) in that order; MSE and MAE happen to be symmetric, but
# keeping the documented order avoids wrong results if an asymmetric metric
# is added here later.
mse = metrics.mean_squared_error(y_test, predictions)
mae = metrics.mean_absolute_error(y_test, predictions)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
# RMSE reuses the MSE computed above instead of recomputing it
print(f'Root Mean Squared Error: {np.sqrt(mse)}')