In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file below the Kaggle input mount and print its full path.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Read the house-sales CSV into a DataFrame and preview its first rows.
DATA_PATH = '../input/housesalesprediction/kc_house_data.csv'
house_df = pd.read_csv(DATA_PATH)
house_df.head()

In [None]:
# Count missing values per column (isna is the pandas alias of isnull).
house_df.isna().sum()

In [None]:
# Print a structural summary of the dataset (columns, dtypes, non-null counts).
house_df.info()

In [None]:
# Show summary statistics for the dataset's columns.
house_df.describe()

**Exploratory Data Analysis**

In [None]:
# Price against bedroom count, one point per house.
fig, ax = plt.subplots(figsize=(12, 10))
sns.scatterplot(data=house_df, x='bedrooms', y='price', color='teal', ax=ax)
plt.show()

In [None]:
# The scatter plot shows a house listed with more than 30 bedrooms;
# display the matching row(s) to inspect them.
more_than_30_bedrooms = house_df['bedrooms'] > 30
house_df.loc[more_than_30_bedrooms]

In [None]:
# The flagged listing has sqft_lot = 6000, about 1620 sqft of living space and 33 bedrooms,
# which is practically not possible, so that row (not a column) is removed.
# The row is selected by condition instead of the hard-coded label 15870 and the frame is
# rebound instead of mutated in place, so re-running this cell is a no-op rather than a KeyError.
implausible_rows = house_df.index[house_df['bedrooms'] > 30]
house_df = house_df.drop(index=implausible_rows)

In [None]:
# House price against bedrooms once more, this time as a bar plot
# (seaborn aggregates price per bedroom count).
fig, ax = plt.subplots(figsize=(12, 10))
sns.barplot(data=house_df, x='bedrooms', y='price', color='teal', ax=ax)
plt.show()

In [None]:
# Histogram of the `floors` column (how many houses fall in each floor-count bin).
# The earlier comment described this as "price and bathroom", but the column plotted is
# `floors` and price is not involved; the figure is now labelled so it stands on its own.
fig, ax = plt.subplots(figsize=(12, 10))
ax.hist(house_df['floors'])
ax.set(xlabel='floors', ylabel='number of houses', title='Distribution of floors')
plt.show()

In [None]:
# NOTE(review): the preceding histogram is drawn from `floors`, so the original remark
# "most of the houses have 1 bathroom" presumably describes floors - verify against that plot.
# Price against bathroom count as a bar plot.
fig, ax = plt.subplots(figsize=(12, 10))
sns.barplot(data=house_df, x='bathrooms', y='price', color='olive', ax=ax)
plt.show()
# Author's observation: the output looks left skewed.

In [None]:
# Living area (sqft_living) against price.
fig, ax = plt.subplots(figsize=(12, 10))
ax.scatter(house_df['sqft_living'], house_df['price'], color='green')
plt.show()
# Author's observation: the maximum concentration of prices sits at the lower end.

In [None]:
from sklearn.model_selection import train_test_split

# Predictor columns fed to the model; `price` is the target.
FEATURE_COLUMNS = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront',
    'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built',
    'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15',
]
X = house_df[FEATURE_COLUMNS]
y = house_df['price']

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

separator = "*****"*10
print(f'Total # of sample in whole dataset: {len(X)}')
print(separator)
print(f'Total # of sample in train dataset: {len(X_train)}')
print(f'Shape of X_train: {X_train.shape}')
print(separator)
print(f'Total # of sample in test dataset: {len(X_test)}')
print(f'Shape of X_test: {X_test.shape}')

In [None]:
# Linear Regression: create the (not yet fitted) estimator.
# fit_intercept=True makes the model learn an intercept term in addition to the coefficients.
from sklearn.linear_model import LinearRegression
lr = LinearRegression(fit_intercept = True)

In [None]:
# Fit the linear model on the training split.
lr.fit(X_train, y_train)

In [None]:
# Predict prices for the held-out test features; `pred` is reused by the evaluation cells below.
pred = lr.predict(X_test)

In [None]:
# Results: score the fitted model on both splits (LinearRegression.score returns R^2)
# and report each value multiplied by 100.
train_score, test_score = (
    lr.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)
print(f'Train score of trained model: {train_score*100}')
print(f'Test score of trained model: {test_score*100}')

In [None]:
# Viewing outputs: actual test prices (x) against predicted prices (y) with a fitted
# regression line drawn in red.
# sns.jointplot is a figure-level function that builds its own figure, so a preceding
# plt.figure(...) only opens an extra blank figure and its figsize is not applied.
# The plot size is therefore set through `height` (the grid is square).
sns.set_theme(style="white")
joint_grid = sns.jointplot(x=y_test, y=pred, kind='reg', height=10,
                           line_kws={"color": "red"})
# pred is a plain array and carries no name, so label both axes explicitly.
joint_grid.set_axis_labels('Actual price', 'Predicted price')
plt.show()

In [None]:
## EVALUATION OF THE MODEL ##

In [None]:
# Mean Squared Error, Root Mean Squared Error, Mean Absolute Error and
# Mean Absolute Percentage Error of the test predictions.
import sklearn.metrics
from sklearn.metrics import mean_squared_error,mean_absolute_error

# Compute the MSE once and derive the RMSE from it.
mse = mean_squared_error(y_test, pred)
print("Mean Squared Error:",mse)
print()
print("Root Mean Squared Error:",np.sqrt(mse))
print()
print("Mean Absolute Error:",mean_absolute_error(y_test, pred))
print()
# MAPE: mean of |actual - predicted| / actual, expressed in percent.
print("Mean Absolute Percentage Error:",np.mean(np.abs( (y_test-pred) / y_test))*100)
# The MSE is expressed in squared price units, which is why its magnitude is so large;
# RMSE and MAE are in the same units as price and are the ones to read against it.

In [None]:
# Coefficient of determination (R^2) of the predictions on the test split.
from sklearn.metrics import r2_score
test_r2 = r2_score(y_test, pred)
print("R^2:", test_r2)

In [None]:
# Adjusted R^2 on the test split:
#     adj R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
# where n is the number of samples and p the number of predictors.
import sklearn.metrics as m
n=X_test.shape[0]
# X_test holds only the feature columns (the target `price` is kept separately in y),
# so every column is a predictor and nothing must be subtracted from the column count.
p=X_test.shape[1]
R2 = m.r2_score(y_test, pred)

adj_rsquared = 1 - (1 - R2) * ((n - 1)/(n-p-1))
print('Adjusted R Squared: {}'.format(adj_rsquared))

In [None]:
# Write the current house_df (after the outlier row was dropped) to the working directory,
# without the index column.
# NOTE(review): this file holds the house table itself, not the model's predictions -
# confirm that this is what "submission" is meant to contain.
SUBMISSION_PATH = 'submission.csv'
house_df.to_csv(SUBMISSION_PATH, index=False)