# import libraries and load data

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Read the house-sales CSV; the 'date' column is parsed as datetime on load.
data = pd.read_csv(
    '/kaggle/input/housesalesprediction/kc_house_data.csv',
    parse_dates=['date'],
)

* The id column is not required, as a row index is already available

In [None]:
# Discard the 'id' column, then preview the first rows of the result.
data = data.drop(columns=['id'])
data.head()

# Exploratory data analysis

* info about data 

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes).
data.info()

## null checking for data
* no null value in data

In [None]:
# Count the missing values in every column.
data.isnull().sum()

## unique value count

In [None]:
# Number of distinct values per column, shown as a bar chart and then as a
# full table (head() is asked for every row).
df = data.nunique().to_frame()
df.plot.bar()
plt.show()
df.head(df.shape[0])

In [None]:
# Descriptive statistics for the columns of the raw data.
data.describe()

## target value outlier check and remove

In [None]:
# Inspect the raw target: the box plot exposes extreme prices, the
# histogram with a KDE overlay shows the overall shape of the distribution.
sns.boxplot(data=data, x='price')
plt.show()
# sns.distplot is deprecated in seaborn; histplot with kde=True and
# stat='density' is the documented replacement for a density histogram + KDE.
sns.histplot(data=data, x='price', kde=True, stat='density')
plt.show()

* remove outlier values (prices of 2,000,000 and above)

In [None]:
# Keep only sales priced below 2,000,000. The explicit .copy() makes all_data
# an independent DataFrame, so the feature columns assigned to it later do not
# trigger SettingWithCopyWarning or depend on view-vs-copy behaviour.
all_data = data[data.price < 2000000].copy()

* target distribution after removing outliers

In [None]:
# Same two views of the target as before, now on the outlier-filtered data.
sns.boxplot(data=all_data, x='price')
plt.show()
# sns.distplot is deprecated in seaborn; histplot with kde=True and
# stat='density' is the documented replacement for a density histogram + KDE.
sns.histplot(data=all_data, x='price', kde=True, stat='density')
plt.show()

## correlation between features
* print correlation with target value

In [None]:
# Pairwise correlations over numeric/bool columns only. The datetime 'date'
# column is excluded explicitly: recent pandas versions no longer drop
# non-numeric columns silently inside DataFrame.corr().
corr = all_data.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(15, 7))
sns.heatmap(corr)
plt.show()
# Rank every feature by its correlation with the target, strongest first.
corr[['price']].sort_values(by='price', ascending=False)

## grade vs price(0.68 corr)

In [None]:
# Price on the x-axis against grade on the y-axis.
sns.jointplot(x='price', y='grade', data=all_data)

## price vs sqft_living

In [None]:
# Price on the x-axis against sqft_living on the y-axis.
sns.jointplot(x='price', y='sqft_living', data=all_data)

In [None]:
# Price on the x-axis against sqft_living15 on the y-axis.
sns.jointplot(x='price', y='sqft_living15', data=all_data)

## price vs sqft_above

In [None]:
# Price on the x-axis against sqft_above on the y-axis.
sns.jointplot(x='price', y='sqft_above', data=all_data)

In [None]:
# Price on the x-axis against bathrooms on the y-axis.
sns.jointplot(x='price', y='bathrooms', data=all_data)

In [None]:
# Columns regarded as categorical rather than continuous.
categorical_columns = [
    'waterfront',
    'view',
    'condition',
    'grade',
    'yr_built',
    'yr_renovated',
    'zipcode',
]

# Feature Engineering

In [None]:
# Preview the first rows of the outlier-filtered data before adding features.
all_data.head()

In [None]:
# Boolean flag: True wherever 'yr_renovated' is non-zero (0 presumably marks a
# house that was never renovated). A vectorized comparison replaces the
# per-element apply/lambda and yields the same values.
all_data['is_renovated'] = all_data['yr_renovated'] != 0

In [None]:
# Price distribution for each value of the renovation flag.
sns.catplot(data=all_data, x='is_renovated', y='price')

In [None]:
# Year of sale, taken with the vectorized .dt accessor instead of a
# per-element apply ('date' is a datetime column).
all_data['sold_year'] = all_data['date'].dt.year
# Age of the house at the time of sale, in years.
all_data['age'] = all_data['sold_year'] - all_data['yr_built']

In [None]:
# Price plotted against the age of the house at sale.
sns.catplot(data=all_data, x='age', y='price')

In [None]:
# Years between the recorded renovation year and the year of sale.
all_data['renovated_age'] = all_data['sold_year'] - all_data['yr_renovated']
# Negative gaps (renovation year later than the sale year) are replaced by the
# sentinel 10000; mask() does this in one vectorized step instead of apply.
# NOTE(review): rows with yr_renovated == 0 produce sold_year - 0 (about 2000),
# which is not negative and therefore does NOT receive the sentinel. Confirm
# whether never-renovated houses were meant to be handled here.
all_data['renovated_age'] = all_data['renovated_age'].mask(all_data['renovated_age'] < 0, 10000)

In [None]:
# Price plotted against the years since renovation.
sns.catplot(data=all_data, x='renovated_age', y='price')

# Data Modeling

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

* train and validation data split

In [None]:
# Target is the sale price; features are every other column except the raw
# datetime 'date'.
y = all_data['price']
x = all_data.drop(columns=['price', 'date'])
# Seeded split so the train/validation partition is repeatable.
x_train, x_val, y_train, y_val = train_test_split(x, y, random_state=1)

In [None]:
# Empty results table; each model evaluated below appends one row to it.
performance_dataframe = pd.DataFrame(
    {column: [] for column in ('model', 'score', 'r2_score', 'mse', 'mae')}
)

## LinearRegression

In [None]:
# Fit a linear regression on the training split and evaluate it on the
# held-out validation split.
lr = LinearRegression().fit(x_train, y_train)
y_pre = lr.predict(x_val)
score = lr.score(x_val, y_val)  # R^2 as reported by the estimator itself

# scikit-learn metrics take (y_true, y_pred). R^2 is not symmetric in its
# arguments, so passing the predictions first reported a wrong value.
r_score = r2_score(y_val, y_pre)  # coefficient of determination
mae = mean_absolute_error(y_val, y_pre)  # mean absolute error
mse = mean_squared_error(y_val, y_pre)  # mean squared error

# Append one row; value order follows the columns model/score/r2_score/mse/mae.
performance_dataframe.loc[performance_dataframe.shape[0]] = ['LinearRegression', score, r_score, mse, mae]

## RandomForestRegressor

In [None]:
#train model
rf = RandomForestRegressor().fit(x_train, y_train)
y_pre = rf.predict(x_val)
score = rf.score(x_val, y_val)
r_score = r2_score(y_pre, y_val) # r2_score
mae = mean_absolute_error(y_pre, y_val) # r2_score
mse = mean_squared_error(y_pre, y_val) # r2_score

performance_dataframe.loc[performance_dataframe.shape[0]] = ['RandomForestRegressor', score, r_score, mse, mae]

In [None]:
# Display the collected validation metrics, one row per fitted model.
performance_dataframe