In [None]:
import os

# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNetCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score, classification_report

In [None]:
dataset = pd.read_csv('/kaggle/input/housesalesprediction/kc_house_data.csv')

In [None]:
dataset.head()

In [None]:
dataset.info()

In [None]:
dataset.describe()

In [None]:
dataset.isnull().sum()

In [None]:
# Pairwise correlations between the columns left after removing 'id', 'date'
# and the target 'price'. Computed once and reused for both the data and the mask.
feature_corr = dataset.drop(columns=['id', 'date', 'price']).corr()

# Hide the diagonal and the upper triangle, which only mirror the lower one.
# The mask is built from the matrix *shape* rather than its values, so a
# correlation of exactly 0 above the diagonal is still hidden.
upper_mask = np.triu(np.ones_like(feature_corr, dtype=bool))

plt.figure(figsize=(15, 12))
sns.heatmap(data=feature_corr, annot=True, mask=upper_mask)

In [None]:
# Distribution of sale prices with a KDE overlay.
# displot is a figure-level function that creates its own figure, so a preceding
# plt.figure() call would only leave an empty figure behind. The 15 x 12 inch
# size is therefore requested through height/aspect (width = height * aspect).
sns.displot(dataset.price, kde=True, height=12, aspect=15 / 12)

In [None]:
len(dataset)*0.01
# 1% outliers

In [None]:
dataset = dataset.sort_values(['price'], ascending=False).iloc[216:]
dataset = dataset.drop(columns=['id'], axis=1)

In [None]:
# Price distribution after the outlier trimming, with a KDE overlay.
# displot is a figure-level function that creates its own figure, so a preceding
# plt.figure() call would only leave an empty figure behind. The 15 x 12 inch
# size is therefore requested through height/aspect (width = height * aspect).
sns.displot(dataset.price, kde=True, height=12, aspect=15 / 12)

## Linear Regression with Correlation Feature Selection

In [None]:
a = dataset.corr()['price']

In [None]:
corr_data = pd.DataFrame(a).reset_index()
corr_data.columns = ['Features', 'Correlation_with_price']

In [None]:
corr_data = corr_data.sort_values('Correlation_with_price')

In [None]:
list(corr_data[corr_data['Correlation_with_price']>0.2].Features)

In [None]:
# Feature subset used for the first linear model.
# NOTE(review): presumably copied from the "> 0.2 correlation" listing in the
# previous cell (minus 'price'); verify against that output.
selected_features = [
    'floors',
    'sqft_basement',
    'bedrooms',
    'view',
    'lat',
    'bathrooms',
    'sqft_above',
    'sqft_living15',
    'sqft_living',
    'grade',
]
X = dataset[selected_features]

In [None]:
y = dataset[['price']]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
linear_model = LinearRegression()

In [None]:
linear_model.fit(X_train, y_train)

In [None]:
r2_score(y_test, linear_model.predict(X_test))

## Linear Regression with all Features

In [None]:
X = dataset.drop(['date', 'price'], axis=1)

In [None]:
y = dataset['price']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
linear_model = LinearRegression()

In [None]:
linear_model.fit(X_train, y_train)

In [None]:
r2_score(y_test, linear_model.predict(X_test))

In [None]:
# Predicted vs. actual prices on the test rows; points lying on the red line
# are perfect predictions.
test_predictions = linear_model.predict(X_test)
price_range = [y_test.min(), y_test.max()]

plt.scatter(y_test, test_predictions, color="b")
# Draw the y = x reference line from its two end points instead of tracing it
# through every (unsorted) test value.
plt.plot(price_range, price_range, color="r")
plt.xlabel("Actual price")
plt.ylabel("Predicted price")
plt.title("Linear regression (all features): predicted vs. actual price");

## Polynomial Regression

In [None]:
poly_converter = PolynomialFeatures(degree = 2, include_bias=False)

In [None]:
poly_features = poly_converter.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(poly_features, y, test_size=0.2, random_state=42)

In [None]:
poly_model = LinearRegression()

In [None]:
poly_model.fit(X_train, y_train)

In [None]:
poly_test_predictions = poly_model.predict(X_test)

In [None]:
r2_score(y_test, poly_test_predictions)

In [None]:
# Predicted vs. actual prices for the polynomial model on the test rows;
# points lying on the red line are perfect predictions.
price_range = [y_test.min(), y_test.max()]

plt.scatter(y_test, poly_test_predictions, color="b")
# Draw the y = x reference line from its two end points instead of tracing it
# through every (unsorted) test value.
plt.plot(price_range, price_range, color="r")
plt.xlabel("Actual price")
plt.ylabel("Predicted price")
plt.title("Polynomial regression (degree 2): predicted vs. actual price");

## Decision Tree

In [None]:
X = dataset.drop(['date', 'price'], axis=1)
X

In [None]:
y = dataset.price

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
r_dt = DecisionTreeRegressor(random_state=0)
r_dt.fit(X_train, y_train)

In [None]:
r_dt.feature_importances_

In [None]:
r2_score(y_test, r_dt.predict(X_test))