In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [None]:
# Load the house-sales CSV; `housing_data` stays as the untouched raw frame.
housing_data = pd.read_csv("../input/housesalesprediction/kc_house_data.csv")
# Work on an explicit copy: wrapping an existing frame in pd.DataFrame(...) does
# not copy its data, so the in-place cleaning below could otherwise reach the
# raw frame as well.
df = housing_data.copy()
df.head()

# 1. Cleaning Data

In [None]:
# Correlation heatmap over the numeric columns only. Non-numeric columns (such
# as the still-unparsed `date` column) are excluded explicitly, because
# pandas >= 2.0 raises on them instead of silently skipping them.
plt.figure(figsize=(15,15))
sns.heatmap(df.select_dtypes(include='number').corr(), square=True, annot=True, cmap='twilight_shifted')

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
# Summary statistics for every column, numeric or not
df.describe(include='all')

In [None]:
# date -> month / year
# Parse the column once, then derive both features with the vectorised .dt
# accessor instead of calling a Python lambda for every row.
df["date"] = pd.to_datetime(df["date"])
df['month'] = df["date"].dt.month
df['year'] = df["date"].dt.year
df.drop("date", axis=1, inplace=True)

In [None]:
# yr_renovated -> binary flag: 1 where yr_renovated is greater than 0, else 0
df['renovated'] = np.where(df['yr_renovated'].gt(0), 1, 0)
df.drop(columns="yr_renovated", inplace=True)

In [None]:
# Remove the `id` and `zipcode` columns in a single call
df.drop(columns=["id", "zipcode"], inplace=True)

In [None]:
# Rows that exactly repeat an earlier row (the first occurrence is not flagged)
duplicateRowsDF = df.loc[df.duplicated()]
df.drop(index=duplicateRowsDF.index, inplace=True)

In [None]:
# Quick look at the columns left after the cleaning steps above
df.head(2)

### 1.1 Removing outliers by trimming each column at its 99th percentile

In [None]:
def displot(df, ncols=3):
    '''Draw a histogram of every column of ``df`` on one grid of subplots.

    The number of grid rows is derived from the number of columns, so no column
    is silently skipped (the previous fixed 7x3 grid ignored everything after
    the 21st column).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted, one histogram per column.
    ncols : int, default 3
        Number of subplots per grid row.
    '''
    columns = list(df)
    if not columns:
        # Nothing to draw; avoid creating a zero-row grid.
        return

    nrows = -(-len(columns) // ncols)  # ceiling division
    # 25/7 inches per row reproduces the original 20x25 figure for a 7-row grid.
    plt.figure(figsize=(20, nrows * 25 / 7), facecolor='white')

    for plotnumber, column in enumerate(columns, start=1):
        plt.subplot(nrows, ncols, plotnumber)
        plt.hist(df[column])
        plt.xlabel(column, fontsize=20)
    plt.tight_layout()

In [None]:
# Column distributions before any outlier trimming
displot(df)

In [None]:
def quantile_df(df, cutoff=0.99):
    '''Trim the upper tail of every numeric column, one column after another.

    Columns are processed in order. For each one, the ``cutoff`` quantile is
    computed on the rows that survived the previous columns, and rows whose
    value lies above it are dropped, so the filters compound. Filtering always
    continues from the already-trimmed rows; it never falls back to the full
    frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified. Non-numeric columns are left untouched.
    cutoff : float, default 0.99
        Quantile (between 0 and 1) used as the inclusive upper bound.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with a fresh 0..n-1 index.
    '''
    trimmed = df
    for column in df.select_dtypes(include='number').columns:
        upper = trimmed[column].quantile(cutoff)
        # Inclusive bound: with a strict `<`, a discrete column whose quantile
        # equals its maximum (a 0/1 flag, a year, ...) would lose every row
        # holding that value -- or all rows when the column is constant.
        trimmed = trimmed[trimmed[column] <= upper]
    return trimmed.reset_index(drop=True)

In [None]:
# Apply the quantile trimming, then inspect the summary statistics of what is left
cleaned_df = quantile_df(df)
cleaned_df.describe()

In [None]:
# Column distributions after the quantile trimming
displot(cleaned_df)

# 2. Checking Linearity

In [None]:
# Target is `price`; every other column of the cleaned frame is a feature.
y = cleaned_df['price']
X = cleaned_df.drop(columns='price')

In [None]:
def scatter_plot(X, y, ncols=3):
    '''Scatter every column of ``X`` against ``y`` on one grid of subplots.

    The number of grid rows is derived from the number of columns, so no column
    is silently skipped (the previous fixed 7x3 grid ignored everything after
    the 21st column).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; one panel is drawn per column.
    y : array-like
        Values plotted on the vertical axis of every panel.
    ncols : int, default 3
        Number of subplots per grid row.
    '''
    columns = list(X)
    if not columns:
        # Nothing to draw; avoid creating a zero-row grid.
        return

    nrows = -(-len(columns) // ncols)  # ceiling division
    # 30/7 inches per row reproduces the original 20x30 figure for a 7-row grid.
    plt.figure(figsize=(20, nrows * 30 / 7), facecolor='white')

    for plotnumber, column in enumerate(columns, start=1):
        plt.subplot(nrows, ncols, plotnumber)
        plt.scatter(X[column], y)
        plt.xlabel(column, fontsize=20)
        plt.ylabel('PRICE', fontsize=20)
    plt.tight_layout()

In [None]:
# Each feature against the untransformed price
scatter_plot(X, y)

### 2.1 Log Transformation

In [None]:
# Take the log from the source column rather than from `y` itself, so that
# re-running this cell cannot apply the transform twice.
y = np.log(cleaned_df['price'])
scatter_plot(X, y)

# 3. Model

### 3.1 Scale the data

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the standardiser on the feature frame X
scalar = StandardScaler()
scalar.fit(X)

In [None]:
# Standardised copy of the features (a NumPy array, column order as in X)
X_scaled = scalar.transform(X)

### 3.2 Train and Test Split

In [None]:
from sklearn.model_selection import train_test_split
# Split the *unscaled* features first, then fit the scaler on the training rows
# only. Fitting it on all rows before splitting lets test-set statistics leak
# into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = scalar.fit(X_train_raw).transform(X_train_raw)
# The test rows are transformed with the training-set statistics.
X_test = scalar.transform(X_test_raw)

### 3.3 Variance Threshold

In [None]:
from sklearn.feature_selection import VarianceThreshold
# threshold=0: only features with zero variance on the training split are rejected
var_thres=VarianceThreshold(threshold=0)
var_thres.fit(X_train)

In [None]:
# Names of the columns the variance selector rejected (inverse of its support mask)
constant_columns = X.columns[~var_thres.get_support()].tolist()

constant_columns

In [None]:
## Drop the zero-variance columns listed in `constant_columns` from both splits,
## so the test matrix keeps the same feature layout the model is trained on.
X_train_df = pd.DataFrame(X_train, columns=X.columns).drop(columns=constant_columns)
X_train = X_train_df.values
X_test = pd.DataFrame(X_test, columns=X.columns).drop(columns=constant_columns).values

### 3.4 Create Regression

In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares fit on the training split
reg = LinearRegression()
reg.fit(X_train, y_train)

In [None]:
# In-sample predictions (on the same rows the model was fitted on)
y_hat = reg.predict(X_train)

In [None]:
# Scatter Plot check: training targets against the model's predictions
fig, ax = plt.subplots()
ax.scatter(y_train, y_hat)
ax.set_xlabel('Target-> y_train', size=18)
ax.set_ylabel('Prediction-> y_hat', size=18)
plt.show()

In [None]:
# Residuals plot check: histogram of (target - prediction) on the training split
plt.hist(y_train - y_hat)
plt.title('Residuals PDF', size=18)

In [None]:
# Checking R2 value on the training split (not an out-of-sample estimate)
reg.score(X_train, y_train)

### 3.5 Finding Weights and Bias

In [None]:
# Bias: the intercept term of the fitted model
reg.intercept_

In [None]:
# One row per retained feature, paired with its fitted coefficient
weight_summery = pd.DataFrame({
    'Features': X_train_df.columns,
    'Weights': reg.coef_,
})
weight_summery