# Importing Libraries

In [None]:
!pip install datasist

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Dealing with data and visualization
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
%matplotlib inline

# For machine learning phase
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler, RobustScaler

# for detecting outliers
import datasist as ds


# Silence every warning, then apply the notebook-wide plotting defaults
warnings.simplefilter('ignore')
sns.set(font_scale=1.2, rc={'figure.figsize': [10, 10]})
plt.style.use('fivethirtyeight')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found under the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(root, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Reading the data: load the house-sales CSV from the Kaggle input directory into `df`
df = pd.read_csv('/kaggle/input/housesalesprediction/kc_house_data.csv')
# Display the loaded frame as the cell output
df

> # Data Exploration

In [None]:
# Column dtypes and non-null counts of the loaded frame
df.info()

**From the output above we can deduce that there are no missing values**

In [None]:
# Summary statistics for every column, whatever its dtype (include='all')
df.describe(include='all')

In [None]:
# Per-column profile: value frequencies, distinct values and missing-value count
separator = '*' * 80
for col, values in df.items():
    print(f'For column {col}')
    print(f'Values count:\n{values.value_counts()}\n')
    print(f'Unique values: {values.unique()}')
    print(f'No. of unique values: {values.nunique()}')
    print(f'No. of missing values: {values.isnull().sum()}')
    print(separator)

# Finally, report how many rows are exact duplicates of an earlier row
print(f'No. of duplicated rows : {df.duplicated().sum()}')

> # Data Wrangling

In [None]:
# First, round some float columns and store them as 64-bit integers
for col in ('price', 'bathrooms', 'floors'):
    df[col] = df[col].round().astype('int64')

In [None]:
# Then reshape the raw date strings into YYYY-MM-DD so they can be parsed as datetimes
def _iso_date(stamp):
    """Join the first 4, next 2 and next 2 characters of `stamp` with dashes."""
    return '-'.join((stamp[:4], stamp[4:6], stamp[6:8]))

df['date'] = df['date'].map(_iso_date)

In [None]:
# Parse the YYYY-MM-DD strings into a datetime column
date_strings = df['date']
df['date'] = pd.to_datetime(date_strings, format='%Y-%m-%d')

In [None]:
# Inspect the date column after the conversion
df['date']

In [None]:
# Distinct calendar years present in the date column
df['date'].dt.year.unique()

**The date column won't be useful for the model, so we can drop it**

In [None]:
# Drop the date and id columns. Rebinding `df` (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run after the columns are already gone.
df = df.drop(columns=['date', 'id'], errors='ignore')

In [None]:
# Rearrange the columns so that the target (price) comes last, after all the features
column_order = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
    'lat', 'long', 'sqft_living15', 'sqft_lot15',
    'price',
]
df = df[column_order]

In [None]:
# Display the frame with its new column order
df

> # Data Visualization

In [None]:
# Annotated heatmap of the pairwise correlations between all remaining columns
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(df.corr(), annot=True, ax=ax)

**According to the heatmap, we can drop some less useful columns for better performance in the machine learning phase**

In [None]:
# Keep only the selected feature columns plus the target (price, last)
selected_columns = [
    'bedrooms', 'bathrooms', 'sqft_living', 'floors',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_basement',
    'price',
]
df = df[selected_columns]

In [None]:
# Pairwise relationships between every pair of the retained columns
sns.pairplot(data=df)

In [None]:
# Frequency of each distinct `bedrooms` value; the column is passed explicitly as x=
# because newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x=df['bedrooms'])

In [None]:
# Frequency of each distinct `bathrooms` value; the column is passed explicitly as x=
# because newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x=df['bathrooms'])

In [None]:
# Frequency of each distinct `floors` value; the column is passed explicitly as x=
# because newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x=df['floors'])

In [None]:
# Frequency of each distinct `waterfront` value; the column is passed explicitly as x=
# because newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x=df['waterfront'])

In [None]:
# Frequency of each distinct `view` value; the column is passed explicitly as x=
# because newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x=df['view'])

In [None]:
# Frequency of each distinct `condition` value; the column is passed explicitly as x=
# because newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x=df['condition'])

In [None]:
# Frequency of each distinct `grade` value; the column is passed explicitly as x=
# because newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x=df['grade'])

In [None]:
# Filled density estimate of `price`; `fill=` replaces the deprecated `shade=` argument
sns.kdeplot(x=df['price'], fill=True)

In [None]:
# Box plot of `price` along the x axis; passed explicitly as x= because newer seaborn
# releases treat the first positional argument as `data`.
sns.boxplot(x=df['price'])

**There are many outliers in the price column**

In [None]:
# Filled density estimate of `bathrooms`; `fill=` replaces the deprecated `shade=` argument
sns.kdeplot(x=df['bathrooms'], fill=True)

In [None]:
# Filled density estimate of `bedrooms`; `fill=` replaces the deprecated `shade=` argument
sns.kdeplot(x=df['bedrooms'], fill=True)

In [None]:
# Box plot of `bedrooms` along the x axis; passed explicitly as x= because newer seaborn
# releases treat the first positional argument as `data`.
sns.boxplot(x=df['bedrooms'])

In [None]:
# Filled density estimate of `sqft_living`; `fill=` replaces the deprecated `shade=` argument
sns.kdeplot(x=df['sqft_living'], fill=True)

In [None]:
# Box plot of `sqft_living` along the x axis; passed explicitly as x= because newer seaborn
# releases treat the first positional argument as `data`.
sns.boxplot(x=df['sqft_living'])

In [None]:
# Filled density estimate of `sqft_basement`; `fill=` replaces the deprecated `shade=` argument
sns.kdeplot(x=df['sqft_basement'], fill=True)

In [None]:
# Box plot of `sqft_basement` along the x axis; passed explicitly as x= because newer seaborn
# releases treat the first positional argument as `data`.
sns.boxplot(x=df['sqft_basement'])

**From the figures above we can deduce that there are many outliers in most of the columns**

> # Machine Learning

**We will run two machine learning trials on the data: the first without removing the outliers, and the second after removing them**

In [None]:
# Target is the price column; the features are every other column
y = df['price']
x = df.drop(columns=['price'])

In [None]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42
)

In [None]:
# We are using RobustScaler to reduce the effect of outliers on the feature scaling
# (the outliers are still present in this first trial)
scaler = RobustScaler()

In [None]:
# Learn the scaling parameters from the training split only, then apply them to both splits
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Candidate regressors compared in both trials, keyed by a short display name.
# The tree-based models are seeded (same seed as the train/test split) so that
# re-running the notebook reproduces the reported scores.
models = {'LR':LinearRegression(),
         'DTR':DecisionTreeRegressor(random_state=42),
         'RFR':RandomForestRegressor(random_state=42),
         'XGR':XGBRegressor(random_state=42)}

**Before removing the outliers**

In [None]:
# Fit each candidate model on the training split and report its metrics on both splits
for name,model in models.items():
    print(f'For model {name}')
    model.fit(x_train,y_train)
    y_pred = model.predict(x_test)
    # NOTE: for scikit-learn style regressors .score() is R^2, so "Testing score"
    # and "R2 score" are expected to show the same value.
    print(f'Testing score = {model.score(x_test,y_test)}')
    print(f'Training score = {model.score(x_train,y_train)}')
    print(f'R2 score = {r2_score(y_test,y_pred)}')
    # Metric arguments follow the documented (y_true, y_pred) order
    print(f'RMSE = {np.sqrt(mean_squared_error(y_test,y_pred))}')
    print('*' * 50)

**We can deduce that XGR is the best model**

**The second trial is after removing the outliers**

In [None]:
# Ask datasist for the rows it flags as outliers, searching every column of df
# (this includes the target column, price).
# NOTE(review): presumably the second argument (0) is the number of outlying columns
# a row may have before it is flagged, and the result holds row index labels — confirm
# against the datasist documentation.
outliers = ds.structdata.detect_outliers(df,0,df.columns)

In [None]:
# Keep only the rows whose index label is not among the flagged outliers
is_outlier = df.index.isin(outliers)
df = df[~is_outlier]

In [None]:
# Display the frame that remains after removing the flagged rows
df

In [None]:
# Rebuild target and features from the outlier-free frame
y = df['price']
x = df.drop(columns=['price'])

In [None]:
# Same 75/25 split and seed as the first trial, now on the outlier-free data
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42
)

In [None]:
# We can use StandardScaler now that the rows flagged as outliers have been removed
scaler = StandardScaler()

In [None]:
# Learn the scaling parameters from the training split only, then apply them to both splits
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Refit each candidate model on the outlier-free training split and report its metrics
for name,model in models.items():
    print(f'For model {name}')
    model.fit(x_train,y_train)
    y_pred = model.predict(x_test)
    # NOTE: for scikit-learn style regressors .score() is R^2, so "Testing score"
    # and "R2 score" are expected to show the same value.
    print(f'Testing score = {model.score(x_test,y_test)}')
    print(f'Training score = {model.score(x_train,y_train)}')
    print(f'R2 score = {r2_score(y_test,y_pred)}')
    # Metric arguments follow the documented (y_true, y_pred) order
    print(f'RMSE = {np.sqrt(mean_squared_error(y_test,y_pred))}')
    print('*' * 50)

**Removing the outliers did not improve the results enough, so it's better to keep the outliers in the data**