In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [None]:
# Path of the house-sales CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/housesalesprediction/kc_house_data.csv'
df = pd.read_csv(DATA_PATH)

# EDA

In [None]:
# Show the first rows of the freshly loaded frame.
df.head()

In [None]:
# Summarise the columns of the frame (dtypes and non-null counts).
df.info()

In [None]:
# Descriptive statistics of the numeric columns.
df.describe()

**To inspect the correlation between variables, let's build a correlation matrix:**

In [None]:
# Pairwise correlation of the house attributes; the identifier, the sale date
# and the location columns are left out of the matrix.
df_corr = df.drop(['id', 'date', 'lat', 'long', 'zipcode'], axis=1).corr()

# Boolean mask that hides the upper triangle (diagonal included), because the
# matrix is symmetric. Building it from ones instead of from the correlation
# values guarantees that a cell is hidden even if its correlation is exactly 0.
mask = np.triu(np.ones_like(df_corr, dtype=bool))

plt.figure(figsize=(10, 10))
sns.heatmap(df_corr, annot=True, mask=mask, square=True,
            fmt='.1g',
            vmin=-1, vmax=1, center=0, cmap='coolwarm',
            linewidths=3, linecolor='black',
            cbar_kws={'orientation': 'vertical'});

**The highest correlation between the target and the independent variables:**

* sqft_living;
* sqft_above;
* grade;
* sqft_living15.

**Since most of the 'yr_renovated' values are 0, we convert the column into a binary flag that reflects whether the house was renovated or not.**

In [None]:
# Binary flag: 1 when 'yr_renovated' holds a non-zero value, 0 otherwise.
# The comparison is vectorised, so no Python-level loop or temporary list is needed.
df['renovated'] = (df['yr_renovated'] != 0).astype(int)

In [None]:
def draw_price_panel(ax, kind, column, title, palette=None):
    """Draw one "price vs. <column>" panel of the overview figure on ``ax``.

    The data always comes from the notebook-level frame ``df``.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on.
    kind : str
        'strip' for ``sns.stripplot``, 'reg' for ``sns.regplot`` or
        'line' for ``sns.lineplot``.
    column : str
        Column of ``df`` shown on the x axis; also used as the x label.
    title : str
        Title of the panel.
    palette : str or list, optional
        Palette forwarded to ``sns.stripplot``; ignored for other kinds.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the three supported values.
    """
    if kind == 'strip':
        sns.stripplot(x=column, y='price', data=df, palette=palette, ax=ax)
    elif kind == 'reg':
        sns.regplot(x=column, y='price', data=df, color='royalblue', ax=ax,
                    scatter_kws={'s': 10}, line_kws={'color': 'crimson', 'lw': 5})
    elif kind == 'line':
        sns.lineplot(x=column, y='price', data=df, color='royalblue', ax=ax,
                     lw=5, marker='o', markerfacecolor='crimson')
    else:
        raise ValueError(f'unknown panel kind: {kind!r}')
    ax.set_title(title, fontsize=17)
    ax.set_xlabel(column, fontsize=14)
    ax.set_ylabel('price', fontsize=14)


# Two-colour palette used for the binary columns.
binary_palette = ['royalblue', 'crimson']

# One entry per panel: ([left, bottom, width, height], kind, column, title, palette).
# The rectangles are in figure coordinates and deliberately extend beyond the
# unit square so that the panels form a grid larger than the default figure.
price_panels = [
    ([0, 0, 1, 1], 'strip', 'bedrooms', 'Price depending of numbers of bedrooms', 'coolwarm'),
    ([1.2, 0, 2.2, 1], 'strip', 'bathrooms', 'Price depending of numbers of bathrooms', 'coolwarm'),
    ([0, -1.3, 1, 1], 'reg', 'sqft_living', 'Price depending of sqft_living', None),
    ([1.2, -1.3, 1, 1], 'reg', 'sqft_lot', 'Price depending of sqft_lot', None),
    ([2.4, -1.3, 1, 1], 'strip', 'floors', 'Price depending of floors', 'coolwarm'),
    ([0, -2.6, 1, 1], 'strip', 'waterfront', 'Price depending of waterfront', binary_palette),
    ([1.2, -2.6, 1, 1], 'strip', 'view', 'Cost depending of view', 'coolwarm'),
    ([2.4, -2.6, 1, 1], 'strip', 'condition', 'Price depending of condition', 'coolwarm'),
    ([0, -3.9, 1, 1], 'strip', 'grade', 'Price depending of grade', 'coolwarm'),
    ([1.2, -3.9, 1, 1], 'reg', 'sqft_above', 'Price depending of sqft_above', None),
    ([2.4, -3.9, 1, 1], 'reg', 'sqft_basement', 'Price depending of sqft_basement', None),
    ([0, -5.2, 2.2, 1], 'line', 'yr_built', 'Price depending of yr_built', None),
    ([2.4, -5.2, 1, 1], 'strip', 'renovated', 'Price depending of renovated', binary_palette),
    ([0, -6.5, 1.6, 1], 'reg', 'sqft_living15', 'Price depending of sqft_living15', None),
    ([1.8, -6.5, 1.6, 1], 'reg', 'sqft_lot15', 'Price depending of sqft_lot15', None),
]

figure = plt.figure()
for rect, kind, column, title, palette in price_panels:
    draw_price_panel(figure.add_axes(rect), kind, column, title, palette)

* bedrooms - up to 5 bedrooms, we observe a proportional increase in the number of bedrooms and the cost. After 5, with the presence of some outliers, we can see the opposite situation ;
* bathrooms - there is a proportional increase in bathrooms and cost. But after 5 bathrooms, the number of observations is too small, so it is difficult to draw a conclusion;
* sqft_living - proportional increase in living space and cost;
* sqft_lot - it is difficult to make an unambiguous conclusion, for some observations, with an increase in the area, the cost also increases, but at the same time a significant part of the observations shows the opposite situation. We can conclude about a bimodal data distribution;
* floors - no obvious dependence of the cost on the number of floors is observed;
* waterfront - no clear dependence of the cost on having a waterfront view is observed. We can only conclude that most of the houses do not have a waterfront view;
* view - no clear dependence is observed;
* condition - up to 3, there is a proportional increase in value, but after 3 there are no obvious changes;
* grade - a proportional increase in quality and cost;
* sqft_above - there is a proportional increase in area and value;
* sqft_basement - there is a proportional increase in area and value;
* yr_built - the cost of houses built during the Second World War is the least. At the same time, the highest cost is for houses built in 1905, 1934, 2001 and 2014.
* renovated - I do not observe the dependence of the cost on the availability of repairs.
* sqft_living15 is the same situation as with the sqft_living.
* sqft_lot15 is a similar situation as with the sqft_lot.

**So far I have not touched the date column. I want to consider this data separately, but that first requires the following transformations:**

In [None]:
# Remove the 'T000000' marker from the raw date values, then parse what is
# left into proper datetimes.
df['date'] = pd.to_datetime(df['date'].replace('T000000', '', regex=True))

# Calendar features of the sale date, taken from the vectorised .dt accessor
# instead of a per-row lambda. day_name() returns the English names
# 'Monday' ... 'Sunday', i.e. the same labels as a manual 0-6 mapping.
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['weekday'] = df['date'].dt.day_name()

In [None]:
# Show the first rows again, now including the year / month / weekday columns.
df.head()

In [None]:
# Re-check the column summary after the date conversion.
df.info()

**Let's try to find the relationship between year, month, day of the week and cost.**

In [None]:
# Calendar order for the weekday axis; without it the categories are drawn in
# the order in which they happen to appear in the data.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# One entry per panel: ([left, bottom, width, height], column, x label, palette, category order).
date_panels = [
    ([0, 0, 1, 1], 'year', 'Year', ['royalblue', 'crimson'], None),
    ([1.2, 0, 1, 1], 'month', 'Month', 'coolwarm', None),
    ([2.4, 0, 1, 1], 'weekday', 'Weekday', 'coolwarm', weekday_order),
]

figure = plt.figure()
for rect, column, label, palette, order in date_panels:
    ax = figure.add_axes(rect)
    sns.stripplot(x=column, y='price', data=df, palette=palette, order=order, ax=ax)
    ax.set_title('Price depending of ' + column, fontsize=17)
    ax.set_xlabel(label, fontsize=14)
    ax.set_ylabel('Price', fontsize=14)

**I do not observe any obvious dependence, except that houses with a lower cost were sold over the weekend.**

**Let's look at this data from a different angle, correlate the month and day of the week using pivot tables and visualize it.**

In [None]:
# Calendar order for the heatmap rows; pivot_table sorts the index
# alphabetically, which would put Friday first.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Mean sale price per (weekday, month) cell - pivot_table aggregates with the
# mean when no aggfunc is given.
df_pivot = (
    df.pivot_table(index='weekday', columns='month', values='price')
    .reindex(weekday_order)
)

# Number of sales per (weekday, month) cell, counted directly by the pivot.
df_pivot_count = (
    df.pivot_table(index='weekday', columns='month', values='price', aggfunc='count')
    .reindex(weekday_order)
)

# One entry per heatmap: ([left, bottom, width, height], table, title).
# The first title names the statistic that is actually shown (a mean price).
heatmap_panels = [
    ([0, 0, 1.6, 1.6], df_pivot, 'Average sale price by weekdays and months'),
    ([1.8, 0, 1.6, 1.6], df_pivot_count, 'Number of sales by weekdays and months'),
]

figure = plt.figure()
for rect, table, title in heatmap_panels:
    ax = figure.add_axes(rect)
    sns.heatmap(table, cmap='coolwarm', linecolor='white', linewidths=3, ax=ax)
    ax.set_title(title, fontsize=17)
    ax.set_xlabel('Month', fontsize=14)
    ax.set_ylabel('Weekday', fontsize=14)

* There is no relationship between price and the combination of month and day of the week;
* However, we can observe that the least number of houses were sold during the weekend. Also, the smallest number of houses were sold in the first and last 2 months.

**The decrease in the number of sales in the first and last 2 months is associated with the period: 05.2014 - 05.2015. There is no need to comment on the decrease in the number of sales on weekends:)**

# Preparing

In [None]:
# List the column names currently present in the frame.
df.columns

In [None]:
# Columns removed before modelling: the identifier, the location fields, the
# raw date with the calendar features derived from it, and 'yr_renovated'.
unused_columns = [
    'id', 'date', 'zipcode', 'lat', 'long',
    'yr_renovated', 'year', 'month', 'weekday',
]
df.drop(columns=unused_columns, inplace=True)
df.head()

In [None]:
columns = list(df.columns)
scaler = StandardScaler()
df_st = scaler.fit_transform(df)
df_st = pd.DataFrame(data=df_st, columns=columns)

In [None]:
x = df_st.drop(['price'], axis=1)
y = df_st['price']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
poly = PolynomialFeatures(degree=2)
x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.fit_transform(x_test)

In [None]:
def fit_and_score(estimator, param_grid, features_train, features_test):
    """Grid-search ``estimator`` with 5-fold CV and evaluate it on the test split.

    The targets are the notebook-level ``y_train`` and ``y_test``.

    Parameters
    ----------
    estimator : scikit-learn regressor
        Unfitted model to tune.
    param_grid : dict
        Grid passed to ``GridSearchCV``; an empty dict fits the estimator
        with its default parameters.
    features_train, features_test : array-like
        Predictors of the training and of the test split.

    Returns
    -------
    tuple
        ``(search, predictions, score)``: the fitted ``GridSearchCV`` object,
        its predictions for ``features_test`` and the value returned by
        ``search.score`` on the test split.
    """
    search = GridSearchCV(estimator, param_grid, cv=5)
    search.fit(features_train, y_train)
    predictions = search.predict(features_test)
    score = search.score(features_test, y_test)
    return search, predictions, score


# Parameter grids.
no_grid = {}
alpha_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
elastic_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100],
                'l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}
knn_grid = {'n_neighbors': list(range(3, 10, 2)), 'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski']}
rf_grid = {
    # 'squared_error' / 'absolute_error' are the current names of the former
    # 'mse' / 'mae' criteria, which were removed from scikit-learn
    # (requires scikit-learn >= 1.0).
    'criterion': ['squared_error', 'absolute_error'],
    # The previous range(5, 10, 15) produced the single value 5, so the forest
    # size was never actually searched.
    # NOTE(review): [5, 10, 15] is the presumed intent - confirm; the larger
    # grid also makes this search noticeably slower.
    'n_estimators': [5, 10, 15],
    'min_samples_leaf': [5, 3],
}

# linear
linear, y_predict_linear, r_sq_linear = fit_and_score(LinearRegression(), no_grid, x_train, x_test)

# poly (linear regression on the polynomial features); the search object gets
# its own name so that it does not overwrite the polynomial transformer
poly_reg, y_predict_poly, r_sq_poly = fit_and_score(LinearRegression(), no_grid, x_train_poly, x_test_poly)

# ridge
ridge, y_predict_ridge, r_sq_ridge = fit_and_score(Ridge(), alpha_grid, x_train_poly, x_test_poly)

# lasso
lasso, y_predict_lasso, r_sq_lasso = fit_and_score(Lasso(), alpha_grid, x_train_poly, x_test_poly)

# elastic
elastic, y_predict_elastic, r_sq_elastic = fit_and_score(ElasticNet(), elastic_grid, x_train_poly, x_test_poly)

# knn
knn, y_predict_knn, r_sq_knn = fit_and_score(KNeighborsRegressor(), knn_grid, x_train, x_test)

# rf - seeded so that the reported score is reproducible between runs
rf, y_predict_rf, r_sq_rf = fit_and_score(RandomForestRegressor(random_state=42), rf_grid, x_train, x_test)

# Results

In [None]:
# Test-set score of every fitted model, keyed by its display name.
r_sq_by_model = {
    'Linear': r_sq_linear,
    'Poly': r_sq_poly,
    'Ridge': r_sq_ridge,
    'Lasso': r_sq_lasso,
    'Elastic': r_sq_elastic,
    'KNN': r_sq_knn,
    'RandomForest': r_sq_rf,
}
r_sq_table = pd.DataFrame({
    'Model': list(r_sq_by_model.keys()),
    'r^2': list(r_sq_by_model.values()),
})
# Best model first.
r_sq_table.sort_values(by='r^2', ascending=False)

**From the presented data, it can be seen that Elastic has the best result.**