In [None]:
import warnings
warnings.filterwarnings('ignore')

# DATA MANIPULATION
import pandas as pd
import numpy as np
import math

# DATA VISUALIZATION
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Modeling
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.decomposition import PCA

# Interaction
from ipywidgets import widgets, interact

In [None]:
import numpy as np
import math


In [None]:

# DATA VISUALIZATION
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Modeling
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.decomposition import PCA

# Interaction
from ipywidgets import widgets, interact

In [None]:
# Read the CSV once; `raw_data` stays untouched while all later cleaning
# is applied to the independent copy `df`.
raw_data = pd.read_csv(filepath_or_buffer='kc_house_data.csv')
df = raw_data.copy(deep=True)
df.head(n=5)

In [None]:
# Show one randomly drawn row as a Series (column name -> value).
df.sample(n=1).iloc[0]

In [None]:
# Report the size of the frame: rows are observations, columns are features.
n_rows, n_cols = df.shape
print(f"There are {n_rows} observations and {n_cols} features in this dataset. \n")

In [None]:
# Print pandas' per-column summary (dtype and non-null count) before any cleaning.
df.info()

In [None]:
# Count rows that are exact duplicates of an earlier row.
duplicate_count = df.duplicated().sum()
print("Number of Duplicated Data:", duplicate_count)

In [None]:
# Count missing values separately for every column.
missing_per_column = df.isna().sum()
print("Number of Missing Values:\n", missing_per_column)

In [None]:
# Display pandas' default descriptive statistics for the raw frame.
df.describe()

In [None]:
# Interactive view of the main characteristics of each column in the dataset.
@interact(column = df.columns)
def column_info(column):
    """Print the value counts and summary statistics of one column of ``df``.

    Wrapped by ipywidgets' ``interact``, which is given ``df.columns`` as the
    choices for ``column``.

    Parameters
    ----------
    column : label
        Name of the column of the global ``df`` to inspect.
    """
    selected = df[column]
    stars = "*" * 10

    print(stars, column, stars)
    display(selected.value_counts())
    print("-" * 30)
    display(selected.describe())
    print("=" * 40)

In [None]:
# Parse the "date" column into pandas datetimes so the `.dt` accessor can be used on it.
parsed_dates = pd.to_datetime(df["date"])
df["date"] = parsed_dates

In [None]:
# Derive two new columns, "year" and "month", from the parsed "date" column.
date_parts = df["date"].dt
df["year"] = date_parts.year
df["month"] = date_parts.month

In [None]:
# Flag rows whose bathroom count has a fractional part of exactly 0.5 (1 = yes, 0 = no).
# This has to happen before the rounding below, which removes the fractional part;
# re-running the cell after rounding would therefore set the flag to 0 everywhere.
bathroom_fraction = df["bathrooms"] % 1
df["room_has_bathroom"] = (bathroom_fraction == 0.5).astype(int)

# Round every bathroom count up to the next whole number (stored as int).
df["bathrooms"] = df["bathrooms"].map(math.ceil)

In [None]:
# Round every floor count up to the next whole number (stored as int).
df["floors"] = df["floors"].map(math.ceil)

In [None]:
# Remove the columns that are not needed for now ("id" and "zipcode").
df.drop(["id", "zipcode"], axis=1, inplace=True)

In [None]:
# Count how many rows report 0 bedrooms and how many report 0 bathrooms.
for room_col in ("bedrooms", "bathrooms"):
    zero_count = (df[room_col] == 0).sum()
    print(f"Number of rows containing 0 in '{room_col}' column:", zero_count)

In [None]:
# Only a very small amount of data has 0 bedrooms or 0 bathrooms, so drop those rows.
zero_rooms = (df["bedrooms"] == 0) | (df["bathrooms"] == 0)
df.drop(index=df.index[zero_rooms], inplace=True)


In [None]:
# Building age: absolute difference between the "year" column and "yr_built".
# The absolute value turns any negative difference into a positive age.
df["age"] = (df["year"] - df["yr_built"]).abs()


In [None]:
# renov_age: number of years between construction and the recorded renovation,
# i.e. |yr_renovated - yr_built| (not the time elapsed since the renovation).
# Rows that were never renovated get 0.0.
# Assumes yr_renovated == 0 marks "never renovated" -- TODO confirm against the dataset.
#
# The previous implementation detected "never renovated" by checking that the
# difference had exactly two digits, which also zeroed genuine gaps shorter than
# 10 years or of 100 years and more. An explicit mask avoids that.
was_renovated = df["yr_renovated"] > 0
years_to_renovation = (df["yr_renovated"] - df["yr_built"]).abs()
# Cast to float to keep the dtype the column had before this fix.
df["renov_age"] = years_to_renovation.where(was_renovated, 0).astype(float)

In [None]:
# Inspect the listing with 33 bedrooms to decide whether it should be corrected to 3.
df.loc[df["bedrooms"] == 33]

In [None]:
# Replace the outlier value 33 in the bedrooms column with 3.
df["bedrooms"] = df["bedrooms"].replace({33: 3})


In [None]:
# Bucket the numeric grade into three categories.
# With right=True the intervals are (0, 5] -> Low, (5, 9] -> Avg, (9, 13] -> High.
bins = [0, 5, 9, 13]
labels = ['Low', 'Avg', 'High']
grade_categories = pd.cut(x=df["grade"], bins=bins, labels=labels, right=True)
df["grade_group"] = grade_categories

In [None]:
# Drop the columns that the derived features above were built from.
df.drop(["date", "grade", "yr_built", "yr_renovated"], axis=1, inplace=True)


In [None]:
# Preview the first rows of the frame after the cleaning / feature steps above.
df.head()

In [None]:
# Re-check column dtypes and non-null counts after the cleaning / feature steps above.
df.info()

In [None]:
# Descriptive statistics of the frame after the cleaning / feature steps above.
df.describe()

In [None]:

# One histogram per column on a large (30 x 20) canvas; the trailing ';' hides the repr.
df.hist(figsize=(30,20));

In [None]:
# Distribution of the raw price: 50-bin histogram with a KDE curve overlaid.
sns.histplot(df['price'],kde=True,bins=50);

In [None]:
# Distribution of log(1 + price), the same transform later applied to the model target.
log_price = np.log1p(df["price"])

plt.figure(figsize=(8, 6))
plt.hist(x=log_price, bins=30);

In [None]:
# Number of rows per value of the "view" column.
sns.countplot(data=df, x='view');

In [None]:
# Number of rows per value of the "waterfront" column.
sns.countplot(data=df, x='waterfront');

In [None]:
# Price plotted against the "year" column as a seaborn line plot.
sns.lineplot(data=df, x='year', y='price');

In [None]:
# Number of rows per grade category (Low / Avg / High).
sns.countplot(data=df, x='grade_group');

In [None]:
# Number of rows per (rounded-up) bathroom count.
sns.countplot(data=df, x='bathrooms');

In [None]:
# Number of rows per (rounded-up) floor count.
sns.countplot(data=df, x='floors');

In [None]:
# Number of rows per value of the "condition" column.
sns.countplot(data=df, x='condition');

In [None]:
# Number of rows with (1) and without (0) the half-bathroom flag.
sns.countplot(data=df, x='room_has_bathroom');

In [None]:
# One-hot encode the categorical grade_group column; all other columns pass through unchanged.
df_encoded = pd.get_dummies(data=df, columns=["grade_group"])

In [None]:
# Pairwise correlations of all encoded columns, then each feature's
# correlation with price listed from highest to lowest.
corr_matrix = df_encoded.corr()
price_corr = corr_matrix["price"]
price_corr.sort_values(ascending=False)

In [None]:
# Annotated heatmap of the full correlation matrix.
plt.figure(figsize=(15, 10))
sns.heatmap(data=corr_matrix, annot=True);

In [None]:
# Split the data into the feature matrix X and the target vector y.
# The target itself and four further columns are left out of the features.
excluded_columns = ["price", "sqft_living15", "sqft_lot15", "year", "month"]
X = df_encoded.drop(columns=excluded_columns)
y = df_encoded["price"]

In [None]:
# Split the data into training and testing sets (80% / 20%) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

named_splits = (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
)
for split_name, split_data in named_splits:
    print(f"{split_name} Shape:", split_data.shape)

In [None]:
# Model the target on a log(1 + price) scale; predictions are mapped back with expm1 later.
y_train_log, y_test_log = np.log1p(y_train), np.log1p(y_test)

# Standardise the features: statistics are learned on the training set only
# and then applied unchanged to the test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.impute import SimpleImputer

# Fill any missing values with the column mean learned from the (scaled) training set.
# The scaled arrays are overwritten in place of new names.
imputer = SimpleImputer(strategy="mean").fit(X_train_scaled)
X_train_scaled = imputer.transform(X_train_scaled)
X_test_scaled = imputer.transform(X_test_scaled)

In [None]:
# Project the scaled features onto 10 principal components.
# The projection is fitted on the training set only and reused for the test set.
pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [None]:
# Baseline: decision tree on the PCA features, trained on the log-price target.
# random_state is fixed (same seed as the train/test split) because the tree
# permutes features at each split, so an unseeded fit can change between runs.
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_pca, y_train_log)

In [None]:
# Decision tree: test MAE on the log(1 + price) scale.
pred = dt.predict(X_test_pca)
mean_absolute_error(y_true=y_test_log, y_pred=pred)

In [None]:
# Decision tree: test MAE after mapping predictions and targets back with expm1.
pred = np.expm1(dt.predict(X_test_pca))
mean_absolute_error(y_true=np.expm1(y_test_log), y_pred=pred)

In [None]:
# XGBoost regressor with default settings on the PCA features and log-price target.
xgb = XGBRegressor()
xgb.fit(X=X_train_pca, y=y_train_log)

In [None]:
# XGBoost: test MAE on the log(1 + price) scale.
pred = xgb.predict(X_test_pca)
mean_absolute_error(y_true=y_test_log, y_pred=pred)

In [None]:
# XGBoost: test MAE after mapping predictions and targets back with expm1.
pred = np.expm1(xgb.predict(X_test_pca))
mean_absolute_error(y_true=np.expm1(y_test_log), y_pred=pred)

In [None]:
# Support vector regressor with an RBF kernel on the PCA features and log-price target.
svr = SVR(kernel="rbf")
svr.fit(X=X_train_pca, y=y_train_log)

In [None]:
# SVR: test MAE on the log(1 + price) scale.
pred = svr.predict(X_test_pca)
mean_absolute_error(y_true=y_test_log, y_pred=pred)

In [None]:
# SVR: test MAE after mapping predictions and targets back with expm1.
pred = np.expm1(svr.predict(X_test_pca))
mean_absolute_error(y_true=np.expm1(y_test_log), y_pred=pred)

In [None]:
# Hyper-parameter search for the RBF SVR: 3 x 3 grid over C and gamma, 5-fold CV
# on the training set. No `scoring` is passed, so candidates are ranked by the
# estimator's default score (R^2 for scikit-learn regressors), not by MAE.
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.01, 0.1, 1], 'kernel': ['rbf']}
svr = SVR()
grid_search = GridSearchCV(svr, param_grid, cv=5)
grid_search.fit(X_train_pca, y_train_log)
best_params = grid_search.best_params_

# Model training: GridSearchCV (refit=True by default) has already refitted the
# winning configuration on the whole training set, so reuse that estimator
# instead of paying for a second, identical SVR fit.
best_svr = grid_search.best_estimator_

# Model evaluation: test MAE on the log(1 + price) scale.
pred = best_svr.predict(X_test_pca)
mae = mean_absolute_error(y_test_log, pred)
print("Mean Absolute Error:", mae)

In [None]:
# Show the parameter combination selected by the grid search.
best_params

In [None]:
# Cross-validated score of the selected combination. The search was built without
# a `scoring` argument, so this is the estimator's default score, not an MAE.
grid_search.best_score_

In [None]:
# Refit an RBF SVR with fixed hyper-parameters and report the test MAE (log scale).
# NOTE(review): C=1 and gamma=0.1 are hard-coded; presumably they were copied from a
# previous grid-search result -- confirm they still match `best_params`.
svr = SVR(C=1, gamma=0.1, kernel="rbf").fit(X_train_pca, y_train_log)
pred = svr.predict(X_test_pca)
mean_absolute_error(y_true=y_test_log, y_pred=pred)

In [None]:
# Training-set MAE (log scale), for comparison with the test-set MAE.
pred = svr.predict(X_train_pca)
mean_absolute_error(y_true=y_train_log, y_pred=pred)

In [None]:
# Test MAE of the fixed-parameter SVR after mapping predictions and targets back with expm1.
pred = np.expm1(svr.predict(X_test_pca))
mean_absolute_error(y_true=np.expm1(y_test_log), y_pred=pred)

In [None]:
import pickle

# Persist the fitted SVR. The context manager guarantees the file handle is
# flushed and closed even if pickling fails (the bare open() was never closed).
# NOTE(review): only the model is saved here; the scaler, imputer and PCA fitted
# above are not, so new data cannot be prepared from this file alone.
# NOTE(review): the file is named 'lin_model.pkl' although it stores an SVR; the
# name is kept unchanged in case other code loads it.
with open('lin_model.pkl', 'wb') as model_file:
    pickle.dump(svr, model_file)