In [None]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import RobustScaler , StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error , r2_score ,mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression , Ridge , Lasso
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor as KNN
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
import warnings
warnings.filterwarnings('ignore')

# Reading  The Data  

In [None]:
# Read the house-sales CSV; the `date` column is parsed into datetimes on load.
df = pd.read_csv(
    "/kaggle/input/housesalesprediction/kc_house_data.csv",
    parse_dates=["date"],
)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Preview the last rows of the freshly loaded frame.
df.tail()

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# The `id` column is removed before any analysis is done.
df = df.drop(columns=["id"])

# EDA


In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Dtype of every column.
df.dtypes

In [None]:
# Summary statistics of the frame.
df.describe()

In [None]:
# One histogram (20 bins, KDE overlay) per column of df, laid out on a 4x5 grid.
plt.figure(figsize=(24, 16))
for slot, column_name in enumerate(df.columns.to_list(), start=1):
    plt.subplot(4, 5, slot)
    sns.histplot(data=df[column_name], bins=20, kde=True)
    plt.title(column_name)
plt.tight_layout()
plt.show()

In [None]:
# Frequency of each distinct `waterfront` value.
df["waterfront"].value_counts().reset_index()

In [None]:
# Frequency of each distinct `view` value.
df["view"].value_counts().reset_index()

In [None]:
# Frequency of each distinct `yr_renovated` value.
df["yr_renovated"].value_counts().reset_index()

### Most values in these columns (yr_renovated, view, waterfront) are zero, so we drop them (zipcode is dropped in the same step)

In [None]:
# Discard the three sparse columns inspected above, together with zipcode.
df = df.drop(columns=["waterfront", "view", "zipcode", "yr_renovated"])

In [None]:
# Box plot of every non-date column, to eyeball outliers.
# The subplot slot is the 1-based position within the columns actually plotted,
# so the layout no longer depends on where 'date' sits in df.columns.
plt.figure(figsize=(24, 16))
boxplot_columns = df.drop('date', axis=1).columns.to_list()
for slot, feature in enumerate(boxplot_columns, start=1):
    plt.subplot(4, 4, slot)
    sns.boxplot(data=df[feature])
    plt.title(feature)
plt.tight_layout()
plt.show()

# scaling data with Standard Scaler

In [None]:
datetime_column = df['date']  # kept aside; re-attached after the power transform

# These columns are excluded from scaling; every other float64/int64 column is standardised.
unscaled_columns = ["grade", "condition", "floors", "bathrooms", "bedrooms"]
numeric_columns = df.drop(unscaled_columns, axis=1).select_dtypes(include=['float64', 'int64']).columns
numeric_data = df[numeric_columns]

# Apply StandardScaler to the numeric data.
# NOTE(review): the scaler is fitted on the whole frame before the train/test split,
# and any numeric target column (presumably `price`) is scaled along with the
# features - confirm this is intended.
scaler = StandardScaler()
scaled_numeric_data = scaler.fit_transform(numeric_data)

# Wrap the scaled array back into a DataFrame, keeping df's row labels so that
# later index-aligned assignments match row for row.
scaled_df = pd.DataFrame(scaled_numeric_data, columns=numeric_columns, index=df.index)

# Transform the data with a power transform (method = yeo-johnson) to bring the distributions closer to normal

In [None]:
numeric_data = scaled_df[numeric_columns]

# Apply the Yeo-Johnson power transform to the scaled numeric data.
transformer = PowerTransformer(method="yeo-johnson")
transformed_numeric_data = transformer.fit_transform(numeric_data)

# Rebuild the frame on df's row labels: the 'date' and pass-through columns
# assigned below are aligned by index, so the labels must match df's.
transformed_df = pd.DataFrame(transformed_numeric_data, columns=numeric_columns, index=df.index)
transformed_df['date'] = datetime_column
passthrough_columns = ['grade', 'condition', 'floors', 'bathrooms', 'bedrooms']
transformed_df[passthrough_columns] = df[passthrough_columns]

# Remove outliers using IQR

In [None]:
def outliers(df, ft):
    """Return the index labels of rows whose `ft` value is an IQR outlier.

    A value is an outlier when it lies strictly below Q1 - 1.5*IQR or strictly
    above Q3 + 1.5*IQR, with the quartiles computed from column `ft` of `df`.

    Parameters:
        df: DataFrame holding the column to inspect.
        ft: name of the column.

    Returns:
        The subset of `df.index` whose rows fall outside the fences.
    """
    values = df[ft]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    too_low = values < first_quartile - spread * 1.5
    too_high = values > third_quartile + spread * 1.5
    return df.index[too_low | too_high]

In [None]:
# Columns present after scaling and the power transform.
transformed_df.columns

In [None]:
# Gather the row labels flagged as outliers in any of the transformed numeric columns.
# A label shows up once per column that flags it; remove() de-duplicates them.
index_list = [
    row_label
    for column_name in numeric_columns
    for row_label in outliers(transformed_df, column_name)
]

In [None]:
def remove(df, ls):
    """Return `df` without the rows whose index labels appear in `ls`.

    Duplicate labels in `ls` are collapsed first. The input frame itself is
    not modified; a new frame is returned.
    """
    unique_labels = sorted(set(ls))
    return df.drop(unique_labels)

In [None]:
# Drop every row flagged as an outlier in at least one numeric column.
transformed_df = remove(transformed_df,index_list)

In [None]:
# (rows, columns) left after outlier removal.
transformed_df.shape

In [None]:
# Histograms (20 bins, KDE overlay) of every column after transformation and
# outlier removal, placed on a 5x4 grid.
plt.figure(figsize=(24, 16))
for slot, column_name in enumerate(transformed_df.columns.to_list(), start=1):
    plt.subplot(5, 4, slot)
    sns.histplot(data=transformed_df[column_name], bins=20, kde=True)
    plt.title(column_name)
plt.tight_layout()
plt.show()

In [None]:
# Box plots of every non-date column after transformation and outlier removal.
# Slots are numbered over the plotted columns only, so no empty panel is left
# at the position that 'date' occupies in transformed_df.columns.
plt.figure(figsize=(24, 16))
boxplot_columns = transformed_df.drop('date', axis=1).columns.to_list()
for slot, feature in enumerate(boxplot_columns, start=1):
    plt.subplot(5, 4, slot)
    sns.boxplot(data=transformed_df[feature])
    plt.title(feature)
plt.tight_layout()
plt.show()

# correlation

In [None]:
# Pairwise correlations of the numeric columns. transformed_df still carries the
# datetime 'date' column here, so numeric_only=True excludes it explicitly
# instead of relying on the pandas-version-dependent default.
correlation_matrix = transformed_df.corr(numeric_only=True)
# Hide the upper triangle (and diagonal): the matrix is symmetric.
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
plt.figure(figsize=(15, 10))
sns.heatmap(correlation_matrix, annot=True, mask=mask, cmap='Blues')
plt.title('Correlation Heatmap')
plt.show()

In [None]:
#correlation_matrix = transformed_df.corr()
#plt.figure(figsize=(15, 10))
#sns.heatmap(correlation_matrix, annot=True, cmap='Blues')
#plt.title('Correlation Heatmap')
#plt.show()

In [None]:
# Column list, for reference while choosing which correlated features to drop.
transformed_df.columns

In [None]:
# Correlation with price: sqft_living versus sqft_above.
for candidate in ('sqft_living', 'sqft_above'):
    print(transformed_df[candidate].corr(transformed_df['price']))

In [None]:
# Correlation with price: sqft_living15 versus sqft_living.
for candidate in ('sqft_living15', 'sqft_living'):
    print(transformed_df[candidate].corr(transformed_df['price']))

In [None]:
# Correlation with price: sqft_lot versus sqft_lot15.
for candidate in ('sqft_lot', 'sqft_lot15'):
    print(transformed_df[candidate].corr(transformed_df['price']))

In [None]:
# Remove the redundant area features compared above, plus the 'date' column.
transformed_df = transformed_df.drop(
    columns=['sqft_above', 'sqft_living15', 'sqft_lot15', 'date']
)

In [None]:
# Correlation heatmap of the columns that remain after the drop above.
correlation_matrix = transformed_df.corr()
plt.figure(figsize=(15, 10))
sns.heatmap(data=correlation_matrix, cmap='Blues', annot=True)
plt.title('Correlation Heatmap')
plt.show()

# Split The Data

In [None]:
# Separate the regression target from the feature matrix.
target = 'price'
y = transformed_df[target]
X = transformed_df.drop(target, axis=1)

In [None]:
# Hold out 25 % of the rows for testing; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# LinearRegression 

In [None]:
# Ordinary least-squares baseline, fitted on the training split.
Lrg = LinearRegression()
Lrg.fit(X_train , y_train)

In [None]:
# Linear-model predictions for the held-out rows.
y_pred = Lrg.predict(X_test)

In [None]:
# Linear-model score on the training split.
Lrg.score(X_train , y_train)

In [None]:
# Linear-model score on the test split.
Lrg.score(X_test , y_test)

In [None]:
# R² of the linear model on the held-out split, rounded to four decimals.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("Test Accuracy:", round(r2, 4))

In [None]:
# Mean absolute error of the linear model on the held-out split.
# It is an error metric (lower is better), so it is labelled as MAE rather
# than as an accuracy.
mae = mean_absolute_error(y_test, y_pred)
print("Test MAE:", round(mae, 4))

In [None]:
# Mean squared error of the linear model on the held-out split.
# The result is stored under `mse`: binding it to `mean_squared_error` would
# overwrite the imported function and break every later call to it.
mse = mean_squared_error(y_test, y_pred)
print("Test MSE:", round(mse, 4))

In [None]:
# 5-fold cross-validation of the linear model over the full X / y;
# the per-fold scores are displayed as the cell output.
cv=cross_val_score(Lrg, X, y,cv=5) 
cv

In [None]:
# Average of the per-fold scores from the linear-model cross-validation.
print("the mean of cross_val_score is ",cv.mean())

In [None]:
# Predicted vs. actual target for the linear model; the red line marks a perfect fit (y = x).
plt.scatter(x=y_test, y=y_pred, color="b")
plt.plot(y_test, y_test, color="r")
plt.title("linear Regresion");

In [None]:
# Fitted linear-regression coefficients, one per feature. These are signed
# coefficients, not Gini importances, so the axis is labelled accordingly.
importances = Lrg.coef_
features = X_test.columns
feat_imp = pd.Series(importances, index=features)
# Rank by absolute magnitude so that large negative coefficients are not
# pushed out of the top 10; the bars still show the signed values.
ranked_by_magnitude = feat_imp.loc[feat_imp.abs().sort_values().index]
ranked_by_magnitude.tail(10).plot(kind='barh')
plt.xlabel("Coefficient")
plt.ylabel("Feature")
plt.title("Feature Importance");

# KNN

In [None]:
# K-nearest-neighbours regressor: 7 neighbours, uniform weights, Manhattan distance.
kn= KNN(n_neighbors=7,weights="uniform",metric="manhattan")
kn.fit(X_train , y_train)

In [None]:
# KNN score on the training split.
kn.score(X_train,y_train)

In [None]:
# KNN predictions for the held-out rows.
y_kn_pred = kn.predict(X_test)

In [None]:
# KNN score on the test split.
kn.score(X_test,y_test)

In [None]:
# R² of the KNN model on the held-out split, rounded to four decimals.
r2 = r2_score(y_true=y_test, y_pred=y_kn_pred)
print("Test Accuracy:", round(r2, 4))

In [None]:
# Mean absolute error of the KNN model on the held-out split.
# It is an error metric (lower is better), so it is labelled as MAE rather
# than as an accuracy.
mae = mean_absolute_error(y_test, y_kn_pred)
print("Test MAE:", round(mae, 4))

In [None]:
# 10-fold cross-validation of the KNN model over the full X / y;
# the per-fold scores are displayed as the cell output.
cv=cross_val_score(kn, X, y,cv=10) 
cv

In [None]:
# Average of the per-fold scores from the KNN cross-validation.
print("the mean of cross_val_score is ",cv.mean())

In [None]:
# Predicted vs. actual target for KNN; the red line marks a perfect fit (y = x).
plt.scatter(x=y_test, y=y_kn_pred, color="b")
plt.plot(y_test, y_test, color="r")
plt.title("KNN");

# XGBoost

In [None]:
# Gradient-boosted trees: 2000 estimators, maximum depth 7, learning rate 0.01.
xgb = XGBRegressor(n_estimators= 2000 , max_depth= 7 , learning_rate = 0.01)


In [None]:
# Fit on the training split; the trailing ';' suppresses the estimator repr.
xgb.fit(X_train , y_train);

In [None]:
# XGBoost predictions for the held-out rows.
y_xgb_pred=xgb.predict(X_test)

In [None]:
# score() of the boosted model on the training and test splits.
xgb_train_score = xgb.score(X_train, y_train)
xgb_test_score = xgb.score(X_test, y_test)
print("train accuracy", xgb_train_score)
print("test accuracy", xgb_test_score)

In [None]:
# R² of the XGBoost model on the held-out split, rounded to four decimals.
r2 = r2_score(y_true=y_test, y_pred=y_xgb_pred)
print("Test Accuracy:", round(r2, 4))

In [None]:
# Mean absolute error of the XGBoost model on the held-out split.
# Uses the XGBoost predictions (y_xgb_pred); `y_pred` holds the linear
# model's predictions and would report the wrong model's error.
mae = mean_absolute_error(y_test, y_xgb_pred)
print("Test MAE:", round(mae, 4))

In [None]:
# 10-fold cross-validation of the XGBoost model over the full X / y;
# the per-fold scores are displayed as the cell output.
cv=cross_val_score(xgb, X, y,cv=10) 
cv

In [None]:
# Average of the per-fold scores from the XGBoost cross-validation.
print("the mean of cross_val_score is ",cv.mean())

In [None]:
# Predicted vs. actual target for XGBoost; the red line marks a perfect fit (y = x).
plt.scatter(x=y_test, y=y_xgb_pred, color="b")
plt.plot(y_test, y_test, color="r")
plt.title("XGBOST");

# DecisionTree

In [None]:
# Decision-tree regressor capped at depth 9, with a fixed seed.
r_dt = DecisionTreeRegressor(random_state=42,max_depth=9)


# Fit on the training split.
r_dt.fit(X_train , y_train)

In [None]:
# R² of the decision tree on both splits, multiplied by 100.
dt_train_pred = r_dt.predict(X_train)
dt_test_pred = r_dt.predict(X_test)
dt_train_r2 = r2_score(y_train, dt_train_pred) * 100
dt_test_r2 = r2_score(y_test, dt_test_pred) * 100
print(f'R² score for train : {dt_train_r2}')
print(f'R² score for test: {dt_test_r2}')

In [None]:
# Mean squared error of the decision tree on both splits, multiplied by 100.
dt_train_pred = r_dt.predict(X_train)
dt_train_mean = mean_squared_error(y_train, dt_train_pred) * 100
dt_test_pred = r_dt.predict(X_test)
dt_test_mean = mean_squared_error(y_test, dt_test_pred) * 100

print(f'mean_squared_error score train : {dt_train_mean}')
print(f'mean_squared_error score for test : {dt_test_mean}')

In [None]:
# 15-fold cross-validation of the decision tree over the full X / y;
# the per-fold scores are displayed as the cell output.
cv=cross_val_score(r_dt, X, y,cv=15) 
cv

In [None]:
# Average of the per-fold scores from the decision-tree cross-validation.
print("the mean of cross_val_score is ",cv.mean())

# SVM

In [None]:
# Support-vector regressor with an RBF kernel and C=30.
svr_reg = SVR(kernel='rbf',C=30)

# Fit on the training split.
svr_reg .fit(X_train , y_train)



In [None]:
# R² of the SVR model on both splits, multiplied by 100.
svr_train_pred = svr_reg.predict(X_train)
svr_test_pred = svr_reg.predict(X_test)
svr_train_r2 = r2_score(y_train, svr_train_pred) * 100
svr_test_r2 = r2_score(y_test, svr_test_pred) * 100
print(f'R² score for trin : {svr_train_r2}')
print(f'R² score for test: {svr_test_r2}')

In [None]:
# Mean squared error of the SVR model on both splits, multiplied by 100.
svr_train_pred = svr_reg.predict(X_train)
svr_train_mean = mean_squared_error(y_train, svr_train_pred) * 100
svr_test_pred = svr_reg.predict(X_test)
svr_test_mean = mean_squared_error(y_test, svr_test_pred) * 100

print(f'mean_squared_error score for train : {svr_train_mean}')
print(f'mean_squared_error score for test : {svr_test_mean}')

In [None]:
# 3-fold cross-validation of the SVR model over the full X / y;
# the per-fold scores are displayed as the cell output.
cv=cross_val_score(svr_reg, X, y,cv=3) 
cv

In [None]:
# Average of the per-fold scores from the SVR cross-validation.
print("the mean of cross_val_score is ",cv.mean())

# ANN

In [None]:
# Re-print the split shapes; the feature count fixes the network's input width.
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

In [None]:
# Fully connected regressor: two hidden ReLU layers of 64 units each and a
# single linear output unit, trained with Adam on mean squared error.
# The input width is read from X_train rather than hard-coded, so the model
# keeps matching the data if the set of feature columns changes.
n_features = X_train.shape[1]

model = Sequential()

model.add(Dense(64, activation='relu', input_dim=n_features))
model.add(Dense(64, activation='relu'))
model.add(Dense(1))

model.compile(optimizer='adam', loss="mean_squared_error")
model.summary()

In [None]:
# Stop once val_loss has not improved for 5 consecutive epochs.
# restore_best_weights=True rolls the model back to the best epoch seen;
# without it the model keeps the weights of the last, worse epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [None]:
# Train for at most 40 epochs with batch size 16 and validation_split=0.2;
# the early_stopping callback may end training sooner.
history = model.fit(X_train, y_train, epochs=40, batch_size=16, validation_split=0.2, callbacks=[early_stopping])

In [None]:
# R² of the neural network on both splits, multiplied by 100.
ann_train_pred = model.predict(X_train)
ann_train_r2 = r2_score(y_train, ann_train_pred) * 100
ann_test_pred = model.predict(X_test)
ann_test_r2 = r2_score(y_test, ann_test_pred) * 100

print(f'R² score for train: {ann_train_r2}')
print(f'R² score for test : {ann_test_r2}')

In [None]:
# Mean squared error of the neural network on both splits, multiplied by 100.
ann_train_pred = model.predict(X_train)
ann_train_mean = mean_squared_error(y_train, ann_train_pred) * 100
ann_test_pred = model.predict(X_test)
ann_test_mean = mean_squared_error(y_test, ann_test_pred) * 100

print(f'mean_squared_error score for train: {ann_train_mean}')
print(f'mean_squared_error score for test : {ann_test_mean}')

In [None]:
# Side-by-side summary of the SVM, decision-tree and ANN metrics computed above.
# The 'Mean *' columns hold the mean-squared-error values (already multiplied
# by 100 where they were computed).
data = {
    'Model': ['SVM', 'Decision Tree', 'ANN'],
    'R2 Train': [svr_train_r2, dt_train_r2, ann_train_r2],
    'R2 Test': [svr_test_r2, dt_test_r2, ann_test_r2],
    'Mean Train': [svr_train_mean, dt_train_mean, ann_train_mean],
    'Mean Test': [svr_test_mean, dt_test_mean, ann_test_mean]
}

# Bound to its own name so the raw data frame `df` is not overwritten by the
# results table.
model_comparison = pd.DataFrame(data)
model_comparison

In [None]:
# Persist the trained Keras model to 'model_optimal.h5' in the working directory.
model.save('model_optimal.h5')