<a href="https://colab.research.google.com/github/salahhesham01/House-price-prediction/blob/main/House_Price_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part 1

## 1. Data Exploration

In [None]:
# Install ydata-profiling, which provides the ProfileReport used for data exploration below.
!pip install ydata-profiling


In [None]:
from ydata_profiling import ProfileReport

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import boxcox
import math, copy

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the CSV stored there can be read.
drive.mount('/content/drive')
# Raw training data; kept unmodified so later cells can work on a copy.
original_data = pd.read_csv('/content/drive/MyDrive/Assignment1/house-prices-advanced-regression-techniques/train.csv')

In [None]:
# Work on a copy so the raw frame in `original_data` stays untouched.
copy_data = original_data.copy()

In [None]:
# Build a profiling report of the training data (minimal=True is passed to ProfileReport).
report = ProfileReport(copy_data, minimal=True)
# Bare expression so the notebook renders the report as the cell output.
report

## 2. Data Cleaning

In [None]:
# check and deal with all the missing values in the data
# check and deal with duplicated rows in the data
# check and deal with outliers
# Remove any variable that is uniformly distributed

In [None]:
# Preview only the first rows instead of rendering the whole frame.
copy_data.head()

In [None]:
# Check and deal with all the missing values in the data.
# These four columns are dropped outright rather than imputed.
copy_data = copy_data.drop(['Alley', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)
for col in copy_data.columns:
    if copy_data[col].dtype in ['int64', 'float64']:
        # Assign the result back: `copy_data[col].fillna(..., inplace=True)` operates on a
        # temporary Series and is not guaranteed to update the frame itself.
        copy_data[col] = copy_data[col].fillna(copy_data[col].mean())
    elif copy_data[col].dtype == 'category':
        # mode() returns a Series; fillna needs its first entry as a scalar, otherwise
        # the fill is aligned by index label instead of being applied to every NaN.
        modes = copy_data[col].mode()
        if not modes.empty:
            copy_data[col] = copy_data[col].fillna(modes.iloc[0])
# NOTE(review): text columns read by pd.read_csv normally have dtype 'object', not
# 'category', so the branch above presumably never runs and categorical NaNs are
# left in place. Enabling it for 'object' columns would change the later encoding
# steps (train and test) and should be done for both data sets together.

In [None]:
# 'Id' is only a row identifier, so it is removed from the feature frame.
copy_data = copy_data.drop(columns=['Id'])

In [None]:
# Check and deal with duplicated rows in the data.
# keep='first' retains one copy of each duplicated row; keep=False would delete
# every copy, discarding the observation entirely.
copy_data = copy_data.drop_duplicates(keep='first')

In [None]:
# Remove any variable that carries no information: a column with at most one
# distinct value (nunique() <= 1) is collected first, then all are dropped at once.
constant_cols = [col for col in copy_data.columns if copy_data[col].nunique() <= 1]
copy_data = copy_data.drop(constant_cols, axis=1)

## 3. Feature Engineering

In [None]:
# create at least one new feature and check its relationship with the target variable

In [None]:
def get_season(row):
  """Return the season name for the month stored in ``row.MoSold``.

  Months 1-2 map to 'Winter', 3-5 to 'Spring', 6-8 to 'Summer' and 9-11 to
  'Fall'. Any other value (including 12) falls back to 'Winter'.
  """
  month = row.MoSold
  season_ranges = (
    (range(1, 3), 'Winter'),
    (range(3, 6), 'Spring'),
    (range(6, 9), 'Summer'),
    (range(9, 12), 'Fall'),
  )
  for months, season in season_ranges:
    if month in months:
      return season
  # December, and anything outside 1-11, is treated as winter.
  return 'Winter'
# Apply get_season row by row (axis=1) to build the new 'Season' feature.
copy_data['Season'] = copy_data.apply(get_season, axis=1)
# Display the new column as the cell output.
copy_data['Season']

In [None]:
# create at least one new feature and check its relationship with the target variable
# HouseAge = year of sale minus year of construction.
copy_data['HouseAge']=copy_data['YrSold']-copy_data['YearBuilt']
# Display the new column as the cell output.
copy_data['HouseAge']

## 4. Data Preprocessing

In [None]:
# Encode categorical variables: one-hot encoding for nominal columns and integer
# label codes for the quality-rating columns listed in `ord_col`.

ord_col=['FireplaceQu','ExterQual','ExterCond','BsmtQual', 'BsmtCond','HeatingQC','KitchenQual','GarageQual','GarageCond']

# One-hot encode every remaining string column in a single call. The dummy
# columns are named '<column>_<value>' and appended after the untouched columns,
# the same layout the previous drop/join loop produced.
nominal_cols = [col for col in copy_data.columns
                if copy_data[col].dtype == 'object' and col not in ord_col]
if nominal_cols:
    copy_data = pd.get_dummies(copy_data, columns=nominal_cols)

# Label-encode the rating columns with ONE shared vocabulary. Refitting the encoder
# per column gave the same label different codes in different columns, and left
# `le` remembering only the last column when it is reused on other data.
le = LabelEncoder()
le.fit(copy_data[ord_col].to_numpy().ravel())
for col in ord_col:
    copy_data[col] = le.transform(copy_data[col])
# NOTE(review): LabelEncoder numbers labels in sorted order, not in quality order;
# an explicit rating -> rank mapping would be needed for a truly ordinal encoding.

In [None]:
# use minmax scaler or standard scaler to make all numerical variables within the same range
normalizer = MinMaxScaler()
# Every int64/float64 column is selected.
# NOTE(review): this presumably includes the target 'SalePrice', so the model is
# trained on (and predicts) scaled prices - confirm and invert before reporting.
numerical_cols = copy_data.select_dtypes(include=['float64', 'int64']).columns
# Fit and transform in one step on the whole frame, i.e. before the train/validation split.
copy_data[numerical_cols] = normalizer.fit_transform(copy_data[numerical_cols])

In [None]:
# Try to make at least one variable that is not normally distributed closer to normal.
# Box-Cox needs strictly positive input, so the column is shifted by +1 first.
# The fitted lambda is kept in `yearbuilt_lambda` (instead of being thrown away)
# so the identical transform can be re-applied to other data.
copy_data['YearBuilt'], yearbuilt_lambda = boxcox(copy_data['YearBuilt'] + 1)

# Part 2

## 5. Training And Evaluation

In [None]:
# split your train data to be 80% training and 20% validation
# use l2 regularization to reduce overfitting
# use early stopping to reduce overfitting
# train with at least 2 different learning rates and decide which experiment was better
# print the validation root mean square error after finishing the training with the best model

In [None]:
# split your train data to be 80% training and 20% validation
# Feature matrix: every column except the target. At this point it still holds ALL
# rows; it is narrowed to the training split by train_test_split further down.
X_train = copy_data.drop('SalePrice',axis=1)

In [None]:
# Target vector (all rows for now; split into train/validation further down).
y_train = copy_data['SalePrice']

In [None]:
from sklearn.model_selection import train_test_split

# Split data into train and validation sets with shuffling
# test_size=0.2 gives the 80/20 split; random_state=1 makes the shuffle reproducible.
# X_train / y_train are rebound from "all rows" to the training portion only.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, random_state=1)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature using statistics learned from the training split only.
# StandardScaler works column by column, so fitting once on the whole frame gives
# the same values as refitting per column, while leaving `scaler` holding the
# mean/scale of every feature instead of just the last one.
scaler = StandardScaler()
feature_names = X_train.columns
X_train = pd.DataFrame(
    scaler.fit_transform(X_train.astype('float64')),
    columns=feature_names,
    index=X_train.index,
)
X_val = pd.DataFrame(
    scaler.transform(X_val.astype('float64')),
    columns=feature_names,
    index=X_val.index,
)

In [None]:
# Convert the pandas objects to plain NumPy arrays for the hand-written
# gradient-descent routines below.
X_train, y_train = X_train.to_numpy(), y_train.to_numpy()
X_val, y_val = X_val.to_numpy(), y_val.to_numpy()

In [None]:
def compute_cost_linear_reg(X, y, w, b, lambda_):
    """Regularized squared-error cost for linear regression.

    J = 1/(2m) * sum_i (x_i . w + b - y_i)^2  +  lambda_/(2m) * sum_j w_j^2

    Args:
        X: (m, n) feature matrix.
        y: (m,) target values.
        w: (n,) weights.
        b: scalar bias (not regularized).
        lambda_: L2 regularization strength.

    Returns:
        The scalar total cost (data term plus regularization term).
    """
    X = np.asarray(X)
    y = np.asarray(y).ravel()
    w = np.asarray(w)

    m = X.shape[0]
    # Vectorized residuals for all m examples at once (replaces the per-row loop).
    errors = X @ w + b - y
    cost = np.sum(errors ** 2) / (2 * m)
    reg_cost = (lambda_ / (2 * m)) * np.sum(w ** 2)
    return cost + reg_cost

In [None]:
def regularized_gradient_function(X, y, w, b, lambda_):
    """Gradient of the L2-regularized squared-error cost for linear regression.

    Args:
        X: (m, n) feature matrix.
        y: (m,) target values.
        w: (n,) weights.
        b: scalar bias.
        lambda_: L2 regularization strength (applied to w only, not to b).

    Returns:
        (dj_db, dj_dw): the scalar gradient with respect to b and the (n,)
        gradient with respect to w, in that order.
    """
    X = np.asarray(X)
    y = np.asarray(y).ravel()
    w = np.asarray(w)

    m = X.shape[0]
    # Residual for every example; the matrix products below replace the
    # original O(m*n) Python double loop.
    errors = X @ w + b - y
    dj_dw = (X.T @ errors) / m + (lambda_ / m) * w
    dj_db = np.sum(errors) / m

    return dj_db, dj_dw

In [None]:
def regularized_gradient_descent_with_early_stopping(X_train, y_train, X_val,y_val, w_in, b_in, cost_function, gradient_function, alpha, num_iters,lambda_, early_stopping_iters):
    """
    Batch gradient descent for (w, b) with L2 regularization and early stopping.

    Takes up to num_iters gradient steps with learning rate alpha. After each
    step the validation cost is evaluated; training stops once it has failed to
    improve for early_stopping_iters consecutive iterations.

    Args:
        X_train, y_train: training features and targets.
        X_val, y_val: validation features and targets (only used for monitoring).
        w_in, b_in: initial weights and bias; w_in is not modified.
        cost_function: callable (X, y, w, b, lambda_) -> scalar cost.
        gradient_function: callable (X, y, w, b, lambda_) -> (dj_db, dj_dw).
        alpha: learning rate.
        num_iters: maximum number of iterations.
        lambda_: regularization strength passed through to both callables.
        early_stopping_iters: patience, in iterations.

    Returns:
        (w_best, b_best, J_history_validation): the parameters with the lowest
        validation cost seen, and the validation cost after every iteration run.
        If no iteration improves on the start (e.g. num_iters == 0), the initial
        parameters are returned.
    """
    J_history_validation = []
    w = copy.deepcopy(w_in)  # avoid modifying the caller's w
    b = b_in

    # Track the best parameters directly instead of storing every (w, b) and
    # indexing afterwards; the stored lists could get out of step with the cost
    # history when the loop exited through `break`.
    best_w, best_b = w, b
    # Start from +inf so the first iteration always counts as an improvement; a
    # fixed "very big number" stops training spuriously when costs start above it.
    min_validation_cost = float('inf')
    counter = 0  # consecutive iterations without validation improvement

    for i in range(num_iters):

        # Use the callables that were passed in rather than hard-coded functions.
        dj_db, dj_dw = gradient_function(X_train, y_train, w, b, lambda_)

        # `w - ...` builds a new array each step, so best_w is never overwritten.
        w = w - alpha * dj_dw
        b = b - alpha * dj_db

        validation_cost = cost_function(X_val, y_val, w, b, lambda_)
        J_history_validation.append(validation_cost)

        if validation_cost < min_validation_cost:
            min_validation_cost = validation_cost
            best_w, best_b = w, b
            counter = 0
        else:
            # Also reached when the cost is NaN, so a diverging run stops early.
            counter += 1
            if counter == early_stopping_iters:
                print("Early Stopping Reached In Iteration:",i)
                break

        # Progress report every 100 iterations; the training cost is only needed here.
        if i% 100 == 0:
            train_cost = cost_function(X_train, y_train, w, b, lambda_)
            print(f"Iteration {i:4d} -- Train-Cost {train_cost:8.2f}   -- Validation-Cost {validation_cost:8.2f}  ")

    return best_w, best_b, J_history_validation

In [None]:
# Initial parameters: bias starts at 200, weights start at zero (one per feature column).
initial_b =200
num_features = X_train.shape[1]
initial_w = np.zeros(num_features)
# some gradient descent settings
num_iters = 10000
alpha = 0.01
lambda_ = 0.001  #It's common to use lambda value from 0 to 0.1
# Patience: stop after this many consecutive iterations without validation improvement.
early_stopping_iters = 10

# run gradient descent
# Returns the best weights/bias found and the validation-cost history.
w_final, b_final, J_hist = regularized_gradient_descent_with_early_stopping(X_train, y_train, X_val,y_val, initial_w, initial_b, compute_cost_linear_reg, regularized_gradient_function, alpha, num_iters,lambda_, early_stopping_iters)

In [None]:
def predict(X, w, b):
    """Linear-model predictions ``X @ w + b`` for every row of X.

    Args:
        X: (m, n) feature matrix.
        w: (n,) weights.
        b: scalar bias.

    Returns:
        A float64 array of shape (m,) with one prediction per row.
    """
    X = np.asarray(X)
    # One matrix-vector product replaces the per-row Python loop.
    y_pred = X @ np.asarray(w) + b
    return np.asarray(y_pred, dtype=np.float64).reshape(X.shape[0])

In [None]:
# Show the first validation example together with the learned parameters.
print(f"X: {X_val[0]}")
print(f"W {w_final}")
print(f"b {b_final}")

# make a prediction
# Predict the whole validation set, then print only the first value.
f_wb = predict(X_val,w_final,b_final)
print(f"Prediction: {f_wb[0]}")

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Make predictions on the validation set
y_val_pred = predict(X_val, w_final, b_final)

# Calculate RMSE
mse = mean_squared_error(y_val, y_val_pred)
rmse = np.sqrt(mse)

# NOTE(review): y_val comes from copy_data['SalePrice'] after MinMax scaling, so this
# RMSE is presumably in scaled (0-1) units rather than in price units - confirm.
print(f'Validation RMSE: {rmse}')

In [None]:
# Raw Kaggle test set; `data` keeps the untouched frame (including its 'Id' column).
data = pd.read_csv('/content/drive/MyDrive/Assignment1/house-prices-advanced-regression-techniques/test.csv')
# Working copy that the preprocessing below modifies.
test_data=data.copy()

In [None]:
# Prepare the test set with the same steps that were applied to the training data.

# Check and deal with missing values
test_data = test_data.drop(['Alley', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)
for col in test_data.columns:
    if test_data[col].dtype in ['int64', 'float64']:
        # Assign back: fillna(..., inplace=True) on `test_data[col]` acts on a
        # temporary Series and is not guaranteed to update the frame.
        test_data[col] = test_data[col].fillna(test_data[col].mean())
    elif test_data[col].dtype == 'category':
        # mode() returns a Series; use its first entry as the scalar fill value.
        modes = test_data[col].mode()
        if not modes.empty:
            test_data[col] = test_data[col].fillna(modes.iloc[0])

test_data = test_data.drop(columns=['Id'])

# Rows are deliberately NOT de-duplicated and constant columns are NOT dropped here:
# every test row needs a prediction, and which columns exist must be decided by the
# training data (the alignment step later drops or adds columns as needed).

# Feature Engineering - build the same engineered features as for training,
# including 'Season', which was previously missing from the test set.
test_data['Season'] = test_data.apply(get_season, axis=1)
test_data['HouseAge'] = test_data['YrSold'] - test_data['YearBuilt']

# Data Preprocessing
# One-hot encode the nominal string columns in a single call; the dummy columns
# are named '<column>_<value>'.
test_nominal_cols = [col for col in test_data.columns
                     if test_data[col].dtype == 'object' and col not in ord_col]
if test_nominal_cols:
    test_data = pd.get_dummies(test_data, columns=test_nominal_cols)

# Reuse the encoder fitted earlier in the notebook for the rating columns.
# NOTE(review): `le` must have been fitted on a vocabulary covering every rating
# column for these codes to match the training codes - verify how it was fitted.
for col in ord_col:
    test_data[col] = le.transform(test_data[col])

# Try to make at least one variable that is not following a normal distribution to be normally distributed
# NOTE(review): Box-Cox is refitted here, so its lambda can differ from the one
# used on the training data - confirm this is acceptable.
test_data['YearBuilt'] = test_data['YearBuilt'] + 1
test_data['YearBuilt'], _ = boxcox(test_data['YearBuilt'])

X_test = test_data

# Standardize numerical features
# NOTE(review): the scaler is refitted on each test column, i.e. test statistics are
# used instead of the training statistics - confirm this is intended.
for col_name in list(X_test.columns.values):
    X_test[col_name] = scaler.fit_transform(X_test[[col_name]])

In [None]:
# Repeat the validation RMSE computed earlier, for reference before the submission step.
print(f'Validation RMSE: {rmse}')

## 6. Submission

In [None]:
# predict on the test set, create a csv file and submit on kaggle

In [None]:
# Give the test frame exactly the columns of the training frame, in the same order:
# columns missing from the test set are created and filled with 0, and columns that
# exist only in the test set are discarded. reindex builds a new frame in one step,
# instead of inserting columns one by one into the frame that `test_data` also refers to.
# copy_data.columns still contains 'SalePrice', so X_test gets an all-zero
# 'SalePrice' column here; it has to be removed before predicting.
X_test = X_test.reindex(columns=copy_data.columns, fill_value=0)

In [None]:
# Convert to NumPy arrays
# predict() works on plain arrays, matching what was done for X_train / X_val.
X_test = X_test.to_numpy()

In [None]:
# Remove the placeholder 'SalePrice' column so X_test has the same features as X_train.
# X_test was ordered like copy_data.columns, where 'SalePrice' is NOT the last column
# (engineered and one-hot columns were appended after it), so deleting index -1 would
# drop a real feature and shift every later column out of line with the weights.
sale_price_position = copy_data.columns.get_loc('SalePrice')
# Only delete while the placeholder is still present, so re-running the cell is harmless.
if X_test.shape[1] == len(copy_data.columns):
    X_test = np.delete(X_test, sale_price_position, axis=1)


In [None]:
# Predict the test set with the best weights and bias found during training.
y_test_pred = predict(X_test, w_final, b_final)

In [None]:
# The model was trained on the MinMax-scaled target, so its predictions live in that
# scaled space. Undo the scaling with the statistics `normalizer` stored for the
# 'SalePrice' column: x = x_scaled * data_range_ + data_min_.
if 'SalePrice' in numerical_cols:
    sale_price_pos = list(numerical_cols).index('SalePrice')
    sale_price_pred = (y_test_pred * normalizer.data_range_[sale_price_pos]
                       + normalizer.data_min_[sale_price_pos])
else:
    # The target was not scaled, so the predictions are already in price units.
    sale_price_pred = y_test_pred

submission = pd.DataFrame({
    # Take the real Ids from the raw test frame (test_data no longer has an 'Id'
    # column) instead of assuming they start at 1461.
    "Id": data.loc[test_data.index, 'Id'].to_numpy(),
    "SalePrice": sale_price_pred
})

# Save the DataFrame to a CSV file
submission.to_csv('submission.csv', index=False)