<a href="https://colab.research.google.com/github/ser-kostas/Python-Machine-Learning/blob/main/house_prices.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<percent-encoded download URL>" entries.
# The URL is a signed link (X-Goog-Date=20240223T145852Z, X-Goog-Expires=259200),
# NOTE(review): presumably no longer valid after that window - regenerate it
# from Kaggle if the download fails.
DATA_SOURCE_MAPPING = 'simple-housing-price-prediction:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F62928%2F6856479%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240223%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240223T145852Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D5372ff71dbaf97155701819ff0fbb37f4de43f9dc7c2f82d134c477023d605b6b716fc58daff38f6b22fc447b7d3a5136d711308dba46c8df35e6f78cf00cfcd66833238f58ea9580f1e119b3b5574b4ff8132f38ab90379f05558543930325e1edd6757a317ce7e8f1451e47acc667fd5c346a67e710346b1a945ea04271763712a379047bcbc43eaf9405ab1fca7efa918d87c1850b921b51ba11c53b3160c5acc3575a021b8c9e6437a34490a78f38965e858079b8b290cb970afbaf1b5a3da2e4f621ad7c184eb9eb5634789b87b2044e014b111627696de6b284efc3c25f22ca2dd60ece7cfd188490df9d70d9b52af75f8e504517bdcac40bd91b06d7c'

# Directory the downloaded data sources are extracted into.
KAGGLE_INPUT_PATH='/kaggle/input'
# Directory created alongside the input directory for notebook output.
KAGGLE_WORKING_PATH='/kaggle/working'
# Not referenced by the code visible in this notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<percent-encoded URL>" entry of
# DATA_SOURCE_MAPPING and unpack the archive below KAGGLE_INPUT_PATH.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first colon only, so the URL part can never be cut apart.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header may be absent; the progress bar is only drawn when
            # the total size is known (and non-zero).
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            # Stream the response in CHUNK_SIZE pieces until read() returns b''.
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                if total_bytes > 0:
                    done = int(50 * dl / total_bytes)
                    bar = f"[{'=' * done}{' ' * (50 - done)}] "
                else:
                    bar = ''
                sys.stdout.write(f"\r{bar}{dl} bytes downloaded")
                sys.stdout.flush()
            # The tar branch reopens the temp file by name, so buffered bytes
            # must reach the file before it is read back.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here
                # would replace the imported module.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is handled first: it is the more specific failure.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# Standard
import numpy as np
import pandas as pd

# Feature engineering
from sklearn.preprocessing import StandardScaler

# Machine learning
from sklearn import linear_model
from sklearn.model_selection import train_test_split

# Model evaluation
from sklearn.metrics import mean_absolute_error

# Utility
import random
import time

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the input directory,
# then print them one per line in os.walk order.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file in input_files:
    print(input_file)



# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the three competition CSV files in one pass: sample submission,
# training set and test set (in that order).
sample_data, train_set, test_set = map(pd.read_csv, (
    '/kaggle/input/simple-housing-price-prediction/sample_submission.csv',
    '/kaggle/input/simple-housing-price-prediction/train.csv',
    '/kaggle/input/simple-housing-price-prediction/test.csv',
))

In [None]:
def feature_engineering(df, droppable_columns, numerical_columns, categorical_columns):
    """Prepare a raw DataFrame for modelling.

    Removes the unwanted columns, standardizes the numerical columns with a
    ``StandardScaler`` and one-hot encodes the categorical columns. The frame
    passed in is not modified; a new DataFrame is returned.

    Note:
        A fresh scaler is fitted on ``df`` at every call, and the dummy
        columns are derived from the values present in ``df``. Calling this
        function separately on two frames (e.g. train and test) therefore
        scales them independently and may produce different columns.

    Args:
        df: Source DataFrame.
        droppable_columns: Column labels to remove.
        numerical_columns: Column labels to standardize. Repeated labels are
            processed once; an empty list skips the scaling step.
        categorical_columns: Column labels to one-hot encode; the first level
            of each column is dropped (``drop_first=True``).

    Returns:
        The transformed DataFrame.
    """
    # Drop irrelevant columns (returns a copy, the caller's frame is untouched)
    df = df.drop(droppable_columns, axis=1)

    # Standardization: de-duplicate while keeping order, and skip the step
    # entirely when there is nothing to scale (the scaler rejects an empty
    # column selection).
    scaled_columns = list(dict.fromkeys(numerical_columns))
    if scaled_columns:
        scaler = StandardScaler()
        df[scaled_columns] = scaler.fit_transform(df[scaled_columns])

    # One-hot encoding of the categorical columns
    df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

    return df

In [None]:
# Columns removed before modelling.
droppable_columns = ['house_id', 'date', 'block', 'storey_range', 'street']
# Columns to standardize; every column is listed exactly once.
numerical_columns = ['area_sqm', 'commence_date']
# Columns to one-hot encode.
categorical_columns = ['location', 'type', 'flat_model']
train_fe = feature_engineering(train_set, droppable_columns, numerical_columns, categorical_columns)

In [None]:
# Bare expression: the notebook renders the engineered training frame as the cell output.
train_fe

In [None]:
linreg = linear_model.LinearRegression()

# Splitting the training data: 20% is held out for validation. The fixed seed
# makes the split reproducible, so scores can be compared between runs.
train_df, val_df = train_test_split(train_fe, test_size=0.2, random_state=42)

# Setting the target value
target = 'price'

X_train = train_df.drop([target], axis=1)
y_train = train_df[target]

X_val = val_df.drop([target], axis=1)
y_val = val_df[target]

# Running the model
linreg.fit(X_train, y_train)

# Predictions for the held-out validation rows
y_val_preds = linreg.predict(X_val)

In [None]:
# Score the validation predictions with the mean absolute error and report it
# with a thousands separator.
mae = mean_absolute_error(y_true=y_val, y_pred=y_val_preds)
print(f"Mean Absolute Error {mae: ,}")

In [None]:
# Trying a new prediction model
from sklearn.ensemble import RandomForestRegressor

# Split the data as before: 20% validation. The fixed seed makes the split
# reproducible instead of drawing a new random validation set on every run.
train_df, val_df = train_test_split(train_fe, test_size=0.2, random_state=42)

# Setting the target value
target = 'price'

X_train = train_df.drop([target], axis=1)
y_train = train_df[target]

X_val = val_df.drop([target], axis=1)
y_val = val_df[target]

# Create a random forest model; seeding it makes the fitted trees (and the
# resulting score) repeatable.
rf = RandomForestRegressor(random_state=42)

In [None]:
# Train the forest and immediately predict the validation rows
# (fit() returns the estimator itself, so the calls can be chained).
y_pred_rf = rf.fit(X_train, y_train).predict(X_val)

# Report the mean absolute error of the validation predictions
mae = mean_absolute_error(y_true=y_val, y_pred=y_pred_rf)
print(f"Mean Absolute Error {mae: ,}")

In [None]:
import matplotlib.pyplot as plt

# Distribution of the target column, in 20 bins.
plt.hist(train_set['price'], bins=20)
plt.xlabel('Price')
plt.ylabel('Count')
# Render explicitly so the cell shows the figure rather than echoing
# the value returned by plt.hist().
plt.show()

In [None]:
# Scatter plot of price against floor area, with labelled axes.
plt.scatter(x=train_set['area_sqm'], y=train_set['price'])
plt.xlabel(xlabel='Area')
plt.ylabel(ylabel='Price')
plt.show()

In [None]:
# Use SelectKBest to find out which features are the most useful
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, f_classif

# Splitting the training data: 20% validation, fixed seed for a reproducible split
train_df, val_df = train_test_split(train_fe, test_size=0.2, random_state=42)

X_train = train_df.drop([target], axis=1)
y_train = train_df[target]

X_val = val_df.drop([target], axis=1)
y_val = val_df[target]

# Fit the selector on the training rows only, then apply the very same column
# selection to the validation rows (refitting on validation data would leak
# information and could pick different columns).
k_best = SelectKBest(score_func=f_regression, k=2)
X_train_KB = k_best.fit_transform(X_train, y_train)
X_val_KB = k_best.transform(X_val)
print(f"Selected features: {list(X_train.columns[k_best.get_support()])}")

# Running the model on the selected features only
linreg.fit(X_train_KB, y_train)

y_val_preds = linreg.predict(X_val_KB)

# Calculating the mean absolute error to validate the prediction
mae = mean_absolute_error(y_val, y_val_preds)
print(f"Mean Absolute Error {mae: ,}")