Importing all the necessary Libraries

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import numpy as  np
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

## 1. Load Data

Raw dataset is loaded in this section

In [None]:
# Read the raw CSV file into the working dataframe
DATASET_PATH = 'dataset/dataset.csv'
df = pd.read_csv(DATASET_PATH)

In [None]:
# Dimensions of the dataset: (number of samples, number of features)
df.shape

In [None]:
# Preview the first five rows of the dataset
df.head()

In [None]:
# List every column name in the dataset
df.columns

In [None]:
# Display the non-null sample count and the datatype of each feature
df.info()

In [None]:
# Total number of NaN values in each feature
df.isnull().sum()

In [None]:
# Descriptive summary statistics of the dataframe
df.describe()

## 2. Exploratory Data Analysis

This step is necessary to understand the data.

In [None]:
# Show the current column names of the dataframe
df.columns

In [None]:
# df['c-area'].unique()


Let's rename the columns for ease of use.

In [None]:

# Short, lowercase aliases for the raw column headers.
# DataFrame.rename silently skips keys that are not present in the frame, so a
# misspelled header would simply be left un-renamed without any error.
column_aliases = {
    'Suburb': 'sub',
    'Address': 'add',
    'Rooms': 'rooms',
    'Type': 'type',
    'Price': 'price',
    'Method': 'sell-meth',
    'SellerG': 'seller',
    'Date': 'date',
    'Distance': 'dist',
    'PostCode': 'post-code',
    # NOTE(review): the raw CSV header is probably spelled 'Postcode' (lower-case
    # 'c') - confirm against dataset.csv. Both spellings are mapped so the column
    # is renamed either way.
    'Postcode': 'post-code',
    'Bedroom2': 'bed2',
    'Bathroom': 'bathroom',
    'Car': 'car',
    'Landsize': 'l-size',
    'BuildingArea': 'b-area',
    'YearBuilt': 'build-year',
    'CouncilArea': 'c-area',
    'Lattitude': 'lati',
    'Longtitude': 'long',
    'Regionname': 'reg-name',
    'Propertycount': 'prop-count',
}
df = df.rename(columns=column_aliases)

Drop columns which are unwanted

In [None]:
# df.drop(columns= ['date'], inplace= True)


### 2.1 Mapping Feature samples for better understanding

Several features contain abbreviated codes that are hard to understand, so let's map them to descriptive names first.

In [None]:
# # Define mapping dictionaries for both columnsasdaf

# selling_method_mapping = {
#     'SS': 'Sold Before Auction',
#     'S': 'Sold',
#     'VB': 'Vendor Bid',
#     'SP': 'Sold Prior',
#     'PI': 'Passed In',
#     'SN': 'Sold Not Disclosed',
#     'W': 'Withdrawn',
#     'PN': 'Passed In, Vendor Bid',
#     'SA': 'Sold After Auction'
# }
# # Use the map function to replace values in both columns
# df['sell-meth'] = df['sell-meth'].map(selling_method_mapping)
# # Define mapping dictionaries for both columnsasdaf
# type_mapping = {'h': 'house', 'u': 'unit', 't': 'town'}
# df['type'] = df['type'].map(type_mapping)

# # Display the updated DataFrame
# df.head()


### 2.1 Univariate Analysis

This section covers all the data analysis for single variable exploratory data analysis

##### 2.1.0 Histogram

In [None]:
# Plot a normalised histogram of the sale price.
# Rows with a missing price are dropped first: plt.hist cannot derive a bin
# range from NaN values and raises a ValueError when they are present.
price_values = df['price'].dropna()
plt.hist(price_values, bins=30, density=True)
plt.xlabel('price')
plt.ylabel('density')
plt.show()


##### 2.1.1 Count Plot

In [None]:
# Count how many properties were sold through each selling method
sns.countplot(x='sell-meth', hue='sell-meth', data=df)

In [None]:
# Count of properties for each value of the 'rooms' column
sns.countplot(data = df, x = 'rooms', hue = 'rooms')

In [None]:
# Count of properties for each value of the 'type' column
sns.countplot(data = df, x = 'type', hue = 'type')


##### 2.1.2 Distribution Plot

In [None]:
# Distribution of the sale price on its original scale.
# The log transform of 'price' is only applied later in the notebook, so the
# message must not claim that the plotted values are logged.
sns.displot(data = df, x = 'price')
print('The price is shown on its original (non-logged) scale.')

### 2.2 Multivariate Analysis

##### 2.2.1 Box Plot

In [None]:
# Spread of the sale price within each value of the 'type' column
sns.boxplot(data=df, x='type', y='price')

##### 2.2.2 Scatter Plot

In [None]:
# Price against number of rooms, coloured by the 'type' column
sns.scatterplot(data = df, x ='rooms', y = 'price', hue = 'type') 

In [None]:
# Price for each value of the 'type' column, coloured by the same column
sns.scatterplot(data = df, x ='type', y = 'price', hue = 'type') 

In [None]:
# Price against the 'dist' column, coloured by the 'type' column
sns.scatterplot(data = df, x ='dist', y = 'price', hue = 'type') 

##### 2.2.3 Encoding

In [None]:
# Inspect the column datatypes to see which features are still non-numeric
df.info()

In [None]:
# le = LabelEncoder()
# #loading all the categorical features in this
# categorical_column = ['sub', 'add','type', 'reg-name', 'sell-meth', 'seller']
# for categories in categorical_column:
#     df[categories] = le.fit_transform(df[categories])
#     le.transform(le.classes_)

##### 2.2.4 Correlation Matrix

In [None]:
# Preview the first rows of the dataframe
df.head()

In [None]:
# Integer-encode the categorical features listed below.
# A separate LabelEncoder is fitted per column and stored in `label_encoders`,
# so every column's category <-> code mapping stays available afterwards (one
# shared encoder only remembers the last column it was fitted on).
categorical_column = ['sub', 'add','type', 'reg-name', 'sell-meth', 'seller']
label_encoders = {}
for categories in categorical_column:
    le = LabelEncoder()
    df[categories] = le.fit_transform(df[categories])
    label_encoders[categories] = le

In [None]:
# Show the column names of the dataframe
df.columns


In [None]:
# Correlation heatmap across the numeric features.
# numeric_only=True skips columns that still hold text ('date' and 'c-area' are
# not in the label-encoded list); without it pandas >= 2.0 raises on them.
plt.figure(figsize = (15,12))
sns.heatmap(df.corr(numeric_only=True), annot = True, cmap = 'coolwarm')

In [None]:
from scipy import stats

# Missing prices are removed before testing: NaN values in the input make the
# Shapiro-Wilk statistic and p-value meaningless.
# NOTE(review): SciPy documents that the p-value may be inaccurate for samples
# larger than 5000 - treat the outcome as indicative only for big datasets.
column_to_test = df['price'].dropna()

# Apply the Shapiro-Wilk test
stat, p = stats.shapiro(column_to_test)

# A p-value above 0.05 means normality cannot be rejected at the 5% level
if p > 0.05:
    print("The data appears to be normally distributed.")
else:
    print("The data does not appear to be normally distributed.")


In [None]:
from scipy.stats import kurtosis, skew

# Missing prices are dropped so the sample moments are not NaN
price_values = df['price'].dropna()

# kurtosis() defaults to Fisher's definition (excess kurtosis), under which a
# normal distribution scores 0 - not 3 as under Pearson's definition.
kurt = kurtosis(price_values)
skewness = skew(price_values)

# Sample moments are floats and never hit the theoretical values exactly, so
# they are compared against a tolerance instead of with ==.
# NOTE(review): 0.5 is a rule-of-thumb threshold - tune as needed.
NORMALITY_TOLERANCE = 0.5

if abs(kurt) <= NORMALITY_TOLERANCE and abs(skewness) <= NORMALITY_TOLERANCE:
    print("The data is normally distributed.")
else:
    print("The data may not be normally distributed.")


## 3. Feature Engineering

##### 3.1 Feature Selection

From the correlation matrix seen in 2., we can identify some important features impacting the final price of house. 

In [None]:
# Replace 'price' with its natural logarithm (np.log).
# The column is overwritten, so re-running this cell applies the log again to
# the already-transformed values.
df['price'] = np.log(df['price'])
# Preview the first transformed values
df['price'].head()

In [None]:
# Keep only the rows that have a value in the target column 'price'
df = df.dropna(subset=['price'])


In [None]:
# Feature columns used as model input; the target stays a one-column frame
feature_columns = ['rooms', 'type', 'bed2', 'bathroom', 'car', 'long']
target_columns = ['price']

X = df[feature_columns]
y = df[target_columns]


##### 3.2 Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
splits = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = splits


## 4. Preprocessing

##### 4.1 Null Values

In [None]:
#total nan values in Price
null_price = df['price'].isnull().sum()
print ('Total number of null values in Price is',null_price,".", "So, this requires some data preprocessing.")

In [None]:
# df.dropna(subset=['price'], inplace=True)


In [None]:
# Tally the missing values of every selected feature in the training split
null_counts = X_train.isnull().sum()
roomsnull = null_counts['rooms']
bedroomnull = null_counts['bed2']
bathroomnull = null_counts['bathroom']
typenull = null_counts['type']
longnull = null_counts['long']
carnull = null_counts['car']

# (label used in the message, null count) pairs in reporting order
null_report = [
    ('room', roomsnull),
    ('bedroom', bedroomnull),
    ('bathroom', bathroomnull),
    ('type', typenull),
    ('longitude', longnull),
    ('car', carnull),
]
for label, count in null_report:
    print(f'No. of nulls in is {label} is : ', count)


So, we can see a lot of nulls in the dataset. This requires some data processing. 
Let's fill the missing feature values first and then check the target for this project. 

In [None]:
# Impute the missing feature values with the most frequent value (mode) of each
# column. The modes are learned from the training split only and then applied
# to BOTH splits, so the test set is preprocessed the same way without leaking
# test information.
# fillna returns new frames here: calling fillna(inplace=True) on a column
# selection is unreliable and does nothing under pandas copy-on-write.
impute_columns = ['bed2', 'bathroom', 'long', 'car']
impute_values = {column: X_train[column].mode()[0] for column in impute_columns}

X_train = X_train.fillna(value=impute_values)
X_test = X_test.fillna(value=impute_values)


In [None]:
# Re-count the missing values of every selected feature after imputation
remaining_nulls = X_train.isnull().sum()
roomsnull, bedroomnull, bathroomnull = (
    remaining_nulls[column] for column in ('rooms', 'bed2', 'bathroom')
)
typenull, longnull, carnull = (
    remaining_nulls[column] for column in ('type', 'long', 'car')
)

# Message label -> null count, kept in reporting order
counts_by_label = {
    'room': roomsnull,
    'bedroom': bedroomnull,
    'bathroom': bathroomnull,
    'type': typenull,
    'longitude': longnull,
    'car': carnull,
}
for label, count in counts_by_label.items():
    print(f'No. of nulls in is {label} is : ', count)


In [None]:
if 'price' in y_train.columns and 'price' in y_test.columns:
    # Remove samples whose target is missing. The same boolean mask filters the
    # features and the target, so X and y keep identical rows; calling
    # dropna(inplace=True) on the selected column never changed the frames.
    train_has_price = y_train['price'].notna()
    test_has_price = y_test['price'].notna()
    X_train, y_train = X_train[train_has_price], y_train[train_has_price]
    X_test, y_test = X_test[test_has_price], y_test[test_has_price]
else:
    print("The 'price' column does not exist in either y_train or y_test data frames.")


In [None]:
# from sklearn.impute import SimpleImputer

# # Initialize a SimpleImputer with strategy='median' (you can use other strategies as needed)
# imputer = SimpleImputer(strategy='median')

# # Fit and transform the imputer on the target variable
# y_train = imputer.fit_transform(y_train)


##### 4.2 Checking outliers

In [None]:
# # Create a dictionary of columns.
# col_dictc = {'type':1
           
#             }

# # Detect outliers in each variable using box plots.
# plt.figure(figsize=(20,30))

# for variable,i in col_dictc.items():
#                      plt.subplot(5,4,i)
#                      plt.boxplot(X_train[variable])
#                      plt.title(variable)

# plt.show()

In [None]:
# def outlier_count(col, data = X_train):
    
#     # calculate your 25% quatile and 75% quatile
#     q75, q25 = np.percentile(data[col], [75, 25])
    
#     # calculate your inter quatile
#     iqr = q75 - q25
    
#     # min_val and max_val
#     min_val = q25 - (iqr*1.5)
#     max_val = q75 + (iqr*1.5)
    
#     # count number of outliers, which are the data that are less than min_val or more than max_val calculated above
#     outlier_count = len(np.where((data[col] > max_val) | (data[col] < min_val))[0])
    
#     # calculate the percentage of the outliers
#     outlier_percent = round(outlier_count/len(data[col])*100, 2)
    
#     if(outlier_count > 0):
#         print("\n"+15*'-' + col + 15*'-'+"\n")
#         print('Number of outliers: {}'.format(outlier_count))
#         print('Percent of data that is outlier: {}%'.format(outlier_percent))
# for col in X_train.columns:
#     outlier_count(col)

##### 4.3 Scaling

In [None]:
from sklearn.preprocessing import RobustScaler, StandardScaler

# Learn the scaling parameters from the training features only, then apply the
# same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Sanity-check the shapes of all four splits before fitting any model
named_splits = (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
)
for split_name, split_data in named_splits:
    print(f"Shape of {split_name}: ", split_data.shape)

## 5. Modelling

In [None]:
# Import the algorithm libraries to be tried out
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

# Candidate regressors paired with their display names
named_algorithms = [
    ("Linear Regression", LinearRegression()),
    ("SVR", SVR()),
    ("KNeighbors Regressor", KNeighborsRegressor()),
    ("Decision-Tree Regressor", DecisionTreeRegressor(random_state=0)),
    ("Random-Forest Regressor", RandomForestRegressor(n_estimators=200, random_state=42)),
    ("XGBregressor", XGBRegressor(n_estimators=200, random_state=42)),
]

# Parallel lists: algorithms[k] is described by algorithm_names[k]
algorithms = [estimator for _, estimator in named_algorithms]
algorithm_names = [display_name for display_name, _ in named_algorithms]

In [None]:
# Number of missing target values in the training and the test split
for target_split in (y_train, y_test):
    print(target_split.isnull().sum())

In [None]:
# y_train.dropna(subset=['price'], inplace=True)
# y_test.dropna(subset=['price'], inplace=True)



In [None]:
# Re-check the missing target values of both splits
train_target_nulls = y_train.isnull().sum()
test_target_nulls = y_test.isnull().sum()
print(train_target_nulls)
print(test_target_nulls)

In [None]:
# Import library KFold and cross_val_score
# These libaries will cross validate the best scores between algorithms and result mean of best score for each algorith
from sklearn.model_selection import KFold, cross_val_score

train_mse = []
test_mse = []

# KFold split is set to 10, hence the mean of ten scores will be taken
kfold = KFold(n_splits = 10, shuffle=True)

# Looping each algorithm for cross validation using training data
# The scoring is set to best negative mean squared error
for i, model in enumerate(algorithms):
    scores = cross_val_score(
        model,
        X_train,
        y_train,
        cv=kfold,
        scoring='neg_mean_squared_error'
    )
    
    print(f"{algorithm_names[i]} - Score: {scores}; Mean: {scores.mean()}")

In [None]:
best_score = float('-inf')  # Initialize with negative infinity
mean_score = scores.mean()
print(f"{algorithm_names[i]} - Score: {scores}; Mean: {mean_score}")
    
if mean_score > best_score:
        best_score = mean_score
        best_model_index = i

best_model = algorithms[best_model_index]
print(f"The best model is {algorithm_names[best_model_index]} with a mean score of {best_score}")


In [None]:
# Importing Grid Search Library
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
# Grid search fits the estimator once for every combination of the
# hyperparameter values below and cross-validates each combination
from sklearn.model_selection import GridSearchCV

# Defining the values for parameters
param_grid = {
    'max_depth': [5, 10, None],
    'n_estimators': [100, 200],
    'learning_rate': [0.1]
}

xgb = XGBRegressor(random_state = 42)

# Initialize grid for XGBRegressor algorithm with above parameter values
# The scoring is negative mean squared error (higher is better)
grid = GridSearchCV(
    estimator = xgb,
    param_grid = param_grid,
    # Reuse the KFold splitter from the model comparison; the name is `kfold`
    # (the former `kFold` was undefined and raised a NameError)
    cv = kfold,
    n_jobs = -1,
    return_train_score=True,
    # refit=True retrains the best parameter combination on the full training set
    refit=True,
    scoring='neg_mean_squared_error'
)

# The training set is fitted to the above grid
grid.fit(X_train, y_train)

In [None]:
# Check the best parameters values while training the model
# Hyperparameter combination that achieved the best cross-validated score
grid.best_params_

In [None]:
# Check the best mse value from the grid
# The best mse is result of applying the best paramater values in XGBRegressor algorithm
# Best cross-validated score found by the grid search.
# The grid is scored with 'neg_mean_squared_error', so this value is the
# NEGATIVE mean squared error of the best parameter combination.
best_mse = grid.best_score_
best_mse

## Testing

In [None]:
from sklearn.metrics import mean_squared_error

# Predict the test split with the refitted best estimator of the grid search
yhat = grid.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order. The MSE value is
# the same either way, but the conventional order avoids mistakes when the
# metric is later swapped for an asymmetric one.
mean_squared_error(y_test, yhat)

## Feature Analysis

# Best estimator selected by the grid search
xgb_best_estimator = grid.best_estimator_

# Feature importance scores of that estimator
importance_scores = xgb_best_estimator.feature_importances_
importance_scores

# Horizontal bar plot of every feature against its importance
plt.barh(X.columns, importance_scores)
plt.xlabel("XGB Regressor Feature Importance")

## Inference

In [None]:
# Importing pickle library
import pickle

# Exporting the model to selling-price.model
filename = '../model/selling-price.model'
pickle.dump(grid, open(filename, 'wb'))

In [None]:
# We will also dump the fitted scaler for future use.
# The with-block closes the file handle even if pickling fails.
scaler_filename = '../model/scaler.pkl'
with open(scaler_filename, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
# Importing the model
selling_price_model = pickle.load(open("../model/selling-price.model", "rb"))

# Creating a dummy sample
sample = {
    "max_power": [100],
    "engine": [1200],
    "mileage": [23]
}

# Convert the sample to panda dataframe
sample = pd.DataFrame(sample)

# Scale the sample using the same scaler used for X_train and X_set
scaled_sample = scaler.transform(sample)

# Use the model to predict the selling price
predicted_selling_price = selling_price_model.predict(scaled_sample)

# As the we have log transformed the y while training and set, we will need to exponent transform the predicted value for correct prediction
predicted_selling_price = np.exp(predicted_selling_price)

print("The predicted selling price is " + str(predicted_selling_price[0]))