In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor

In [None]:
# Load the housing data from the CSV file that sits next to the notebook.
DATA_FILE = 'Melbourne_housing_dataset.csv'
dataset = pd.read_csv(DATA_FILE)

In [None]:
# Display the loaded frame (the cell's last expression is rendered by Jupyter).
dataset

In [None]:
# (number of rows, number of columns) of the loaded frame.
dataset.shape

In [None]:
# Descriptive statistics of the loaded frame.
dataset.describe()

In [None]:
# Percentage of missing values per column. Only columns that actually have
# missing values are kept, sorted from most to least affected (at most 30).
all_data_na = dataset.isnull().mean() * 100
all_data_na = all_data_na[all_data_na != 0].sort_values(ascending=False).head(30)
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
missing_data

In [None]:
# Bar chart of the missing-value percentage per feature.
f, ax = plt.subplots(figsize=(15, 12))
sns.barplot(x=all_data_na.index, y=all_data_na, ax=ax)
# Rotation must be numeric: the string '90' is rejected by current matplotlib.
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlabel('Features', fontsize=15)
ax.set_ylabel('Percent of missing values', fontsize=15)
ax.set_title('Percent missing data by feature', fontsize=15)

In [None]:
# Keep only the records whose "Price" is present.
dataset = dataset.loc[dataset['Price'].notna()]

In [None]:
# Renumber the rows 0..n-1 after the filtering above; the old index is discarded.
dataset = dataset.reset_index(drop=True)

In [None]:
from sklearn.model_selection import ShuffleSplit

# One random 80/20 train/test split with a fixed seed.
shuffleSplit = ShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(shuffleSplit.split(dataset))

# split() yields *positional* indices, so select with .iloc; .loc would only be
# correct while the frame happens to carry a 0..n-1 index.
# .copy() gives both sets their own data so later edits never touch `dataset`.
training_set = dataset.iloc[train_index].copy()
test_set = dataset.iloc[test_index].copy()

In [None]:
# Per column: does the training set contain at least one missing value?
training_set.isna().any()

In [None]:
# Discard training rows that have no Postcode.
training_set = training_set.loc[training_set['Postcode'].notna()]

In [None]:
# Discard test rows that have no Postcode.
test_set = test_set.loc[test_set['Postcode'].notna()]

In [None]:
# Remove the columns that are not used as model inputs further down.
# Reassigning (instead of inplace=True) avoids mutating a filtered slice.
training_set = training_set.drop(
    columns=["Address", "CouncilArea", "Regionname", "Lattitude", "Suburb",
             "Longtitude", "Type", "Method", "SellerG", "Date"]
)

In [None]:
# Remove the same set of unused columns from the test set.
unused_columns = [
    "Address", "CouncilArea", "Regionname", "Lattitude", "Suburb",
    "Longtitude", "Type", "Method", "SellerG", "Date",
]
test_set = test_set.drop(columns=unused_columns)

In [None]:
# Which float64/int64 columns of the training set still contain missing values?
training_set.select_dtypes(include=['float64', 'int64']).isna().any()

In [None]:
# Impute the missing feature values of the training set.
# Work on an independent copy so the column assignments below are real writes
# (column.fillna(..., inplace=True) on an attribute access is a chained
# assignment and does not reliably update the frame).
training_set = training_set.copy()

# Per-column strategy: mean for Bedroom2, mode for Bathroom, median for Car.
training_set["Bedroom2"] = training_set["Bedroom2"].fillna(training_set["Bedroom2"].mean())
training_set["Bathroom"] = training_set["Bathroom"].fillna(training_set["Bathroom"].mode()[0])
training_set["Car"] = training_set["Car"].fillna(training_set["Car"].median())

# Column-wise mean for the remaining gaps; the mean is computed on these three
# columns only instead of on the whole frame.
mean_filled_columns = ["BuildingArea", "YearBuilt", "Propertycount"]
training_set[mean_filled_columns] = training_set[mean_filled_columns].fillna(
    training_set[mean_filled_columns].mean()
)

# Log-transform Landsize where the log is defined. `NaN > 0` is False, so this
# single comparison excludes missing values as well as non-positive sizes.
# (The original `~isnull() & Landsize > 0` parsed as `(~isnull() & Landsize) > 0`
# because & binds tighter than >.)
has_positive_landsize = training_set["Landsize"] > 0
training_set["Landsize_log"] = np.log(training_set.loc[has_positive_landsize, "Landsize"])

# Rows without a usable Landsize are NaN after index alignment: fill with the mean.
Landsize_log_mean = training_set["Landsize_log"].mean()
training_set["Landsize_log"] = training_set["Landsize_log"].fillna(Landsize_log_mean)
# As in the original cell, a log value of exactly 0 is replaced by the mean too.
training_set["Landsize_log"] = training_set["Landsize_log"].mask(
    training_set["Landsize_log"] == 0, Landsize_log_mean
)
training_set = training_set.drop(columns="Landsize")

In [None]:
# Impute the missing feature values of the test set.
# NOTE(review): the fill values are statistics of test_set itself, as in the
# original cell; consider reusing the training-set statistics instead.
# Work on an independent copy so the column assignments below are real writes
# (column.fillna(..., inplace=True) on an attribute access is a chained
# assignment and does not reliably update the frame).
test_set = test_set.copy()

# Per-column strategy: mean for Bedroom2, mode for Bathroom, median for Car.
test_set["Bedroom2"] = test_set["Bedroom2"].fillna(test_set["Bedroom2"].mean())
test_set["Bathroom"] = test_set["Bathroom"].fillna(test_set["Bathroom"].mode()[0])
test_set["Car"] = test_set["Car"].fillna(test_set["Car"].median())

# Column-wise mean for the remaining gaps; the mean is computed on these three
# columns only instead of on the whole frame.
mean_filled_columns = ["BuildingArea", "YearBuilt", "Propertycount"]
test_set[mean_filled_columns] = test_set[mean_filled_columns].fillna(
    test_set[mean_filled_columns].mean()
)

# Log-transform Landsize where the log is defined. `NaN > 0` is False, so this
# single comparison excludes missing values as well as non-positive sizes.
# (The original `~isnull() & Landsize > 0` parsed as `(~isnull() & Landsize) > 0`
# because & binds tighter than >.)
has_positive_landsize = test_set["Landsize"] > 0
test_set["Landsize_log"] = np.log(test_set.loc[has_positive_landsize, "Landsize"])

# Rows without a usable Landsize are NaN after index alignment: fill with the mean.
Landsize_log_mean = test_set["Landsize_log"].mean()
test_set["Landsize_log"] = test_set["Landsize_log"].fillna(Landsize_log_mean)
# As in the original cell, a log value of exactly 0 is replaced by the mean too.
test_set["Landsize_log"] = test_set["Landsize_log"].mask(
    test_set["Landsize_log"] == 0, Landsize_log_mean
)
test_set = test_set.drop(columns="Landsize")

In [None]:
# Re-check per column whether any missing value is left in the training set.
training_set.isna().any()

In [None]:
# Plot the Lattitude column against the Longtitude column of the dataset.
# The Axes is created first and handed to pandas; a bare plt.figure() would be
# left empty because DataFrame.plot opens its own figure when no ax is given.
fig, ax = plt.subplots(figsize=(14, 6))
dataset.plot(x='Lattitude', y='Longtitude', style='o', ax=ax)
ax.set_title('Lattitude vs Longitutude')
ax.set_xlabel('Lattitude')
ax.set_ylabel('Longtitude')
plt.show()

In [None]:
# Distribution of the sale prices: density histogram with a KDE curve.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its
# documented replacement.
fig, ax = plt.subplots(figsize=(14, 6))
sns.histplot(dataset['Price'], kde=True, stat='density', ax=ax)
ax.set_title('Variations in Price')
# Layout is tightened after drawing, when there is something to lay out.
fig.tight_layout()

In [None]:
# One line per count-like feature of the training set, all on the same figure.
plt.figure(figsize=(14, 6))
plt.title("Features Behaviour")
for feature_name in ('Rooms', 'Bathroom', 'Bedroom2', 'Car'):
    sns.lineplot(data=training_set[feature_name], label=feature_name)

In [None]:
# Scatter of Price against Rooms in the training set.
# Passing ax makes pandas draw on the sized figure instead of opening a second
# one and leaving the first empty.
fig, ax = plt.subplots(figsize=(14, 6))
training_set.plot.scatter(x='Rooms', y='Price', ax=ax)
ax.set_title('Relation with Rooms')

In [None]:
# Scatter of Price against YearBuilt in the training set, coloured by Rooms.
plt.figure(figsize=(14, 6))
sns.scatterplot(
    x=training_set['YearBuilt'],
    y=training_set['Price'],
    hue=training_set['Rooms'],
)
plt.title('Relation with YearBuilt')

In [None]:
# Every column except the target is a model input.
target_column = 'Price'
input_features = [col for col in training_set.columns if col != target_column]
input_features1 = [col for col in test_set.columns if col != target_column]

In [None]:
# Feature matrices and target vectors as NumPy arrays. The test set is indexed
# with the training feature list so both matrices share the same column order.
X_train, y_train = training_set[input_features].to_numpy(), training_set['Price'].to_numpy()
X_test, y_test = test_set[input_features].to_numpy(), test_set['Price'].to_numpy()

In [None]:
# Ordinary least-squares baseline fitted on the training data.
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_model = lr  # fit() returns the estimator itself, so both names refer to it

In [None]:
# Predictions of the fitted linear model for the training features.
y_train_pred = lr_model.predict(X_train)

In [None]:
from sklearn.metrics import r2_score

# R^2 of the linear model on the data it was fitted on.
r2 = r2_score(y_true=y_train, y_pred=y_train_pred)
print("Score using Linear Regression : %f " % r2)

In [None]:
# Per column: does the test set contain at least one missing value?
test_set.isna().any()

In [None]:
# Predictions of the fitted linear model for the test features.
y_test_pred = lr_model.predict(X_test)

In [None]:
# R^2 of the linear model on the test data.
r2 = r2_score(y_true=y_test, y_pred=y_test_pred)
print("Score using Linear Regression : %f " % r2)

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
  """Fit a size-limited decision tree and score it on held-out data.

  Parameters:
    max_leaf_nodes: upper bound on the number of leaves of the tree.
    train_X, train_y: features and targets the tree is fitted on.
    val_X, val_y: features and targets the fitted tree is scored on.

  Returns:
    The mean absolute error of the predictions for ``val_X`` against ``val_y``.

  Side effect: prints the R^2 score computed on the same validation data.
  """
  model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
  # Use the arguments, not the notebook-level X_train / y_train: otherwise the
  # function always reports the training error, whatever the caller passes.
  model.fit(train_X, train_y)
  preds = model.predict(val_X)
  r2 = r2_score(val_y, preds)
  print("Score using DecisionTreeRegressor : %f " %(r2))
  return mean_absolute_error(val_y, preds)

# Report the error returned by get_mae for a few tree sizes.
candidate_leaf_counts = (5, 50, 500, 5000)
for max_leaf_nodes in candidate_leaf_counts:
  my_mae = get_mae(max_leaf_nodes, train_X=X_train, val_X=X_test, train_y=y_train, val_y=y_test)
  print("Max leaf nodes: %d \t\t Mean Absolute Error: %d" % (max_leaf_nodes, my_mae))

In [None]:
# Decision tree with 500 leaves: fit on the training data, score on the test data.
model = DecisionTreeRegressor(max_leaf_nodes=500, random_state=0).fit(X_train, y_train)
preds_val = model.predict(X_test)

r2 = r2_score(y_true=y_test, y_pred=preds_val)
mae = mean_absolute_error(y_true=y_test, y_pred=preds_val)
print("Score using DecisionTreeRegressor : %f " % r2)
print(" Mean Absolute Error: %d" % mae)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosted trees with hand-picked hyper-parameters and a fixed seed.
gbrt_params = dict(max_depth=4, n_estimators=300, learning_rate=0.1, random_state=42)
gbrt = GradientBoostingRegressor(**gbrt_params)
gbrt.fit(X_train, y_train)
y_pred_gbrt = gbrt.predict(X_train)

In [None]:
# R^2 of the boosted model on the training data.
r2 = r2_score(y_true=y_train, y_pred=y_pred_gbrt)
print("Score using GradientBoostingRegressor : %f " % r2)

In [None]:
# Predictions of the fitted gbrt model for the test features.
y_test_pred_gbrt = gbrt.predict(X_test)

In [None]:
# R^2 and mean absolute error of the boosted model on the test data.
r2 = r2_score(y_test, y_test_pred_gbrt)
print("Score using GradientBoostingRegressor : %f " %(r2))
# Pass (y_true, y_pred) in the documented order; MAE is symmetric, so the
# printed value is unchanged, but the call now matches r2_score above.
print("Mean Absolute Error: " + str(mean_absolute_error(y_test, y_test_pred_gbrt)))

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored with 3-fold cross-validation, ranked by
# negative mean squared error.
param_grid = [{
    'max_depth': [6, 7, 8],
    'n_estimators': [300, 350],
    'learning_rate': [0.09, 0.1, 0.11, 0.12],
}]

grd_gbr_model = GradientBoostingRegressor(random_state=15)
grid_search = GridSearchCV(
    estimator=grd_gbr_model,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination selected by the grid search.
grid_search.best_params_

In [None]:
# Training-set predictions of the grid search's best estimator.
best_y_pred_gbrt = grid_search.best_estimator_.predict(X_train)

In [None]:
# R^2 of the best grid-search estimator on the training data.
r2 = r2_score(y_true=y_train, y_pred=best_y_pred_gbrt)
print(r2)

In [None]:
# Test-set predictions of the grid search's best estimator.
best_y_test_pred_gbrt = grid_search.best_estimator_.predict(X_test)

In [None]:
# R^2 of the best grid-search estimator on the test data.
r2 = r2_score(y_true=y_test, y_pred=best_y_test_pred_gbrt)
print(r2)

In [None]:
# Natural logarithm of the training targets.
y_train_log = np.log(y_train)

In [None]:
# Natural logarithm of the test targets.
y_test_log = np.log(y_test)

In [None]:
from sklearn.model_selection import GridSearchCV

# Grid search fitted against y_train_log, again with 3-fold cross-validation
# and negative mean squared error as the ranking score.
param_grid = [
    {'max_depth':[6,7],
     'n_estimators':[300],
     'learning_rate':[0.1, 0.11]} ]
grd_gbr_model = GradientBoostingRegressor(random_state=15)
grid_search = GridSearchCV(grd_gbr_model, param_grid, cv=3, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train_log)

In [None]:
# Hyper-parameter combination selected by the most recent grid search.
grid_search.best_params_

In [None]:
# Training-set predictions of the current best estimator of grid_search.
best_y_pred_gbrt_log = grid_search.best_estimator_.predict(X_train)

In [None]:
# R^2 on the training data, measured against the log-transformed targets.
r2 = r2_score(y_true=y_train_log, y_pred=best_y_pred_gbrt_log)
print("Score using GradientBoostingRegressor : %f " % r2)

In [None]:
# Test-set predictions of the current best estimator of grid_search.
best_y_test_pred_gbrt_log = grid_search.best_estimator_.predict(X_test)

In [None]:
# R^2 on the test data, measured against the log-transformed targets.
r2 = r2_score(y_true=y_test_log, y_pred=best_y_test_pred_gbrt_log)
print("Score using GradientBoostingRegressor : %f " % r2)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Second boosted model with its own hand-picked hyper-parameters, fitted on the
# untransformed training targets.
gbrt1_params = dict(
    max_depth=3,
    n_estimators=500,
    learning_rate=0.4,
    random_state=45,
    min_samples_split=2,
)
gbrt1 = GradientBoostingRegressor(**gbrt1_params)
gbrt1.fit(X_train, y_train)
y_pred_gbrt = gbrt1.predict(X_train)

In [None]:
# R^2 and mean absolute error of the boosted model on the training data.
r2 = r2_score(y_train, y_pred_gbrt)
print("Score using GradientBoostingRegressor : %f " %(r2))
# Pass (y_true, y_pred) in the documented order; MAE is symmetric, so the
# printed value is unchanged, but the call now matches r2_score above.
print("Mean Absolute Error: " + str(mean_absolute_error(y_train, y_pred_gbrt)))

In [None]:
# Predictions of the fitted gbrt1 model for the test features.
y_test_pred_gbrt = gbrt1.predict(X_test)

In [None]:
# R^2 and mean absolute error of the boosted model on the test data.
r2 = r2_score(y_test, y_test_pred_gbrt)
print("Score using GradientBoostingRegressor : %f " %(r2))
# Pass (y_true, y_pred) in the documented order; MAE is symmetric, so the
# printed value is unchanged, but the call now matches r2_score above.
print("Mean Absolute Error: " + str(mean_absolute_error(y_test, y_test_pred_gbrt)))

In [None]:
# Predict with gbrt1 for a single hand-written sample and print the result.
print("Prediction:")
# NOTE(review): the 10 values are positional and presumably follow the column
# order of input_features — confirm against the training features.
X_new=[[5,25,3500,3,2,2,30000,2016,10000,8.9]]
ynew=gbrt1.predict(X_new)
print(ynew)