In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error

In [26]:
df = pd.read_csv('data_clean.csv')

In [27]:
pd.set_option('display.max_columns', None)

In [28]:
# Split Data 
X = df.iloc[:,1:].values
y = df.iloc[:,0].values
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [29]:
# Feature Scaling 
sc_X = StandardScaler()
X_train[:, 0:5] = sc_X.fit_transform(X_train[:, 0:5])
X_test[:, 0:5] = sc_X.transform(X_test[:, 0:5])
X_train[:, 6:11] = sc_X.fit_transform(X_train[:, 6:11])
X_test[:, 6:11] = sc_X.transform(X_test[:, 6:11])

sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train.reshape(-1,1)).flatten()


Regression Models 

In [39]:
# Linear Regression 
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=2)

lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
print("Training data accuracy",lr_model.score(X_train, y_train))
print("Testing data accuracy",lr_model.score(X_test, y_test))

Training data accuracy 0.5137519046688466
Testing data accuracy 0.516114872467436


In [31]:
# Decision Tree
regressor = DecisionTreeRegressor()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
print(y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

[ 295000. 1034500.  839000. ...  215000. 1415000.  520000.]
Mean Squared Error: 95727855608.96039


In [32]:
# Random Forest - Training 
random_forest = RandomForestRegressor(n_estimators= 1000, random_state= 0)
random_forest.fit(X_train, y_train)

In [33]:
# Random Forest - Evaluate model 
y_prediction = random_forest.predict(X_test)
y_train_prediction = random_forest.predict(X_train)
r2_frst_train = r2_score(y_train, y_train_prediction)
r2_frst_test = r2_score(y_test, sc_y.inverse_transform(y_prediction.reshape(-1,1)))
print("R2 Train Score:", r2_frst_train)
print("R2 Test Score:", r2_frst_test)
mse_frst_train = mean_squared_error(y_train, y_train_prediction)
mse_frst_test = mean_squared_error(y_test, sc_y.inverse_transform(y_prediction.reshape(-1,1)))
print("Mean Squared Error of Train:", mse_frst_train)
print("Mean Squared Error of Test:", mse_frst_test)

R2 Train Score: 0.9340873145431117
R2 Test Score: -358179533957.052
Mean Squared Error of Train: 6678088710.146646
Mean Squared Error of Test: 3.948914889513291e+22


In [34]:
random_forest.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 1000,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

In [35]:
random_forest = RandomForestRegressor(max_depth= None, 
                                      min_samples_leaf=5, 
                                      min_samples_split=6, 
                                      n_estimators= 1200, 
                                      n_jobs=1)

In [36]:
random_forest.fit(X_train, y_train)

In [37]:
import pickle

In [38]:
Housing_Price = 'random_forest.sav'
pickle.dump(random_forest, open(Housing_Price, 'wb'))