In [15]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [16]:
df = pd.read_csv('data_clean.csv')

In [17]:
# Split Data 
X = df.drop('price', axis =1)
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [18]:
print(X_test.shape)

(873, 4)


In [19]:
print(y_test.shape)

(873,)


Regression Models 

In [20]:
# Linear Regression 

linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)
mse_linear = mean_squared_error(y_test, y_pred_linear)
r2_linear = r2_score(y_test, y_pred_linear)
print(f'Mean Squared Error: {mse_linear}')
print(f'R-squared: {r2_linear}')

Mean Squared Error: 47013195643.10566
R-squared: 0.4477029798470272


In [21]:
# Decision Tree

dec_tree_model = DecisionTreeRegressor()
dec_tree_model.fit(X_train, y_train)
y_pred_decision = dec_tree_model.predict(X_test)
mse_decision = mean_squared_error(y_test, y_pred_decision)
r2_decision = r2_score(y_test, y_pred_decision)
print(f'Mean Squared Error: {mse_decision}')
print(f'R-squared: {r2_decision}')

Mean Squared Error: 97732229719.64081
R-squared: -0.1481291264866189


In [22]:
# Random Forest - Training 
random_forest = RandomForestRegressor(random_state= 42)
random_forest.fit(X_train, y_train)

In [23]:
params = random_forest.get_params()
params

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [24]:
# Set new parameters 
random_forest.set_params = {
    'n_estimators': 100,
    'max_depth': 10, 
    'min_samples_leaf': 2,
    'min_samples_split': 5
}

In [25]:
random_forest.fit(X_train, y_train)

In [26]:
# Random Forest - Evaluate model 
y_pred_forest = random_forest.predict(X_test)
mse_forest = mean_squared_error(y_test, y_pred_forest)
r2_forest = r2_score(y_test, y_pred_forest)

print(f'Mean Squared Error: {mse_forest}')
print(f'R-squared: {r2_forest}')

Mean Squared Error: 55406750680.03104
R-squared: 0.34909799518324436


In [27]:
import pickle

In [28]:
with open('linear_model.pkl', 'wb') as model_file:
    pickle.dump(linear_model, model_file)