In [235]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn import metrics
import copy
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, Normalizer, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold, cross_val_score

In [236]:
# Load the cleaned listings table; later cells read `price` and the categorical columns from it.
# NOTE(review): the file name is spelled 'craiglist' here but 'craigslist' in the
# train/test reads further down — confirm it matches the file on disk.
data = pd.read_csv('craiglist_clean.csv')

In [237]:
# Replace every categorical column with integer codes, in place.
# A single LabelEncoder instance is re-fitted for each column in turn, so once
# this cell has run it only holds the classes of the last column ('variant').
categorical_columns = ['fuel', 'paint color', 'transmission', 'brand', 'model', 'variant']
labelencoder = LabelEncoder()
encoded_columns = data[categorical_columns].apply(
    lambda column: labelencoder.fit_transform(column)
)
data[categorical_columns] = encoded_columns

In [238]:
# Separate the predictors from the target (`price`), then hold out 30% of the
# rows for evaluation. The fixed random_state keeps the split repeatable.
X = data.drop(columns='price')
y = data['price']
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

In [183]:
#train = pd.read_csv('craigslist_train.csv')

In [184]:
#train[['fuel', 'paint color', 'transmission', 'brand', 'model', 'variant']] = train[['fuel', 'paint color', 'transmission', 'brand', 'model', 'variant']].apply(labelencoder.fit_transform)

In [185]:
#y_train = train.price

In [186]:
#x_train = train.drop('price', axis=1)

In [187]:
#test = pd.read_csv('craigslist_test.csv')

In [188]:
#test[['fuel', 'paint color', 'transmission', 'brand', 'model', 'variant']] = test[['fuel', 'paint color', 'transmission', 'brand', 'model', 'variant']].apply(labelencoder.fit_transform)

In [189]:
#y_test = test.price

In [190]:
#x_test = test.drop('price', axis=1)

In [232]:
# Learn the min-max scaling from the training features only, then apply that
# same fitted scaling to both splits so the test rows do not influence it.
mmsc = MinMaxScaler()
mmsc.fit(x_train)
x_train = mmsc.transform(x_train)
x_test = mmsc.transform(x_test)

In [192]:
# Fit a default Ridge model on x_train/y_train and report its hold-out errors.
# The value displayed by the cell is the model's R^2 on the test split.
ridgereg = Ridge()
ridgereg.fit(x_train, y_train)
y_pred_ridgereg = ridgereg.predict(x_test)

ridge_mae = metrics.mean_absolute_error(y_test, y_pred_ridgereg)
ridge_mse = metrics.mean_squared_error(y_test, y_pred_ridgereg)
ridge_rmse = np.sqrt(ridge_mse)  # RMSE is derived from the MSE computed once above

print('Ridge Regression:')
print('MAE:', round(ridge_mae, 2))
print('MSE:', round(ridge_mse, 2))
print('RMSE:', round(ridge_rmse, 2))
ridgereg.score(x_test, y_test)

Ridge Regression:
MAE: 11644.18
MSE: 791284419.2
RMSE: 28129.78


0.05798564367308312

In [193]:
# Fit a default Lasso model on x_train/y_train and report its hold-out errors.
# The value displayed by the cell is the model's R^2 on the test split.
lassoreg = Lasso()
lassoreg.fit(x_train, y_train)
y_pred_lassoreg = lassoreg.predict(x_test)

lasso_mse = metrics.mean_squared_error(y_test, y_pred_lassoreg)
lasso_report = (
    ('MAE:', metrics.mean_absolute_error(y_test, y_pred_lassoreg)),
    ('MSE:', lasso_mse),
    ('RMSE:', np.sqrt(lasso_mse)),
)

print('Lasso Regression:')
for metric_label, metric_value in lasso_report:
    print(metric_label, round(metric_value, 2))
lassoreg.score(x_test, y_test)

Lasso Regression:
MAE: 11640.89
MSE: 791165297.88
RMSE: 28127.66


0.05812745613592629

In [194]:
# Ordinary least squares baseline: fit on the training split, then print the
# hold-out MAE / MSE / RMSE followed by the hold-out R^2.
linreg = LinearRegression(n_jobs=-1)
linreg.fit(x_train, y_train)
y_pred_linreg = linreg.predict(x_test)

linreg_mae = metrics.mean_absolute_error(y_test, y_pred_linreg)
linreg_mse = metrics.mean_squared_error(y_test, y_pred_linreg)
linreg_rmse = np.sqrt(linreg_mse)
linreg_r2 = linreg.score(x_test, y_test)

print('Linear Regression:')
print('MAE:', round(linreg_mae, 2))
print('MSE:', round(linreg_mse, 2))
print('RMSE:', round(linreg_rmse, 2))
print('R-squared:', linreg_r2)

Linear Regression:
MAE: 11639.06
MSE: 791074506.9
RMSE: 28126.05
R-squared: 0.05823554167716449


In [195]:
# Degree-2 polynomial regression: expand the features with PolynomialFeatures,
# fit a linear model on the expansion, and report the hold-out metrics.
degree = 2
polyreg = make_pipeline(
    PolynomialFeatures(degree),
    LinearRegression(),
)
polyreg.fit(x_train, y_train)
y_pred_polreg = polyreg.predict(x_test)

poly_mae = metrics.mean_absolute_error(y_test, y_pred_polreg)
poly_mse = metrics.mean_squared_error(y_test, y_pred_polreg)
poly_rmse = np.sqrt(poly_mse)
poly_r2 = metrics.r2_score(y_test, y_pred_polreg)

print('Polynomial Regression:')
print('MAE:', round(poly_mae, 2))
print('MSE:', round(poly_mse, 2))
print('RMSE:', round(poly_rmse, 2))
print('r2: ', round(poly_r2, 4))

Polynomial Regression:
MAE: 11933.74
MSE: 838565306.32
RMSE: 28957.99
r2:  0.0017


In [242]:
# Random forest with default settings and a fixed seed; report hold-out metrics.
regressor = RandomForestRegressor(random_state=123)
regressorModel = regressor.fit(x_train, y_train)
y_pred_r = regressorModel.predict(x_test)

# Error metrics on the test split; RMSE is the square root of the MSE.
mse_r = metrics.mean_squared_error(y_test, y_pred_r)
rmse_r = np.sqrt(mse_r)
mae_r = metrics.mean_absolute_error(y_test, y_pred_r)
r2_score_r = regressorModel.score(x_test, y_test)

report_r = f"MAE: {round(mae_r,2)} \nMSE: {round(mse_r,2)} \nRMSE: {round(rmse_r,2)} \nr2_score: {round(r2_score_r,2)}"
print(report_r)

MAE: 7854.93 
MSE: 5624042465.8 
RMSE: 74993.62 
r2_score: -5.7


In [255]:
# Smaller forest (30 trees, seed 24) evaluated the same way as the default one.
# This cell rebinds regressorModel, y_pred_r and the metric variables.
reg1 = RandomForestRegressor(n_estimators=30, random_state=24)
regressorModel = reg1.fit(x_train, y_train)
y_pred_r = regressorModel.predict(x_test)

mae_r = metrics.mean_absolute_error(y_test, y_pred_r)
mse_r = metrics.mean_squared_error(y_test, y_pred_r)
rmse_r = np.sqrt(mse_r)
r2_score_r = regressorModel.score(x_test, y_test)

summary_r = f"MAE: {round(mae_r,2)} \nMSE: {round(mse_r,2)} \nRMSE: {round(rmse_r,2)} \nr2_score: {round(r2_score_r,2)}"
print(summary_r)

MAE: 8419.74 
MSE: 9349980784.52 
RMSE: 96695.3 
r2_score: -10.13


In [203]:
# Min-max scale the complete feature matrix `X` for cross-validation.
# A dedicated scaler is used here on purpose: re-fitting the shared `mmsc`
# on the full data would overwrite the train-only fit that the hold-out
# scaling of x_test relies on. fit_transform returns a new array and leaves
# `X` untouched, so no defensive copy is needed.
x = MinMaxScaler().fit_transform(X)

In [259]:
# 5-fold shuffled cross-validation of a random forest on `x`, `y`, scored by R^2.
# Both the fold assignment and the forest itself are seeded; without a seed on
# the estimator the per-fold scores change from run to run.
rfreg = RandomForestRegressor(random_state=123)
cv = KFold(n_splits=5, random_state=123, shuffle=True)
scores = cross_val_score(rfreg, x, y, scoring='r2', cv=cv, n_jobs=-1)

In [249]:
# Show only the last entry of `scores`, with 20 decimal places.
last_fold_score = scores[-1]
print('{:.20f}'.format(last_fold_score))

0.53260347675343910900


In [244]:
# Candidate hyper-parameter values for the random-forest search.
n_estimators = [30, 130, 230, 330]  # number of trees in the forest
max_depth = list(map(int, np.linspace(10, 120, num=12)))  # depth caps: 12 evenly spaced values from 10 to 120
min_samples_split = [2, 6, 10]  # minimum samples required to split a node
min_samples_leaf = [1, 3, 4]  # minimum samples allowed in a leaf
bootstrap = [True, False]  # whether each tree is grown on a bootstrap sample

# Parameter name -> list of candidate values, in the form the search expects.
random_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

In [245]:
from sklearn.model_selection import RandomizedSearchCV

# Randomised hyper-parameter search over `random_grid`: 100 sampled settings,
# each evaluated with 5-fold cross-validation on 8 workers.
# The base forest is seeded as well; the search's random_state only fixes which
# settings are sampled, so an unseeded forest would still make the selected
# best parameters vary between runs.
rfreg1 = RandomForestRegressor(random_state=35)
rfreg1_random = RandomizedSearchCV(
    estimator=rfreg1,
    param_distributions=random_grid,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=35,
    n_jobs=8,
)

In [246]:
import time

# Run the randomised search and report how long it took as HH:MM:SS.
# perf_counter is a monotonic clock, so the measurement is unaffected by
# system clock adjustments during the (long) fit.
start = time.perf_counter()

rfreg1_random.fit(x_train, y_train)

elapsed = time.perf_counter() - start
# Whole seconds only: the previous format string appended fractional digits
# and then truncated them away again, so they never reached the output.
# divmod also keeps the hour count correct for runs longer than a day.
elapsed_hours, leftover_seconds = divmod(int(elapsed), 3600)
elapsed_minutes, elapsed_seconds = divmod(leftover_seconds, 60)
print(f"Elapsed time: {elapsed_hours:02d}:{elapsed_minutes:02d}:{elapsed_seconds:02d}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Elapsed time: 00:12:18


In [247]:
# Show the hyper-parameter combination selected by the randomised search.
print('Best Parameters: ', rfreg1_random.best_params_, ' \n')

Best Parametersa:  {'n_estimators': 30, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 30, 'bootstrap': True}  



In [260]:
from numpy import mean, std

# Summarise the cross-validation scores as "mean (standard deviation)".
scores_mean = mean(scores)
scores_std = std(scores)
print('R2: %.3f (%.3f)' % (scores_mean, scores_std))

R2: -3.973 (8.810)


In [None]:
# Load the training CSV and integer-encode its categorical columns in place.
# NOTE(review): the test CSV is encoded further down with a freshly fitted
# encoder, so the same category can receive different codes in the two files —
# confirm this is intended.
temp = pd.read_csv('craigslist_train.csv')
labelencoder = LabelEncoder()
train_categorical_columns = ['fuel', 'paint color', 'transmission', 'brand', 'model', 'variant']
temp[train_categorical_columns] = temp[train_categorical_columns].apply(
    lambda column: labelencoder.fit_transform(column)
)

In [None]:
# Persist the label-encoded training frame; the row index is not written.
temp.to_csv('temp.csv', index=False)

In [None]:
# Load the test CSV and integer-encode its categorical columns in place.
# NOTE(review): this encoder is fitted on the test file alone, independently of
# the one used for the training CSV, so category codes are not guaranteed to
# agree between the two files — confirm this is intended.
temp1 = pd.read_csv('craigslist_test.csv')
labelencoder = LabelEncoder()
test_categorical_columns = ['fuel', 'paint color', 'transmission', 'brand', 'model', 'variant']
temp1[test_categorical_columns] = temp1[test_categorical_columns].apply(
    lambda column: labelencoder.fit_transform(column)
)

In [None]:
# Persist the label-encoded *test* frame. The original wrote `temp` (the
# training frame) here, so temp1.csv ended up as a copy of temp.csv.
temp1.to_csv('temp1.csv', index=False)