In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for file_name in files:
        full_path = os.path.join(folder, file_name)
        print(full_path)

In [None]:
from shutil import copyfile
# Copy the helper module out of the dataset directory so that the
# `from data_proc import *` line just below can import it
# (presumably ../working is the notebook's working directory — TODO confirm).
copyfile(src = "../input/dataproc/data_proc.py", dst = "../working/data_proc.py")
from data_proc import *

from sklearn.model_selection import train_test_split
from fastai.tabular.all import *
from fastai.callback import *
import fastai ; print(fastai.__version__)

## Loading ML tools
from sklearn.linear_model import LogisticRegression #logistic regression
from sklearn import svm #support vector Machine
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier #KNN
from sklearn.naive_bayes import GaussianNB #Naive bayes
from sklearn.tree import DecisionTreeClassifier #Decision Tree
from sklearn.tree import DecisionTreeRegressor #Decision Tree regressor
from sklearn.model_selection import train_test_split #training and testing data split
from sklearn import metrics #accuracy measure
from sklearn.metrics import confusion_matrix #for confusion matrix
from sklearn.metrics import mean_absolute_error

#pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Directory that holds the Craigslist cars/trucks dataset.
PATH = Path('../input/craigslist-carstrucks-data')

# Read vehicles.csv with datatable; the resulting frame is converted to pandas
# in the following cell.
import datatable as dt
data_tbl = dt.fread(PATH/"vehicles.csv")

In [None]:
# Convert the datatable frame to a pandas DataFrame and preview the first rows,
# transposed so that each column of the dataset is shown as a row.
df_raw=data_tbl.to_pandas()
df_raw.head().T

In [None]:
# Columns that are not used as model inputs.
drop_columns = ['C0','id','url','region_url','image_url','VIN','description','lat','long','posting_date']

# Drop the unused columns and append the vehicle age (relative to 2020) as a float column.
df_cars_raw = (
    df_raw
    .drop(columns=drop_columns)
    .assign(age=lambda frame: (2020 - frame['year']).astype(float))
)
df_cars_raw.tail().T

In [None]:
# Count the missing values per column; several columns still contain gaps,
# which are handled later during preprocessing.
df_cars_raw.isna().sum()

# FASTAI tabular model

In [None]:
# Split the listings by whether the asking price looks usable.
#   df_cars          : 100 <= price <= 100000, used for training / validation / test
#   df_cars_no_price : price < 100 or price > 100000, predicted at the end of the notebook
# The original filters used strict inequalities on both sides, so listings priced at
# exactly 100 or exactly 100000 ended up in neither frame. The valid range is now
# inclusive, which makes the two frames complementary for every non-missing price.
# `.copy()` replaces the argument-less `.reindex()` (a no-op) so that columns can be
# added to either frame later without a SettingWithCopyWarning.
listing_price = df_cars_raw['price']
df_cars_no_price = df_cars_raw[(listing_price < 100) | (listing_price > 100000)].copy()
df_cars = df_cars_raw[(listing_price >= 100) & (listing_price <= 100000)].copy()

In [None]:
# Sanity check: rows of df_cars priced below 100 (expected to be empty).
df_cars[df_cars['price'] < 100]

In [None]:
# Regression targets spanning several orders of magnitude are usually modelled on
# a log scale, so add the natural log of the price as the training target.
# `assign` builds a new frame instead of writing into the filtered frame, which
# avoids pandas' SettingWithCopyWarning and keeps the cell safe to re-run.
df_cars = df_cars.assign(price_log=np.log(df_cars['price']))
df_cars.head()

In [None]:
# Histogram of the raw price (100 bins).
df_cars.hist(column='price', bins=100, figsize=(12, 7), alpha=0.75)
plt.ylabel('Number of objects')
plt.title('price distribution')
plt.xlabel("total_price")

In [None]:
# Histogram of the log-transformed price (100 bins).
df_cars.hist(column='price_log', bins=100, figsize=(12, 7), alpha=0.75)
plt.ylabel('Number of objects')
plt.title('price log distribution')
plt.xlabel("total_price_log")

# Memory adjustment: drop variables that are no longer used.

In [None]:
#drop unused variables from memory
import sys
def sizeof_fmt(num, suffix='B'):
    """Format a size as a human-readable string with binary (1024-based) prefixes.

    Args:
        num: Size to format; negative values keep their sign.
        suffix: Unit suffix appended after the prefix (default 'B').

    Returns:
        A string such as '1.5 KiB'. Values whose magnitude is still >= 1024
        after the 'Zi' step are expressed with the 'Yi' prefix.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    position = 0
    while position < len(prefixes):
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, prefixes[position], suffix)
        # Too large for this prefix: scale down and try the next one.
        num /= 1024.0
        position += 1
    return "%.1f %s%s" % (num, 'Yi', suffix)

# Report the ten largest objects in the current namespace, biggest first.
for name, size in sorted(
        ((name, sys.getsizeof(value)) for name, value in locals().items()),
        key=lambda entry: entry[1],
        reverse=True)[:10]:
    print(f"{name:>30}: {sizeof_fmt(size):>8}")

In [None]:
# Drop the references to the raw frames; only the filtered frames are used from here on.
del df_raw, data_tbl, df_cars_raw


# Split data

In [None]:
# Split df_cars into train / validation / test = 70% / 20% / 10%.
# A fixed seed makes the split (and every score computed from it) reproducible
# after Restart & Run All.
SPLIT_SEED = 42

print("Total lenght car_df:", len(df_cars))

# Reserve 10% of all rows as the final test set.
df_training, df_test = train_test_split(df_cars, test_size=0.1, random_state=SPLIT_SEED)

# The validation set should be 20% of the *whole* data set. Only 90% of the rows
# remain here, so the fraction to take is 0.2 / 0.9 (the previous value of 0.25
# produced a 22.5% validation share).
df_train, df_valid = train_test_split(df_training, test_size=0.2 / 0.9, random_state=SPLIT_SEED)
print("Split lenght: Train:", len(df_train)," Valid:",len(df_valid)," Test:", len(df_test))

# Preprocessing

In [None]:

# The model is trained on price_log, so the raw price is dropped to keep it out
# of the feature matrix.
df_train = df_train.drop(columns=['price'])
train_cats(df_train)
train_df, train_target, nas = proc_df(df_train, 'price_log')

df_valid = df_valid.drop(columns=['price'])
train_cats(df_valid)
# train_cats(df_valid) works on the validation frame alone, so a category value
# could receive a different integer code than in the training frame. Re-cast every
# column that is categorical in df_train to the training dtype: codes then line up,
# and values never seen in training become missing.
for col, train_dtype in df_train.dtypes.items():
    if isinstance(train_dtype, pd.CategoricalDtype) and col in df_valid.columns:
        df_valid[col] = df_valid[col].astype(train_dtype)
# NOTE(review): the third return value (`nas`) of the training call is overwritten
# here, as before. If proc_df accepts the training `nas` as input, passing it would
# keep the missing-value handling consistent — confirm against data_proc.py.
valid_df, valid_target, nas = proc_df(df_valid, 'price_log')

# Save the feature column names; they are restored after normalization.
culumn_names = train_df.columns[0:]
culumn_names

# Normalization

In [None]:
# Last rows of the training features before normalization, for comparison with
# the same preview shown after scaling.
train_df.tail()

In [None]:
import pandas as pd
from sklearn import preprocessing

# Features and target need separate scalers. Re-fitting a single scaler on the
# target (as was done before) throws away the feature scaling parameters, which
# are still needed to transform unseen feature matrices later.
feature_scaler = preprocessing.MinMaxScaler()
target_scaler = preprocessing.MinMaxScaler()

# Fit on the training data only; the validation data reuses the fitted parameters.
train_df = pd.DataFrame(feature_scaler.fit_transform(train_df.values))
valid_df = pd.DataFrame(feature_scaler.transform(valid_df.values))
train_target = pd.DataFrame(target_scaler.fit_transform(train_target[:, None]))
valid_target = pd.DataFrame(target_scaler.transform(valid_target[:, None]))

# Backward-compatible name: after this cell `min_max_scaler` has always been the
# scaler fitted on the target, and later cells use it to invert predictions.
# It must NOT be used to transform feature matrices.
min_max_scaler = target_scaler

In [None]:
# Restore the feature column names: the normalized frames were rebuilt from bare
# arrays, so their columns are plain integer positions at this point.
train_df.columns = culumn_names
valid_df.columns = culumn_names


In [None]:
# Last rows of the training features after min-max normalization.
train_df.tail()

In [None]:
# Flatten the single-column target frames into 1-D arrays for the estimators.
train_target = train_target.to_numpy().ravel()
valid_target = valid_target.to_numpy().ravel()

# Baseline models


In [None]:
# Baseline: ordinary least-squares linear regression on the normalized features.
# `linear_model` is imported here explicitly; no import of that name is visible
# in this notebook, so the cell otherwise relies on a star import providing it.
from sklearn import linear_model

regr = linear_model.LinearRegression()
regr.fit(train_df, train_target)
print_score(regr,train_df, valid_df,train_target, valid_target)


# Random Forest Regressor

In [None]:
# First random forest: trees limited to depth 30, trained on all available cores.
# NOTE(review): no random_state is set, so scores will vary between runs.
random_forest = RandomForestRegressor( n_jobs=-1, max_depth=30)
%time random_forest.fit(train_df, train_target)

# print_score is not defined in this notebook (presumably it comes from data_proc).
print('Basic RandomForest model stats:')
print_score(random_forest,train_df, valid_df,train_target, valid_target)

In [None]:
# One row of validation predictions per tree of the forest.
# Bare arrays are passed because the individual trees were fitted on arrays, not
# on a DataFrame with column names.
preds = np.stack([t.predict(valid_df.values) for t in random_forest.estimators_])

# R^2 on the validation set when averaging only the first i+1 trees. The curve now
# covers every tree in the forest instead of a hard-coded 40, which did not have to
# match the number of estimators.
plt.plot([metrics.r2_score(valid_target, np.mean(preds[:i+1], axis=0)) for i in range(len(preds))]);

# Feature importance

In [None]:
# Feature importances of the fitted forest; show the first 40 entries.
fi = rf_feat_importance(random_forest, train_df)
fi[:40]

# Tuning RandomForestRegressor hyperparameters

In [None]:
# Search space for the RandomForestRegressor randomized search.
from pprint import pprint  # explicit import; no import of pprint is visible in this notebook

max_depth = [10,20,25,30,40,50]
min_samples_split = [2, 5, 10]
max_leaf_nodes = [500,1000, 1500,2000]
min_samples_leaf = [1, 2, 4]
n_estimators = [10,20,40,60,80]
# max_features:
#   'sqrt' / 'log2' : square root / base-2 log of the number of features
#                     ('log' is not a valid option; candidates using it fail to fit)
#   0.5             : half of the features
#   1               : a single feature per split (integer count)
#   1.0             : all features; this is what 'auto' meant for regressors, and
#                     'auto' is no longer accepted by recent scikit-learn releases
max_features = ['sqrt', 'log2', 0.5, 1, 1.0]

hyperparameters = {'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'max_leaf_nodes': max_leaf_nodes,
                   'min_samples_leaf': min_samples_leaf,
                   'n_estimators': n_estimators,
                   'max_features': max_features
              }
print('Hyperparameters:')
pprint(hyperparameters )

In [None]:
# Randomized search over the `hyperparameters` space: 10 sampled candidates,
# 2-fold cross-validation, ranked by negative mean absolute error.
# random_state fixes which candidates are sampled; the forest itself is unseeded.
random_forest = RandomForestRegressor(n_jobs=-1)
params_search = RandomizedSearchCV(estimator=random_forest,
                                   param_distributions=hyperparameters ,
                                   cv = 2, n_iter = 10, random_state=158,
                                   scoring = 'neg_mean_absolute_error')
params_search.fit(train_df, train_target)
# Report the winning estimator, its cross-validated score and its parameters.
print(" RandomizedSearchCV results " )
print("\n The best estimator across ALL searched params:\n", params_search.best_estimator_)
print("\n The best score across ALL searched params:\n", params_search.best_score_)
print("\n The best parameters across ALL searched params:\n", params_search.best_params_)

> # Running random forest models with the best parameters

In [None]:
# Random forest with hard-coded hyper-parameters
# (presumably copied from an earlier randomized-search run — TODO confirm).
random_forest = RandomForestRegressor(max_depth=20, max_features=0.5, max_leaf_nodes=2000,
                      min_samples_split=5, n_estimators=10, n_jobs=-1)
%time random_forest.fit(train_df, train_target)

# NOTE(review): the label still says 'Basic' although this is a tuned model.
print('Basic RandomForest model stats:')
print_score(random_forest,train_df, valid_df,train_target, valid_target)

In [None]:
# Second hard-coded configuration: deeper trees, 60 estimators, larger leaves.
# This is the forest that the prediction cells near the end of the notebook use,
# because it is the last object bound to `random_forest`.
random_forest = RandomForestRegressor(max_depth=50, max_features=0.5, max_leaf_nodes=1500,min_samples_leaf=4,
                      min_samples_split=5, n_estimators=60, n_jobs=-1)
%time random_forest.fit(train_df, train_target)

# NOTE(review): the label still says 'Basic' although this is a tuned model.
print('Basic RandomForest model stats:')
print_score(random_forest,train_df, valid_df,train_target, valid_target)

# XGBoost 

In [None]:
from xgboost import XGBRegressor,XGBClassifier

# XGBoost baseline: up to 500 boosting rounds at learning rate 0.1. Training stops
# early once the metric on the validation set has not improved for 5 rounds.
# NOTE(review): the validation set is used both for early stopping and for the
# score reported below, so that score is optimistic.
xgb_model = XGBRegressor(n_estimators=500,n_jobs=-1,learning_rate=0.1)
xgb_model.fit(train_df, train_target, 
             early_stopping_rounds=5, 
             eval_set=[(valid_df, valid_target)],
             verbose=False)



# Keep whatever print_score returns (print_score is not defined in this notebook).
score_board = print_score(xgb_model,train_df, valid_df,train_target, valid_target)

In [None]:
# Unfitted XGBoost regressor and the search space for the randomized search below.
xgb_model = XGBRegressor(n_jobs=-1)

param_grid = {
        'max_depth': [5, 10, 15, 20, 30],
        # 0.3 was previously written as `0,3`, which put the two separate values
        # 0 (no learning at all) and 3 (diverging) into the grid.
        'learning_rate': [0.05, 0.1, 0.2, 0.3],
        'n_estimators': [100, 500, 1000, 1500, 2000]}


In [None]:
# Randomized search over `param_grid` for the XGBoost regressor: 20 sampled
# candidates, 2-fold cross-validation, no refit of the best model.
import time  # explicit import; no import of time is visible in this notebook

# The estimator is a regressor, so candidates are ranked with a regression metric.
# The previous 'neg_log_loss' scorer and 'mlogloss' eval metric are classification
# metrics and cannot be evaluated for a regressor. Mean absolute error matches the
# scoring used for the random-forest search.
rs_clf = RandomizedSearchCV(xgb_model, param_grid, n_iter=20,
                            n_jobs=-1, verbose=2, cv=2,
                            scoring='neg_mean_absolute_error', refit=False, random_state=42)
print("Randomized search..")
search_time_start = time.time()
rs_clf.fit(train_df, train_target, early_stopping_rounds=5,
           eval_set=[(valid_df, valid_target)],
           verbose=False, eval_metric='mae')
print("Randomized search time:", time.time() - search_time_start)

best_params = rs_clf.best_params_

print("Best params: ")
for param_name in sorted(best_params.keys()):
    print('%s: %r' % (param_name, best_params[param_name]))

In [None]:
# XGBoost with hard-coded hyper-parameters
# (presumably taken from the randomized search above — TODO confirm).
# Training stops early after 10 rounds without improvement on the validation set.
xgb_model = XGBRegressor(n_estimators=1500,n_jobs=-1,learning_rate=0.05,max_depth= 20)
xgb_model.fit(train_df, train_target, 
             early_stopping_rounds=10, 
             eval_set=[(valid_df, valid_target)],
             verbose=False)


# Keep whatever print_score returns (print_score is not defined in this notebook).
score_board = print_score(xgb_model,train_df, valid_df,train_target, valid_target)


# Testing

In [None]:
# Preprocessing of the held-out test set.

# Use a fresh 0..n-1 index so the predictions can be attached row by row later.
df_test = df_test.reset_index(drop=True)

# Feature frame for the model: everything except the raw price.
df_test_to_model = df_test.drop(columns='price')

train_cats(df_test_to_model)
# Give categorical columns the same categories (and therefore the same integer
# codes) as the training frame; values unseen in training become missing.
for col, train_dtype in df_train.dtypes.items():
    if isinstance(train_dtype, pd.CategoricalDtype) and col in df_test_to_model.columns:
        df_test_to_model[col] = df_test_to_model[col].astype(train_dtype)
df_test_to_model, test_target, nas = proc_df(df_test_to_model, 'price_log')

# Normalize the features.
# `min_max_scaler` is fitted on the single-column target at this point, so it
# cannot be used for the feature matrix. A dedicated feature scaler is fitted on
# the un-normalized training features instead.
# NOTE(review): this assumes proc_df leaves df_train unchanged, so that calling it
# again reproduces the original training features — confirm against data_proc.py.
feature_scaler = preprocessing.MinMaxScaler().fit(proc_df(df_train, 'price_log')[0].values)
df_test_to_model = pd.DataFrame(feature_scaler.transform(df_test_to_model.values),
                                columns=df_test_to_model.columns)


In [None]:
# Predict
y_pred = random_forest.predict(df_test_to_model)
#inverse normalization
y_pred_inverse= pd.DataFrame(min_max_scaler.inverse_transform(y_pred[:,None]))


df_test['predicted_price'] =np.exp(y_pred_inverse).round()


In [None]:
# Actual vs. predicted price next to the main vehicle attributes, first 20 test rows.
df_test[['price','predicted_price','region','year','manufacturer','model','condition','cylinders','fuel','title_status', 'transmission','drive', 'size', 'type', 'state']].head(20)

Predictions for the listings that had no usable price

In [None]:
# Preprocessing of the listings whose price was outside the accepted range.

# Use a fresh 0..n-1 index so the predictions can be attached row by row later.
df_cars_no_price = df_cars_no_price.reset_index(drop=True)

# proc_df is called with 'price_log' as the target column, so the column has to
# exist. log1p is used because these prices can be 0; the resulting target values
# are not used for the prediction itself.
df_cars_no_price["price_log"] = np.log1p(df_cars_no_price['price'])

# Feature frame for the model: everything except the raw price.
df_cars_no_price_to_model = df_cars_no_price.drop(columns='price')

train_cats(df_cars_no_price_to_model)
# Give categorical columns the same categories (and therefore the same integer
# codes) as the training frame; values unseen in training become missing.
for col, train_dtype in df_train.dtypes.items():
    if isinstance(train_dtype, pd.CategoricalDtype) and col in df_cars_no_price_to_model.columns:
        df_cars_no_price_to_model[col] = df_cars_no_price_to_model[col].astype(train_dtype)
df_cars_no_price_to_model, test_target, nas = proc_df(df_cars_no_price_to_model, 'price_log')

# Normalize the features.
# `min_max_scaler` is fitted on the single-column target at this point, so it
# cannot be used for the feature matrix. A dedicated feature scaler is fitted on
# the un-normalized training features instead.
# NOTE(review): this assumes proc_df leaves df_train unchanged, so that calling it
# again reproduces the original training features — confirm against data_proc.py.
feature_scaler = preprocessing.MinMaxScaler().fit(proc_df(df_train, 'price_log')[0].values)
df_cars_no_price_to_model = pd.DataFrame(feature_scaler.transform(df_cars_no_price_to_model.values),
                                         columns=df_cars_no_price_to_model.columns)


In [None]:
# Predict
y_pred = random_forest.predict(df_cars_no_price_to_model)
#inverse normalization
y_pred_inverse= pd.DataFrame(min_max_scaler.inverse_transform(y_pred[:,None]))


df_cars_no_price['predicted_price'] =np.expm1(y_pred_inverse).round()


In [None]:
# Author's observation: the theory that the listed price did not reflect the
# vehicle's condition was confirmed — the predicted price is greater than 0.
# Shows listed vs. predicted price with the main attributes for the last 20 rows.
df_cars_no_price[['price','predicted_price','region','year','manufacturer','model','condition','cylinders','fuel',
                  'title_status', 'transmission','drive', 'size', 'type', 'state']].tail(20)