In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import random
import gc
import warnings
import joblib
from tqdm.notebook import tqdm
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold,cross_val_score,GridSearchCV,train_test_split
from sklearn.ensemble import ExtraTreesRegressor, StackingRegressor
from lightgbm import LGBMRegressor
#from xgboost import XGBRegressor
#from catboost import CatBoostRegressor
# Use the ggplot look for every matplotlib figure drawn below.
plt.style.use('ggplot')
# Silence all Python warnings for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas / LightGBM — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
# Show the current working directory; the relative paths used below are resolved against it.
os.getcwd()

### Preprocessing

In [None]:
# Read the raw car-price table from the relative input directory and preview the first rows.
dataset = pd.read_csv(filepath_or_buffer='../input/car-price-prediction/CarPrice_Assignment.csv')
dataset.head()

In [None]:
# Print column dtypes, non-null counts and memory usage of the raw frame.
dataset.info()

Feature Split

In [None]:
# Split the raw frame into predictors and target, then bucket the predictors by dtype.
# NOTE: this cell rebinds `dataset` to the predictors only, so it cannot be re-run
# without reloading the CSV first (`dataset.price` no longer exists afterwards).
features = [col for col in dataset.columns.tolist() if col not in ['price','car_ID']]
target = dataset.price

# Explicit copy: later cells assign into `dataset` via .loc, which would otherwise be
# chained assignment on a slice of the raw frame (SettingWithCopyWarning, hidden by the
# global warning filter).
dataset = dataset[features].copy()

# A column is numerical when its dtype is one of the listed int/float types;
# every other column is treated as categorical.
numerical_features = [col for col in features
                      if dataset[col].dtype in ['int64', 'int32', 'float64', 'float32']]
categorical_features = [col for col in features if col not in numerical_features]

print('There are {:} numerical features.'.format(len(numerical_features)))
print('There are {:} categorical features.'.format(len(categorical_features)))

In [None]:
# Frequency of each distinct CarName value.
dataset.CarName.value_counts()

In [None]:
# Preview: the first space-separated token of the CarName in the row labelled 0.
dataset.loc[0,'CarName'].split(' ')[0]

Extract the first word of `CarName`

In [None]:
# Keep only the first space-separated token of every CarName.
# Vectorised string ops replace the per-row .loc loop: same result, but no dependence
# on the index being exactly 0..n-1 and no Python-level iteration.
dataset['CarName'] = dataset['CarName'].str.split(' ').str[0]

dataset[categorical_features].head()

In [None]:
# Frequency of each CarName value after the extraction cell above has run.
dataset.CarName.value_counts()

In [None]:
# CarName values that are collapsed into a single 'rare' bucket.
# NOTE(review): several entries look like misspellings or case variants of common makes
# (e.g. 'maxda', 'toyouta', 'Nissan'); they are bucketed as 'rare' here rather than merged
# into the correctly spelled make — confirm this is intended.
rare_name = ['isuzu','porsche','jaguar','chevrolet',
             'alfa-romero','maxda','vw','renault',
             'mercury','vokswagen','toyouta','Nissan','porcshce']

# One boolean mask instead of a row-by-row loop: works for any index and is idempotent.
dataset.loc[dataset['CarName'].isin(rare_name), 'CarName'] = 'rare'

In [None]:
# Summary statistics of the categorical columns, transposed so each feature is a row.
dataset[categorical_features].describe().T

In [None]:
# One-hot encode every categorical column (the original columns are replaced by dummies).
# The dummy dtype is pinned: pandas < 2.0 produced uint8 dummies while pandas >= 2.0
# produces bool, and bool columns mixed with float columns turn `.values` into an
# object-dtype array further down.
# NOTE: not re-runnable as-is — after this cell the categorical columns no longer exist.
dataset = pd.get_dummies(dataset, columns=categorical_features, dtype=np.uint8)
dataset.head()

Add features

In [None]:
# Engineered features: bounding-box volume and combined fuel economy.
dataset['carvolume'] = dataset['carlength'] * dataset['carwidth'] * dataset['carheight']
dataset['totalmpg'] = dataset['citympg'] + dataset['highwaympg']

# Register the new columns as numerical exactly once, so re-running this cell does not
# add duplicate names (which would select duplicate columns in the scaling step).
for new_feature in ('carvolume', 'totalmpg'):
    if new_feature not in numerical_features:
        numerical_features.append(new_feature)

Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler

# Everything that is not a numerical feature (i.e. the one-hot dummies) is passed through unscaled.
categorical_features = [col for col in dataset.columns.tolist() if col not in numerical_features]

# Standardise the numerical columns (zero mean, unit variance).
# NOTE(review): the scaler is fitted on all rows before cross-validation, so validation
# folds contribute to the scaling statistics — confirm this is acceptable.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(dataset[numerical_features])

# Keep the original row index: pd.concat(axis=1) aligns on index labels, so a fresh
# 0..n-1 index would misalign (and NaN-pad) rows whenever `dataset` is not range-indexed.
dataset_scaler = pd.DataFrame(scaled_values, columns=numerical_features, index=dataset.index)
dataset_scaler = pd.concat([dataset_scaler, dataset[categorical_features]], axis=1)
dataset_scaler.head()

### Modeling

In [None]:
# Rebind `features` (until now the list of column names) to the model input as a NumPy array.
features = dataset_scaler.values
# Display the shapes of the input matrix and of the target.
features.shape, target.shape

In [None]:
# Candidate random seeds; the cells shown here only use the first one.
seeds = [42, 73, 111, 123, 2021]
SEED = seeds[0]
# Number of rows in the model input matrix.
NUM_TRAIN_SAMPLES = len(features)

In [None]:
def seed_everything(seed):
    """Seed Python's and NumPy's global random generators and export PYTHONHASHSEED.

    Args:
        seed: Integer seed handed to ``random.seed`` and ``np.random.seed``; its string
            form is stored in the ``PYTHONHASHSEED`` environment variable.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    # Only the environment variable is set here; the interpreter reads it at start-up,
    # so it matters for processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    
def count_data_items(data):
    """Return the number of items in ``data`` (anything that supports ``len``)."""
    n_items = len(data)
    return n_items

In [None]:
# Keyword arguments unpacked into LGBMRegressor by the training loop.
# NOTE(review): the entries set to None look like the scikit-learn-style names of
# parameters that are given under another spelling in this dict (e.g. subsample vs
# bagging_fraction) — presumably blanked so the other spelling wins; confirm.
lgb_params = dict(
    bagging_fraction=0.8,
    boosting_type='gbdt',
    colsample_bytree=None,
    feature_fraction=0.7,
    lambda_l1=2,
    lambda_l2=2,
    learning_rate=0.1,
    max_depth=-1,
    metrics='rmse',
    min_child_samples=None,
    min_child_weight=15.586,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=None,
    n_estimators=5000,  # upper bound; the training loop fits with early stopping
    num_leaves=70,
    reg_alpha=None,
    reg_lambda=None,
    seed=7,             # fixed here, independent of the notebook-level SEED
    subsample=None,
    n_jobs=16,
    verbose=-1,
)

Out-of-fold (OOF) training loop **with pseudo-labels**

In [None]:
def train_and_evaluate(folds = 5):
    """Run K-fold LightGBM training with per-fold pseudo-labelling and save OOF results.

    For every fold the function
      1. fits an LGBMRegressor on the training split (early stopping on the validation split),
      2. predicts the validation split and uses those predictions as pseudo-labels,
      3. refits a fresh model on training rows + pseudo-labelled validation rows,
      4. dumps that model to ``./lgb_{fold}_PLabel.pkl`` and scores it on the validation split.

    The out-of-fold targets and predictions are written to ``./lgbm_oof_{SEED}.csv``
    (columns ``Tc`` and ``Tc Prob``); row ``i`` of that file corresponds to row ``i`` of
    ``features``.

    Reads the notebook globals ``features``, ``target``, ``lgb_params`` and ``SEED``.

    NOTE(review): the refitted model has seen the validation features (with pseudo-labels)
    and the validation split also drives early stopping, so the printed fold / OOF scores
    are likely optimistic — confirm this is the intended evaluation.

    Args:
        folds: Number of KFold splits.

    Returns:
        None. Results are printed and written to disk.
    """
    # Positional NumPy view of the target: KFold yields positions, and indexing a pandas
    # Series with ``target[idx]`` would look the values up as labels instead.
    y_all = np.asarray(target)
    # Size the buffers from the data itself rather than from a possibly stale constant.
    n_samples = features.shape[0]
    oof_targets = np.zeros(n_samples)
    oof_predictions = np.zeros(n_samples)

    seed_everything(SEED)
    kfold = KFold(n_splits=folds, shuffle=True, random_state=SEED)

    for fold, (trn_ind, val_ind) in enumerate(kfold.split(features, y_all)):
        print('\n')
        print('-'*50)
        print(f'Training fold {fold + 1} begin :')

        train_x, train_y = features[trn_ind], y_all[trn_ind]
        val_x, val_y = features[val_ind], y_all[val_ind]

        # NOTE(review): `early_stopping_rounds` / `verbose` are fit() keywords of LightGBM 3.x;
        # LightGBM >= 4.0 expects the early-stopping / logging callbacks instead.
        model = LGBMRegressor(**lgb_params)
        model.fit(train_x, train_y, eval_set=(val_x, val_y), early_stopping_rounds=200, verbose=False)
        gc.collect()

        # Pseudo-label the validation rows with the first model's predictions ...
        pseudo_label = model.predict(val_x)
        X_agg = np.concatenate([train_x, val_x], axis=0)
        y_agg = np.concatenate([train_y, pseudo_label], axis=0)
        # ... and train a fresh model on the enlarged set.
        model = LGBMRegressor(**lgb_params)
        model.fit(X_agg, y_agg, eval_set=(val_x, val_y), early_stopping_rounds=200, verbose=False)
        gc.collect()

        joblib.dump(model, f'./lgb_{fold}_PLabel.pkl')

        # Store out-of-fold values at the validation rows' own positions so the OOF
        # arrays stay aligned with the original sample order.
        val_pred = model.predict(val_x)
        oof_targets[val_ind] = val_y
        oof_predictions[val_ind] = val_pred

        print('\n')
        print('-'*50)
        fold_r2_score = r2_score(val_y, val_pred)
        fold_rmse_score = np.sqrt(mean_squared_error(val_y, val_pred))
        print(f'Our fold {fold + 1} rmse score validation is {fold_rmse_score}')
        print(f'Our fold {fold + 1} r2 score validation is {fold_r2_score}')

    print('\n')
    print('-'*50)
    oof_r2_score = r2_score(oof_targets, oof_predictions)
    oof_rmse_score = np.sqrt(mean_squared_error(oof_targets, oof_predictions))
    print(f'Our out of folds rmse score is {oof_rmse_score}')
    print(f'Our out of folds r2 score is {oof_r2_score}')

    print('Saving out of folds to disk...')
    # Column names are kept as-is because the visualisation cells read `Tc` / `Tc Prob`.
    target_columns = ['Tc']
    prediction_columns = [col + ' Prob' for col in target_columns]
    oof_targets_df = pd.DataFrame(oof_targets, columns=target_columns)
    oof_predictions_df = pd.DataFrame(oof_predictions, columns=prediction_columns)

    oof_dataset = pd.concat([oof_targets_df, oof_predictions_df], axis=1)
    oof_dataset.to_csv(f'./lgbm_oof_{SEED}.csv', index=False)

In [None]:
# Train & prediction
# Run the 5-fold training loop: prints per-fold and out-of-fold scores and writes the
# fold models and the OOF CSV to the working directory.
train_and_evaluate(folds = 5)

Visualization

In [None]:
from sklearn.metrics import mean_squared_error as mse
def PerformanceCalculator(trueVals, predVals, name, lims=(2500, 50000)):
    """Scatter predictions against true values and report RMSE / R2.

    Draws a green scatter of ``predVals`` vs ``trueVals`` with a dashed red identity line,
    puts the RMSE (truncated to 4 decimals) in the title, prints the R2 (rounded to
    4 decimals) and shows the figure.

    Args:
        trueVals: Array-like of ground-truth values.
        predVals: Array-like of predicted values, same length as ``trueVals``.
        name: Label appended to the title and to the printed R2 line.
        lims: ``(low, high)`` used for both axes and for the identity line. Points
            outside this range are not visible. Defaults to the previously
            hard-coded ``(2500, 50000)``.
    """
    er = mse(trueVals, predVals)
    er = pow(er, 0.5)
    er = int(er * 10000) / 10000  # truncates (does not round) to 4 decimals
    r2 = np.round(r2_score(trueVals, predVals), 4)

    low, high = lims
    plt.scatter(trueVals, predVals, color = 'green')
    # Identity line: a perfect model would put every point on it.
    plt.plot([low, high], [low, high], '--', lw=2, c='r')
    plt.xlim(low, high)
    plt.ylim(low, high)
    plt.xlabel('True value')
    plt.ylabel('Predicted value')
    plt.title('RMSE: ' + str(er) + ' for '+ name)
    print('R2: ' + str(r2) + ' for '+ name)
    plt.show()

In [None]:
# Load the out-of-fold file written by the training loop for the current SEED and preview it.
oof_dataset = pd.read_csv(filepath_or_buffer=f'./lgbm_oof_{SEED}.csv')
oof_dataset.head()

In [None]:
# Plot out-of-fold predictions (`Tc Prob`) against the true targets (`Tc`) and report RMSE / R2.
PerformanceCalculator(oof_dataset.Tc, oof_dataset['Tc Prob'], 'LGBM single model')