Ref Articles 
- https://towardsdatascience.com/custom-transformers-and-ml-data-pipelines-with-python-20ea2a7adb65
- https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/
- https://discuss.analyticsvidhya.com/t/what-is-the-difference-between-predict-and-predict-proba/67376
- https://github.com/AnilBetta/AV-Janata-Hack-healh-Care-2/blob/master/av-jh-hca2-cat.ipynb
- https://github.com/gcspkmdr/HA-Hackathon

In [None]:
import pandas as pd
import itertools as it
import numpy as np
import math
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar


# Feature preparation for https://www.kaggle.com/c/DontGetKicked
VER = '01'  # version tag embedded in the output CSV names


### Load data:
target_name = 'IsBadBuy'
tr = pd.read_csv('../input/DontGetKicked/training.csv')

# Detach the label so the train and test frames share the same columns.
target = tr[target_name].copy(deep=True)
tr = tr.drop(columns=[target_name])
tr_length = len(tr)  # remembered so the combined frame can be split again

te = pd.read_csv('../input/DontGetKicked/test.csv')


### Concat train and test (train rows first, fresh 0..n-1 index):
df = pd.concat([tr, te], ignore_index=True)


### Drop RefId and WheelTypeID (the original note says WheelTypeID == WheelType):
df = df.drop(columns=['RefId', 'WheelTypeID'])




### Date features:
df['PurchDate'] = pd.to_datetime(df['PurchDate'])

# Expand the purchase date into its calendar components.
purchase = df['PurchDate'].dt
for column, part in [
    ('Purch_Year', 'year'),
    ('Purch_Month', 'month'),
    ('Purch_Day', 'day'),
    ('Purch_DayOfWeek', 'dayofweek'),
    ('Purch_DayOfYear', 'dayofyear'),
]:
    df[column] = getattr(purchase, part)

# Purchase year minus the vehicle's model year.
df['Age_At_Purchase'] = df['Purch_Year'] - df['VehYear']

# Round the odometer reading up to the next multiple of 1000.
df['VehOdo'] = df['VehOdo'].map(lambda reading: 1000 * int(math.ceil(reading / 1000.0)))


### Holidays:
def holidays(start='2000-07-01', end='2019-07-31'):
    """Return the US federal holiday dates between ``start`` and ``end``.

    Parameters
    ----------
    start, end : str or datetime-like
        Inclusive bounds of the window handed to the holiday calendar.
        The defaults reproduce the window this script has always used.

    Returns
    -------
    pandas.DatetimeIndex
        Holiday dates reported by ``USFederalHolidayCalendar`` (imported in
        this file as ``calendar``) inside the window.
    """
    # The calendar only needs the two endpoints, so no intermediate
    # day-by-day date range is built.
    return calendar().holidays(start=pd.Timestamp(start), end=pd.Timestamp(end))

# Flag rows whose purchase date is one of the dates returned by holidays().
holiday_dates = holidays()
df['Holiday'] = df['PurchDate'].isin(holiday_dates)


### Ratio features:
def ratio(col1: str, col2: str, df):
    """Return the relative difference ``(df[col1] - df[col2]) / df[col1]``.

    ``col1`` is both the minuend and the denominator; the result is a Series
    aligned with ``df``.
    """
    base = df[col1]
    difference = base - df[col2]
    return difference / base

# The eight MMR price columns (spelling 'Acquisiton' kept exactly as the
# column is referenced here; presumably it matches the CSV header).
Price_List = [
    'MMRAcquisitionAuctionAveragePrice',
    'MMRAcquisitionAuctionCleanPrice',
    'MMRAcquisitionRetailAveragePrice',
    'MMRAcquisitonRetailCleanPrice',
    'MMRCurrentAuctionAveragePrice',
    'MMRCurrentAuctionCleanPrice',
    'MMRCurrentRetailAveragePrice',
    'MMRCurrentRetailCleanPrice',
]

# Treat a price of exactly 0 as missing so ratio() never divides by zero.
for price_col in Price_List:
    df[price_col] = df[price_col].map(lambda price: price if price != 0 else np.nan)

# Relative difference for every unordered pair of price columns; the new
# column is named after the string form of the (first, second) tuple.
for first, second in it.combinations(Price_List, 2):
    df[str((first, second))] = ratio(first, second, df)

# Relative difference between each price column and the purchase cost.
for price_col in Price_List:
    df['VehBCost_R_' + price_col] = ratio(price_col, 'VehBCost', df)


### Fractional features:
def frac(col1: str, col2: str, df):
    """Return the element-wise quotient ``df[col1] / df[col2]`` as a Series."""
    numerator, denominator = df[col1], df[col2]
    return numerator.div(denominator)

# Each pair (numerator, denominator) becomes a '<numerator>_f_<denominator>' column.
for numerator_col, denominator_col in [
    ('Age_At_Purchase', 'VehicleAge'),
    ('Age_At_Purchase', 'VehOdo'),
    ('WarrantyCost', 'VehBCost'),
    ('VehBCost', 'VehOdo'),
    ('VehBCost', 'VehicleAge'),
]:
    df[f'{numerator_col}_f_{denominator_col}'] = frac(numerator_col, denominator_col, df)


### Submodel features:
def string_detector(str_to_check: str, frame=None):
    """Flag rows whose ``SubModel`` text contains ``str_to_check``.

    Parameters
    ----------
    str_to_check : str
        Substring to look for (case-sensitive, plain ``in`` test).
    frame : pandas.DataFrame, optional
        Frame holding a ``SubModel`` column. When omitted, the module-level
        ``df`` is used, which keeps existing one-argument calls working.

    Returns
    -------
    pandas.Series
        1 where the substring occurs in ``str(SubModel)``, otherwise 0.
    """
    source = df if frame is None else frame
    # str() makes missing values comparable instead of raising on a float NaN.
    return source['SubModel'].map(lambda sub_model: int(str_to_check in str(sub_model)))

# Manually defined tokens looked up in SubModel; each becomes a 0/1 column
# named after the token itself.
strings_to_check = (
    '4D 2D CAB WAGON CONVERTIBLE '
    'HATCHBACK CREW SEDAN SUV MINIVAN '
    'PASSENGER REG QUAD UTILITY CARGO '
    'EXT SPORT COUPE CUV DOUBLE'
).split()
for token in strings_to_check:
    df[token] = string_detector(token)

# Store BYRNO as text rather than as a number.
# NOTE(review): the frame is written to CSV below and read back without a
# dtype, so this cast may not survive the round trip; confirm on reload.
df['BYRNO'] = df['BYRNO'].astype(str)


### Split back to train & test, saving:
# The first tr_length rows are the training rows (train was concatenated first).
tr = df.iloc[:tr_length].copy(deep=True)
tr[target_name] = target  # re-attach the label removed before the concat
tr.to_csv(f'Carv_train_{VER}.csv', index=False)

te = df.iloc[tr_length:].copy(deep=True)
te.to_csv(f'Carv_test_{VER}.csv', index=False)

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
#from catboost import CatBoostClassifier
#from sklearn.model_selection import StratifiedKFold,KFold,GroupKFold
#from sklearn.metrics import accuracy_score

#Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

#For Missing Value and Feature Engineering
from sklearn.feature_selection import SelectKBest, chi2, f_classif, VarianceThreshold
from sklearn.impute import SimpleImputer, KNNImputer, MissingIndicator
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder, MinMaxScaler
from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

import time


## Import Data

In [None]:
# BYRNO was cast to str before these files were written; without an explicit
# dtype read_csv would infer it as a number again and it would be treated as
# a numeric feature instead of a categorical one.
train = pd.read_csv("./Carv_train_01.csv", dtype={'BYRNO': str})
test = pd.read_csv("./Carv_test_01.csv", dtype={'BYRNO': str})

In [None]:
# Preview the first rows of the engineered training frame.
train.head()

## SMOTE

In [None]:
#insert code

## Feat Engineering

In [None]:
# Date
#PurchDate

In [None]:
# For each grouping column, attach the group-wise mean of the acquisition
# auction average price as 'mean_MMRAcquisitionAuctionAveragePrice_<column>'.
price_col = 'MMRAcquisitionAuctionAveragePrice'
for group_col in ['Make', 'Model', 'Trim', 'SubModel', 'Color', 'Transmission']:
    train[f'mean_{price_col}_{group_col}'] = (
        train.groupby([group_col])[price_col].transform('mean')
    )

In [None]:
# Same group-mean features for the test frame; the means are computed from
# the test rows themselves, not reused from train.
price_col = 'MMRAcquisitionAuctionAveragePrice'
for group_col in ['Make', 'Model', 'Trim', 'SubModel', 'Color', 'Transmission']:
    test[f'mean_{price_col}_{group_col}'] = (
        test.groupby([group_col])[price_col].transform('mean')
    )

## Divide Dataset into X and Y

In [None]:
# Separate the label column (y) from the predictor columns (X).
y = train['IsBadBuy']
X = train.drop(columns=['IsBadBuy'])

In [None]:
# Snapshot the predictor column labels (a pandas Index at this point).
all_features = X.columns

In [None]:
# Convert the column labels into a plain Python list.
all_features = all_features.tolist()

In [None]:
# Columns whose dtype kind is signed integer ('i') or float ('f') are numeric;
# every other column (object, bool, ...) is handled as categorical.
numeric_kinds = ('i', 'f')
numerical_features = [
    name for name, dtype in X.dtypes.items()
    if dtype.kind in numeric_kinds and name != 'PassengerId'
]
categorical_features = [
    name for name, dtype in X.dtypes.items()
    if dtype.kind not in numeric_kinds
]

In [None]:
# Display the columns that will go through the numeric branch.
numerical_features

In [None]:
# Display the columns that will go through the categorical branch.
categorical_features

In [None]:
# X is intentionally left un-encoded here. Calling pd.get_dummies(X) replaces
# the raw categorical columns with dummy columns, but the column transformer
# selects columns by the names in `categorical_features`, and the submission
# step indexes `test` with X's columns; both fail once the raw columns are
# gone. One-hot encoding is done by the OneHotEncoder inside the pipeline.


In [None]:
# Hold out 30% of the rows for evaluation, with a fixed seed for the shuffle.
from sklearn.model_selection import train_test_split

split_options = {'test_size': 0.3, 'random_state': 0}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## Setup Pipeline 

In [None]:
# Numeric branch: KNN imputation (2 neighbours, uniform weights), then
# min-max scaling. A median SimpleImputer was the earlier alternative.
numeric_steps = make_pipeline(
    KNNImputer(n_neighbors=2, weights="uniform"),
    MinMaxScaler(),
)

# Categorical branch: fill gaps with the literal 'missing', then one-hot
# encode, ignoring categories not seen during fit.
categorical_steps = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='missing'),
    OneHotEncoder(categories='auto', handle_unknown='ignore'),
)

preprocessor = make_column_transformer(
    (numeric_steps, numerical_features),
    (categorical_steps, categorical_features),
)

In [None]:
# After preprocessing: drop zero-variance columns (VarianceThreshold default),
# then keep the 50 columns with the highest ANOVA F-score.
feature_selection = [VarianceThreshold(), SelectKBest(f_classif, k=50)]
preprocessor_best = make_pipeline(preprocessor, *feature_selection)

In [None]:
from lightgbm import LGBMClassifier

# NOTE: despite the RF_ prefix, the final estimator is a LightGBM classifier.
lgbm = LGBMClassifier(n_estimators=100)
RF_Model = make_pipeline(preprocessor_best, lgbm)

## Grid Search

In [None]:
# Candidate tree counts: 50 evenly spaced values from 100 to 1000, truncated
# to integers.
n_estimators = np.linspace(start=100, stop=1000, num=50).astype(int).tolist()

# Candidate maximum tree depths: 10 evenly spaced values from 10 to 100.
max_depth = np.linspace(start=10, stop=100, num=10).astype(int).tolist()

# Not searched at the moment (left over from a random-forest setup):
# max_features, min_samples_split, min_samples_leaf, bootstrap.

In [None]:
# List the parameter names the pipeline exposes (the search grid is keyed by these names).
RF_Model.get_params().keys()

In [None]:
# Search space for the pipeline's final 'lgbmclassifier' step.
param_grid = dict(
    lgbmclassifier__n_estimators=n_estimators,
    lgbmclassifier__max_depth=max_depth,
)
print(param_grid)

In [None]:
from lightgbm import LGBMClassifier


In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Sample 5 parameter combinations, each scored by 3-fold cross-validated F1.
# random_state pins which combinations are drawn so reruns are comparable
# (same seed value as the train/test split).
rf_RandomGrid = RandomizedSearchCV(
    estimator=RF_Model,
    param_distributions=param_grid,
    n_iter=5,
    cv=3,
    scoring='f1',
    n_jobs=-1,
    verbose=1,
    random_state=0,
)

In [None]:
import pandas as pd

def clean_dataset(df):
    """Return a float64 copy of ``df`` without rows holding NaN or ±inf.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is left untouched; earlier versions dropped NaN
        rows from the caller's frame in place.

    Returns
    -------
    pandas.DataFrame
        The remaining rows (original index labels kept), cast to float64.

    Raises
    ------
    TypeError
        If ``df`` is not a DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df needs to be a pd.DataFrame")
    without_nan = df.dropna()
    # axis must be passed by keyword: pandas 2.x rejects the positional form.
    has_bad_value = without_nan.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return without_nan[~has_bad_value].astype(np.float64)

In [None]:
# Division-based features can hold +/-inf, which scikit-learn's input
# validation rejects. Map them to NaN so the imputers handle them, and do it
# for the hold-out split too, because it is scored with the same pipeline.
# Reassigning avoids an in-place edit of a frame produced by a split.
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)


In [None]:
# Run the randomized hyper-parameter search on the training split.
rf_RandomGrid.fit(X_train, y_train)

In [None]:
# Show the pipeline found by the search with the best-scoring parameters.
rf_RandomGrid.best_estimator_

## Score (F1, the metric the search was configured with)

In [None]:
# Report the search's score on both splits.
for label, features, labels in (('Train', X_train, y_train), ('Test', X_test, y_test)):
    print(f'{label} : {rf_RandomGrid.score(features, labels):.3f}')

## Gini Index

In [None]:
def gini(actual, pred):
    """Return the (un-normalised) Gini coefficient of ``pred`` against ``actual``.

    Rows are ordered by descending ``pred`` (ties keep their original order);
    the statistic is then computed from the cumulative sum of ``actual`` in
    that order.

    Parameters
    ----------
    actual : array-like
        Observed values (here: 0/1 labels).
    pred : array-like
        Scores used to rank the rows; same length as ``actual``.

    Raises
    ------
    ValueError
        If the two inputs differ in length.
    """
    if len(actual) != len(pred):
        raise ValueError("actual and pred must have the same length")
    n_rows = len(actual)
    # Builtin float replaces the np.float alias, which NumPy 1.24 removed.
    table = np.asarray(np.c_[actual, pred, np.arange(n_rows)], dtype=float)
    # Primary key: descending prediction; secondary key: original position.
    order = np.lexsort((table[:, 2], -1 * table[:, 1]))
    ranked_actual = table[order, 0]
    total_losses = ranked_actual.sum()
    gini_sum = ranked_actual.cumsum().sum() / total_losses

    gini_sum -= (n_rows + 1) / 2.
    return gini_sum / n_rows


def gini_normalized(actual, pred):
    """Return ``gini(actual, pred)`` divided by the Gini of a perfect ranking."""
    return gini(actual, pred) / gini(actual, actual)

In [None]:
# Gini ranks rows by their score, so it needs the positive-class probability;
# hard 0/1 labels from predict() collapse the ranking to two tied groups.
actual_train = y_train
pred_train = rf_RandomGrid.predict_proba(X_train)[:, 1]
actual_test = y_test
pred_test = rf_RandomGrid.predict_proba(X_test)[:, 1]

In [None]:
# Report the un-normalised Gini for each split.
for label, observed, predicted in (
    ('Train', actual_train, pred_train),
    ('Test', actual_test, pred_test),
):
    print(f'Gini {label} : {gini(observed, predicted):.3f}')

## Submission 

In [None]:
# Select the raw predictor columns by the list captured before any encoding
# (X's own columns may include dummy columns that `test` does not have), and
# map +/-inf to NaN so the pipeline's imputers accept the frame.
submission_features = test[all_features].replace([np.inf, -np.inf], np.nan)
test_pred = rf_RandomGrid.predict_proba(submission_features)[:, 1]

In [None]:
# RefId was dropped during feature preparation, so the engineered test file
# normally lacks it. Fall back to the raw competition file, whose row order
# the engineered file preserves.
if 'RefId' in test.columns:
    ref_ids = test['RefId']
else:
    ref_ids = pd.read_csv('../input/DontGetKicked/test.csv', usecols=['RefId'])['RefId']

AllSub = pd.DataFrame({
    'RefId': ref_ids,
    'IsBadBuy': test_pred,
})

In [None]:
# Convert the predicted probability into a 0/1 label using a 0.09 cut-off.
AllSub['IsBadBuy'] = (AllSub['IsBadBuy'] > 0.09).astype(int)

In [None]:
# Write the submission file without the index column.
AllSub.to_csv('DGK_RF_Pipe_BetterPipe1.csv', index = False)

In [None]:
from sklearn.feature_selection import RFE # Recursive Feature Selection
from sklearn.feature_selection import RFECV # Recursive Feature Selection with Cross Validation

In [None]:
# Initializing Random Forest Classifier
# Initializing the RFE object, one of the most important arguments is the estimator, in this case is RandomForest
