# This notebook got ~0.8456 in Feb TPS using the Featurewiz library (see below)
##  Turn on the GPU Accelerator in this Notebook to get the fastest Results below using XGBoost

## Goal: Use Featurewiz to build a better ranking model in TPS
1.  Big_Mart Sales Prediction Score: 1147  -- Rank 250 out of 41,361 = That's a Top <1% Rank!!
1.  Loan Status Predictions Score 0.791  -- Rank 850 out of 67,424 - Top 1.25% Rank
1.  Machine Hack Flight Ticket Score 0.9389 -- Rank 165 out of 2723 - Top 6% Rank!
1.  Machine Hack Data Scientist Salary class Score 0.417 -- Rank 58 out of 1547 - Top 3.7% Rank! (Autoviml Score was 0.329 -- less than 0.417 of Featurewiz+Simple even though an NLP problem!)
1.  MCHACK Book Price NLP Score 0.7336 -- Rank 104 Autoviml NLP problem and should have done better

In [None]:
import pandas as pd
import numpy as np
import matplotlib.dates as md
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import host_subplot
import mpl_toolkits.axisartist as AA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.covariance import EllipticEnvelope
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

# Install Featurewiz to perform feature engineering and selection

In [None]:
!pip install featurewiz

In [None]:
from featurewiz import FE_kmeans_resampler, FE_find_and_cap_outliers, EDA_find_outliers
from featurewiz import FE_convert_all_object_columns_to_numeric, split_data_n_ways, FE_create_categorical_feature_crosses
from featurewiz import FE_create_time_series_features, FE_concatenate_multiple_columns
from featurewiz import simple_XGBoost_model
import featurewiz as FW

In [None]:
# Notebook-wide pandas display settings: up to 500 columns are shown before
# truncation, and floats are rendered with exactly three decimals.
pd.set_option('display.max_columns', 500)
pd.set_option('display.float_format', '{:.3f}'.format)

### Add my "Utility Script" named Load_kaggle from the File Menu above.###

In [None]:
from load_kaggle import load_kaggle

In [None]:
# load_kaggle() comes from the author's utility script; its three return values
# are bound to subm, train and test (presumably the sample submission plus the
# competition train/test frames -- confirm against the utility script).
subm, train, test = load_kaggle()
# Quick sanity check on the sizes of the two frames, then a peek at the first rows.
print(train.shape, test.shape)
train.head(3)

In [None]:
# Column roles: the label to predict, the identifier columns to leave out,
# and every remaining column of the test frame as a model input.
target = 'target'
idcols = ['id']
# Disabled target transforms (kept for reference):
#df[target] = (df[target] - np.mean(df[target]))/np.std(df[target])
#train[target] = np.log(train[target].values)
features = [col for col in test.columns if col not in idcols]

In [None]:
# Keep only the model inputs plus the label (this drops the id columns).
train = train.loc[:, features + [target]]
# df is an independent deep copy of train taken at this point.
df = train.copy()
print(train.shape)
train.head(1)

In [None]:
# Distribution of the label, plotted from the df copy of the training frame.
df[target].hist()

In [None]:
# Collapse the low tail of the label: every row whose target is <= 4 gets its
# target set to 3.2.
# The column selector is essential. Without it, .loc assigns 3.2 to *every*
# column of the matching rows, overwriting the categorical and continuous
# features of those rows as well as the label.
train.loc[train[target] <= 4, target] = 3.2

# Just use this one line of code to get ~0.8456 score in ~2 mins! But it can be improved using AutoViz insights (see below)

In [None]:
# Baseline: featurewiz's XGBoost helper on the original feature list, run as a
# regression on the raw (non-logged) target with GPU enabled, standard scaling
# and 'glmm' encoding.
y_preds = FW.simple_XGBoost_model(
    X_XGB=train[features],
    Y_XGB=train[target],
    X_XGB_test=test[features],
    modeltype='Regression',
    log_y=False,
    GPU_flag=True,
    scaler=StandardScaler(),
    enc_method='glmm',
    verbose=0,
)

In [None]:
# The helper's result is a pair: the first item is kept as y_preds1 (used as the
# test predictions when building the submission), the second is named model.
y_preds1, model = y_preds

In [None]:
### Base model above with no feature engg gets you ~0.88 score which is a very nice score.
# NOTE(review): the notebook title quotes ~0.8456 for this one-line model --
# confirm which of the two figures is current.
# Store the baseline predictions in the subm frame and write it to disk
# without the index column.
subm[target] = y_preds1
subm.to_csv('submission.csv',index=False)
subm.head()

# Let's use Auto_ViML with GPU to see if we can do better

In [None]:
!pip install autoviml

In [None]:
from autoviml.Auto_ViML import Auto_ViML

## Use AutoViz to gain some insights - here's what I learnt from looking at AutoViz charts


In [None]:
#!pip install autoviz

In [None]:
#from autoviz.AutoViz_Class import AutoViz_Class
#AV = AutoViz_Class()
#filename = ""
#sep = ","
#dft = AV.AutoViz(
#    filename,
#    sep=",",
#    depVar=target,
#    dfte=train,
#    header=0,
#    verbose=0,
#    lowess=False,
#    chart_format="svg",
#    max_rows_analyzed=30000,
#    max_cols_analyzed=30,
#)

## AutoViz tells us to do the following using Featurewiz ####

Tabular Playground Series - Feb 2021

1. numeric interaction vars and then bin them
('cont1','cont4'), ('cont4','cont6'),('cont4','cont13')

2. bin the following:
'cont1': 8, 'cont2':5, 'cont4':3,    'cont12':2, 'cont13':2, 

3. interaction cat vars - feature crosses


4. groupby vars
'cont5' by 'cat4', 'cont2' by 'cat5', 'cont7' by 'cat5'


5. log transform these
'cont7':'log', 'cont4':'log',

In [None]:
### Step 1: pairs of numeric columns to combine into interaction variables ###
intxn_vars = [
    ('cont1', 'cont4'),
    ('cont4', 'cont6'),
    ('cont4', 'cont13'),
]

In [None]:
# Build the interaction variables for each pair in intxn_vars, applying the
# same call to train and then to test so both frames get the same new columns.
train = FW.FE_create_interaction_vars(
    train,
    intxn_vars,
)
test = FW.FE_create_interaction_vars(
    test,
    intxn_vars,
)
train.head(2)

In [None]:
### The newly created interaction variables are binned into 4 or 6 buckets; 6 is used for now.
# They are picked up positionally, as the last three columns of train.
intx_cols = train.columns[-3:].tolist()
intx_dict = {col: 6 for col in intx_cols}
train, test = FW.FE_discretize_numeric_variables(
    train,
    intx_dict,
    test=test,
    strategy='gaussian',
)
print(train.shape, test.shape)
train.head(1)

In [None]:
# Predictor list: every column of the test frame except the id columns.
preds = [col for col in test.columns if col not in idcols]
len(preds)

In [None]:
# Refit the same XGBoost helper, now on the predictor list that includes the
# binned interaction variables; every other setting matches the baseline run.
y_preds = simple_XGBoost_model(
    X_XGB=train[preds],
    Y_XGB=train[target],
    X_XGB_test=test[preds],
    modeltype='Regression',
    log_y=False,
    GPU_flag=True,
    scaler=StandardScaler(),
    enc_method='glmm',
    verbose=0,
)

In [None]:
# Split the helper's two-item result; this rebinds y_preds1 and model,
# replacing the values from the previous run.
y_preds1, model = y_preds

### The CV scores are less with the new features, so it is not worth adding them.
### Let us discard the new interaction variables and go back to the old train, test data.

In [None]:
# Rebind subm, train and test to whatever load_kaggle() returns. Every in-memory
# change made to train/test in earlier cells is therefore dropped from this
# point on.
subm, train, test = load_kaggle()
print(train.shape, test.shape)
train.head(3)

In [None]:
### Step 2: discretize selected numeric columns (strategy='gaussian').
# Maps each column to the number of bins requested for it.
bin_these = {
    'cont1': 8,
    'cont2': 5,
    'cont4': 3,
    'cont12': 2,
    'cont13': 2,
}
train, test = FW.FE_discretize_numeric_variables(
    train,
    bin_these,
    test=test,
    strategy='gaussian',
)
print(train.shape, test.shape)

In [None]:
# Predictor list after binning: all test columns apart from the id columns.
preds = [col for col in test.columns if col not in idcols]
len(preds)

In [None]:
# Fit the XGBoost helper on the frames that now carry the binned columns;
# settings are unchanged from the earlier runs.
output = simple_XGBoost_model(
    X_XGB=train[preds],
    Y_XGB=train[target],
    X_XGB_test=test[preds],
    modeltype='Regression',
    log_y=False,
    GPU_flag=True,
    scaler=StandardScaler(),
    enc_method='glmm',
    verbose=0,
)

In [None]:
# Split the helper's two-item result into y_preds1 and model, replacing the
# values from the previous run.
y_preds1, model = output

In [None]:
### The CV scores are not bad - let's keep these binned variables and add to them in next steps ##

In [None]:
### Step 3: feature crosses of the categorical columns cat4, cat5 and cat6 ###
# The same call is applied to train and then to test.
train = FW.FE_create_categorical_feature_crosses(
    train,
    ['cat4','cat5','cat6'],
)
test = FW.FE_create_categorical_feature_crosses(
    test,
    ['cat4','cat5','cat6'],
)
print(train.shape, test.shape)

In [None]:
# Predictor list after adding the feature crosses: all test columns but the ids.
preds = [col for col in test.columns if col not in idcols]
len(preds)

In [None]:
# Fit the XGBoost helper again with the categorical crosses included;
# settings are unchanged from the earlier runs.
y_preds = simple_XGBoost_model(
    X_XGB=train[preds],
    Y_XGB=train[target],
    X_XGB_test=test[preds],
    modeltype='Regression',
    log_y=False,
    GPU_flag=True,
    scaler=StandardScaler(),
    enc_method='glmm',
    verbose=0,
)

In [None]:
# Split the helper's two-item result into y_preds1 and model, replacing the
# values from the previous run.
y_preds1, model = y_preds

In [None]:
### Absolutely no improvement - but we will keep these vars as long as performance is same! ####

In [None]:
### Step 4: mean/std aggregates of selected numerics, grouped by categorical columns
agg_nums = ['cont5','cont7','cont2']
groupby_vars = ['cat5','cat4']
# Only the columns involved are handed to the helper; it returns one frame
# for train and one for test.
train_add, test_add = FW.FE_add_groupby_features_aggregated_to_dataframe(
    train[agg_nums+groupby_vars],
    agg_types=['mean','std'],
    groupby_columns=groupby_vars,
    ignore_variables=[],
    test=test[agg_nums+groupby_vars],
)

In [None]:
# Attach the aggregate columns to the full frames. The grouping and source
# columns are dropped from the add-on frames first, since train/test already
# hold them and join() would otherwise hit overlapping column names.
train_copy = train.join(train_add.drop(columns=groupby_vars + agg_nums))
test_copy = test.join(test_add.drop(columns=groupby_vars + agg_nums))
print(train_copy.shape, test_copy.shape)
train_copy.head(2)

In [None]:
# Predictor list taken from the augmented test frame, minus the id columns.
preds = [col for col in test_copy.columns if col not in idcols]
len(preds)

In [None]:
# Fit the XGBoost helper on the augmented frames. The label is still read from
# train, which train_copy was joined from.
output = simple_XGBoost_model(
    X_XGB=train_copy[preds],
    Y_XGB=train[target],
    X_XGB_test=test_copy[preds],
    modeltype='Regression',
    log_y=False,
    GPU_flag=True,
    scaler=StandardScaler(),
    enc_method='glmm',
    verbose=0,
)

In [None]:
# Split the helper's two-item result into y_preds1 and model, replacing the
# values from the previous run.
y_preds1, model = output

train,_ = FW.FE_split_one_field_into_many(train, field='Product', splitter='-', filler='missing')
test,_ = FW.FE_split_one_field_into_many(test, field='Product', splitter='-', filler='missing')
train.head(1)

train = FE_find_and_cap_outliers(train,[target], verbose=1)
#test = FE_find_and_cap_outliers(test,nums,verbose=0)

train = FE_create_time_series_features(train, 'Date')
test = FE_create_time_series_features(test, 'Date')
train.head(1)

In [None]:
###### Step 5: log-transform selected numeric columns ##########
# Maps each column to the transform requested for it.
log_cols = {
    'cont7': 'log',
    'cont4': 'log',
}
train_copy = FW.FE_transform_numeric_columns(
    train_copy,
    log_cols,
)
test_copy = FW.FE_transform_numeric_columns(
    test_copy,
    log_cols,
)
train_copy.head(2)

#### Lastly convert all object columns to numeric ############
train_copy, test_copy = FE_convert_all_object_columns_to_numeric(train_copy,test_copy)
print(train_copy.shape, test_copy.shape)
train_copy.head()

# Select the best features created using Featurewiz

In [None]:
# Run featurewiz on the engineered frames with `target` as the label. The two
# returned objects are bound to train_best and test_best (presumably the
# selected-feature outputs for train and test -- confirm against the featurewiz docs).
train_best, test_best = FW.featurewiz(train_copy, target, test_data=test_copy,verbose=2)

In [None]:
def left_subtract(l1,l2):
    """Return the items of ``l1`` that do not appear in ``l2``.

    The order of ``l1`` is preserved and duplicates in ``l1`` are kept.
    Membership is tested with ``in``, so ``l2`` may be any container.
    A new list is always returned; neither argument is modified.
    """
    return [item for item in l1 if item not in l2]


In [None]:
# Names of the columns in train_copy that still have object dtype.
object_frame = train_copy.select_dtypes(include="object")
cats = list(object_frame.columns)
len(cats)

In [None]:
# Hand-listed numeric columns to keep: raw continuous columns, the discretized
# columns, groupby std aggregates and the log-transformed columns.
sel_nums = [
    'cont0', 'cont1', 'cont2', 'cont3', 'cont5', 'cont6',
    'cont8', 'cont9', 'cont10', 'cont11',
    'cont1_discrete', 'cont2_discrete', 'cont4_discrete',
    'cont5_by_cat4_std', 'cont5_by_cat5_std',
    'cont7_by_cat4_std', 'cont2_by_cat4_std',
    'cont12_discrete', 'cont13_discrete',
    'cont7_log', 'cont4_log',
]
# Final predictor list: the numeric picks followed by the object-dtype columns.
preds = sel_nums + cats
print(len(preds))

In [None]:
### using reduced list of variables, the score actually drops 2% points! wow #######
# Same helper as before, but on the reduced predictor list and with
# enc_method='label' instead of 'glmm'.
y_preds = simple_XGBoost_model(
    X_XGB=train_copy[preds],
    Y_XGB=train[target],
    X_XGB_test=test_copy[preds],
    modeltype='Regression',
    log_y=False,
    GPU_flag=True,
    scaler=StandardScaler(),
    enc_method='label',
    verbose=0,
)

In [None]:
# Split the helper's two-item result into y_preds1 and model; this y_preds1 is
# the one written into the final submission frame.
y_preds1, model = y_preds

####
m, feats, trainm, testm = Auto_ViML(train_copy[preds+[target]], target, test_copy[preds],
                            sample_submission='',
                            scoring_parameter='', KMeans_Featurizer=False,
                            hyper_param='RS',feature_reduction=True,
                             Boosting_Flag=True, Binning_Flag=False,
                            Add_Poly=0, Stacking_Flag=True,Imbalanced_Flag=False,
                            verbose=1)

y_preds2 = testm['target_predictions'].values
y_preds2

In [None]:
# Build the submission frame from the id column(s) of test plus the latest
# predictions. The explicit .copy() makes subm an independent frame, so adding
# the prediction column is unambiguous and does not raise pandas'
# SettingWithCopyWarning.
subm = test[idcols].copy()
subm[target] = y_preds1
subm.head()

In [None]:
# Write the submission without the index column; the file name is the target
# column name followed by '_Feb_submission2.csv'.
subm.to_csv(target+'_Feb_submission2.csv',index=False)

# Autoviml got about 0.8746 in the Kaggle rankings. #######
###  This is slightly lower than 0.8845 that Autoviml got a month ago but it is about same as featurewiz
### The good news is that AutoviML and Featurewiz now produce results on a 300K dataset fast
### It takes less than 2 mins for Autoviml and Featurewiz to crunch this dataset! That's a huge leap.