

## Feature Engineering

We will add some more features.

In [2]:
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from scipy.special import inv_boxcox, boxcox1p

# supress unnecessary warnings for readability and cleaner presentation
warnings.filterwarnings('ignore')

%matplotlib inline
plt.style.use('bmh')

In [3]:
# helper function which performs all steps of data processing which we applied during 
# Data cleaning and preprocessing
def read_clean_data():
    
    # reading data
    df_train = pd.read_csv('use_case_data.csv')
    df_score = pd.read_csv('score_data.csv')

    df_train['data_type'] = 'train'
    df_score['data_type'] = 'score'
    
    # removing negative market shares
    temp = df_train[df_train['Market_Share'] >= 0]
    
    # applying boxcox
    market_share = stats.boxcox(temp['Market_Share'].values + 1e-30, lmbda=0.25)
    
    # combining train and score
    data = pd.concat([temp.drop(columns='Market_Share'), df_score],ignore_index=True)
    
    # adding response variable
    data['Market_Share'] = np.nan
    mask = data['data_type'] == 'train'
    data.loc[mask, 'Market_Share'] = market_share
    data.drop(columns='data_type', inplace=True)
    
    # changing dtype to category
    data['ITEMSCODE'] = data['ITEMSCODE'].astype('category')

    # spliting to year and month, adding to not_useful_features list
    data['LAUNCH_YEAR'], data['LAUNCH_MONTH'] = data['NPLLAUNCHDATE'].map(str).apply(lambda x: [x[:4], x[4:]]).str   
    
    # removing not useful features
    not_useful = ['BRMID', 'LATESTPERIODINDEX', 'NPLLAUNCHDATE', 'ISREPLACEMENT', 'BRM', 'MARKETEDBRAND',
                  'BRANDSUBFAMILY', 'NPLLAUNCHYEAR', 'RTYPE', 'ITEMSHAPE']
    
    data.drop(columns=not_useful, inplace=True)
    
    # filling missing values
    data['SPECIALFLAVOR'].fillna('NOSPECIALFLAVOR', inplace=True)
    data['TIPCOLOR'].fillna('NOTIPCOLOR', inplace=True)
    
    # transform categorical features into the appropriate type
    for c in data.columns:
        col_type = data[c].dtype
        if col_type == 'object' or col_type.name == 'category':
            data[c] = data[c].astype('category')
            
    # transforming with boxcox1p for reducing skew
    data[['LEN', 'NCON', 'RETAILPACKPRICE']] = boxcox1p(data[['LEN', 'NCON', 'RETAILPACKPRICE']], -0.25)

    print('all data shape: {}'.format(data.shape))
    return data

We will add some more features by aggregating some categorical features by numeric ones.

For example, we will calculate **mean, median, std, skew** of **RETAILPACKPRICE** for each **REGION**. 

In [4]:
def aggregate(data, f_1, f_2):
    """Aggregates f_1 by f_2"""

    mean_ = '{}_{}_mean'.format(f_1, f_2)
    median_ = '{}_{}_median'.format(f_1, f_2)
    std_ = '{}_{}_std'.format(f_1, f_2)
    skew_ = '{}_{}_skew'.format(f_1, f_2)
    
    # aggregation of f_1 by f_2
    temp = data.groupby(f_1)[f_2].aggregate({
        mean_: np.mean,
        std_: np.std,
        median_: np.median,
        skew_: stats.skew
    }).reset_index()
    
    # filling nans with 0
    for x in (mean_, std_, median_, skew_):
        temp[x].fillna(0, inplace=True)
        
    data = data.merge(temp)
    
    # adding relative (for example product RETAILPACKPRICE / REGION mean RETAILPACKPRICE)
    relative_mean_ = '{}_{}_relative_mean'.format(f_1, f_2)
    relative_median_ = '{}_{}_relative_median'.format(f_1, f_2)
    
    def relative(row, f):
        f_value, f_2_value = row[f], row[f_2]
        if f_value == 0 and f_2_value == 0:
            return 1.
        elif f_value == 0 and f_2_value != 0:
            return -99999
        else:
            return f_2_value / f_value
        
#     data[relative_mean_] = data.apply(lambda row: relative(row, mean_), axis=1)
    data[relative_median_] = data.apply(lambda row: relative(row, median_), axis=1)
    return data

In [5]:
def feature_engineering(data, to_drop=None, to_aggregate=None, to_dummy=False, cardinality_ratio=1., ):
    """
    Recives cleaned and preprocessed data, add features, high cardinality features and makes one-hot encoding.
    """
    temp = data.copy()

    # removing high cardinality categorical features
    if cardinality_ratio < 1.:
        n = len(temp)
        high_cardinalty_features = []
        for c in temp.columns:
            if temp[c].dtype.name == 'category' and len(temp[c].unique()) / n > cardinality_ratio:
                high_cardinalty_features.append(c)
        print('Removing {} features, which have high cardinality.'.format(high_cardinalty_features))
        temp.drop(columns=high_cardinalty_features, inplace=True)
    
    # aggergation
    numeric_fs = [c for c in temp.columns if temp[c].dtype.name != 'category' and c != 'Market_Share']
    for f_1 in to_aggregate:
        for f_2 in numeric_fs:
            temp = aggregate(temp, f_1, f_2)
    
    # adding also **2, **3 and **0.5 for LEN and RETAILPACKPRICE
    for x in ['LEN', 'RETAILPACKPRICE']:
        temp['{}_**2'.format(x)] = temp[x] ** 2
        temp['{}_**3'.format(x)] = temp[x] ** 2
        temp['{}_**0.5'.format(x)] = np.sqrt(np.abs(temp[x]))        
    print('all data shape after features addition: {}'.format(temp.shape))
    
    # dropping some features
    if to_drop:
        temp.drop(columns=to_drop, inplace=True)
        print('all data shape after features deletion: {}'.format(temp.shape))
    

    # one-hot encoding for categorical features
    if to_dummy:
        temp = pd.get_dummies(temp)
        print('all data shape after one-hot-encoding: {}'.format(temp.shape))

    # splitting into train and score
    mask = temp['Market_Share'].notnull()
    training_data, score_data = temp[mask], temp[np.invert(mask)]
    score_data.drop(columns='Market_Share', inplace=True)
    print('training data shape: {}, score data shape: {}'.format(
        training_data.shape, score_data.shape))
    return training_data, score_data

## Evaluation metrics

In [6]:
import itertools
from sklearn.metrics import mean_squared_error, explained_variance_score, r2_score
from sklearn.metrics import roc_auc_score, roc_curve, auc, confusion_matrix
from sklearn.base import clone
from sklearn.model_selection import KFold, train_test_split
from tabulate import tabulate

In [7]:
def plot_confusion_matrix(matrix, classes, cmap=plt.cm.Reds):
    """This function plots the normalized confusion matrix."""
    matrix = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]

    plt.imshow(matrix, interpolation='nearest', cmap=cmap)
    plt.title('Normalized Confusion matrix', fontsize=17)
    cb = plt.colorbar()
    cb.ax.tick_params(labelsize=15)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, fontsize=12)
    plt.yticks(tick_marks, classes, fontsize=12)

    fmt = '.2f'
    thresh = matrix.max() / 2.
    for i, j in itertools.product(range(matrix.shape[0]),range(matrix.shape[1])):
        plt.text(j, i, format(matrix[i, j], fmt),
                 horizontalalignment="center", fontsize=17,
                 color="blue" if matrix[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label', fontsize=12)
    plt.xlabel('Predicted label', fontsize=12)
    plt.show()
    

def plot_model_performance(y_true, y_pred, matrix):
    """
    This function plots:
        - predicted values vs actual values
        - confusion matrix
        - roc curve
    """
    print()
    
    # plotting predicted values vs actual ones
    plt.figure(figsize=(10, 10))
    plt.scatter(y_true, y_pred)
    plt.title('Actual values vs Predicted values.')
    plt.ylabel('Predicted', fontsize=12)
    plt.xlabel('Actual', fontsize=12)
    plt.show()
              
    print()
    
    # plotting confusion matrix
    plot_confusion_matrix(matrix, ['Failure', 'Success'])
    
    print()
    threshold = 0.007
    threshold_transformed = stats.boxcox(threshold, 0.25)
    binary_target = y_true > threshold_transformed
    
    fpr, tpr, _ = roc_curve(binary_target, y_pred)
    auc_score = auc(fpr, tpr)
    
    plt.figure(figsize=(10, 10))
    plt.plot(fpr, tpr, lw=3, alpha=0.3, label='ROC (AUC = %0.2f)' % (auc_score))
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.title('ROC curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")
    plt.show()

In [8]:
def scorer(y_true, y_pred):
    """Calculates differnt merics for evaluating our models."""

    threshold = 0.007
    threshold_transformed = stats.boxcox(threshold, 0.25)

    binary_prediction = y_pred > threshold_transformed
    binary_target = y_true > threshold_transformed

    """Regression metrics"""
    
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    explained_variance = explained_variance_score(y_true, y_pred)
    r_2 = r2_score(y_true, y_pred)

    """Classification metrics"""

    # The probability that the model ranks a random
    # positive example more highly than a random negative example.
    auc = roc_auc_score(binary_target, y_pred)

    conf_matrix = confusion_matrix(binary_target, binary_prediction)
    # true positives (TP): We predicted Y and they do have the disease.
    # true negatives (TN): We predicted N, and they don't have the disease.
    # false positives (FP): We predicted Y, but they don't have the disease.
    # false negatives (FN): We predicted N, but they do have the disease.
    tn, fp, fn, tp = conf_matrix.ravel()

    # correct labels
    total = tn + fp + fn + tp
    actual_yes = fn + tp
    actual_no = total - actual_yes

    # Overall, how often is the classifier correct?
    accuracy = (tp + tn) / total

    # Overall, how often is it wrong?
    misclassification_rate = (fp + fn) / total

    # When it is actually Y, how often does it predict Y?
    # Probability that a test result will be positive when the disease is present.
    # Recall
    tp_rate = tp / actual_yes

    # When it is actually N, how often does it predict N?
    # Probability that a test result will be negative when the disease is not present.
    specificity = tn / actual_no

    # When it is actually N, how often does it predict Y?
    # 1 - specificity
    fp_rate = fp / actual_no

    # When it predicts Y, how often is it correct?
    # Probability that the disease is present when the test is positive.
    # Positive Predictive Value or precision
    pp_value = tp / (fp + tp)

    # When it predicts N, how often is it correct?
    # Probability that the disease is not present when the test is negative.
    # Negative Predictive Value
    np_value = tn / (tn + fn)

    # The weighted average of recall and precision.
    f_score = 2 * tp_rate * pp_value / (tp_rate + pp_value)

    return {
        'rmse': rmse,
        'explained_variance': explained_variance,
        'r_2': r_2,
        'confusion_matrix': conf_matrix,
        'accuracy': accuracy,
        'tpr': tp_rate,
        'specificity': specificity,
        'fpr': fp_rate,
        'ppv': pp_value,
        'npv': np_value,
        'f_score': f_score,
        'auc': auc,
    }



**5 fold CV strategy: we will validate our models and tune hyperparameters by 5 fold CV**

In [9]:
def cv_mean_score(fold_scores):
    """Calculates model mean score based on list of each folds scores."""
    if not fold_scores:
        return
    keys = list(fold_scores[0].keys())
    data = {k: [x[k] for x in fold_scores] for k in keys if k != 'confusion_matrix'}
    return {k: (np.mean(v), np.std(v)) for k, v in data.items()}   

def cv(model, train_data, train_y, n_folds=5):
    """Helper function for doing cross validation and collecting metrics."""

    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

    training_metrics, validatin_metrics = [], []
    for i, (train_ind, valid_ind) in enumerate(kf.split(train_data)):
        model_ = clone(model)
        model_.fit(train_data.iloc[train_ind], train_y.iloc[train_ind])

        train_y_pred = model_.predict(train_data.iloc[train_ind])
        t_metrics = scorer(train_y.iloc[train_ind].values, train_y_pred)

        valid_y_pred = model_.predict(train_data.iloc[valid_ind])
        v_metrics = scorer(train_y.iloc[valid_ind].values, valid_y_pred)

        print('\nfold: {}\n'.format(i + 1))
        tabular_metrics = [[k, t_metrics[k], v_metrics[k]]
                           for k in t_metrics.keys()
                           if k != 'confusion_matrix']
        
        print(tabulate(tabular_metrics,
                       headers=['metric_name', 'training_set', 'validation_set'],
                       tablefmt="fancy_grid",
                       floatfmt=",.3f"))

        training_metrics.append(t_metrics)
        validatin_metrics.append(v_metrics)

    mean_t_score = cv_mean_score(training_metrics)
    mean_v_score = cv_mean_score(validatin_metrics)

    tabular_mean_metrics = [
        [k, mean_t_score[k][0], mean_t_score[k][1], mean_v_score[k][0], mean_v_score[k][1]]
        for k in mean_t_score.keys()]

    tabular_mean_metrics = tabulate(
        tabular_mean_metrics,
        headers=['metric_name', 'train: mean', 'train: std', 'valid: mean', 'valid: std'],
        tablefmt="fancy_grid",
        floatfmt=",.3f")
    return tabular_mean_metrics

In [10]:
def evaluate_model(model, data, y, training=True):
    """Evaluates model on train or test data."""
    
    if training:
        # fitting model before prediction
        model.fit(data, y)
    
    # predicition
    y_pred = model.predict(data)
    
    metrics = scorer(y, y_pred)
    plot_model_performance(y, y_pred, metrics.pop('confusion_matrix'))
    for k, v in metrics.items():
        print('{}: {:.3f}'.format(k, v))
    
    return metrics    



## Modeling

**Import librairies**

In [11]:
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler

import lightgbm as lgb

**Preparing data: data clening, preprocessing and adding  additional features, one-hot-encoding**

In [12]:
df = read_clean_data()

# split into numeric and categorical
numeric_fs = [c for c in df.columns if df[c].dtype.name != 'category' and c != 'Market_Share']
categoircal_fs = list(set(df.columns) - set(numeric_fs) - {'Market_Share'})

all data shape: (1721, 32)


In [13]:
# categorical features, which will be aggregated
to_aggregate = [
    'TCLASS',
    'REGION',
    'LOCALCLASS',
    'BLDIMAGE', 
    'MARKET',
    'MINDICATOR', 
    'THICATEGORY', 
    'PCKT',
]

# categorical features, which will be dropped
to_drop = [
    'BRANDSUBFAMILYGROUPING', 
    'BRANDSUBFAMILYLINE',
    'BRANDSUBFAMILYGROUP', 
    'BRANDONMARKET',
    'BRANDDIFFERENTIATOR',
]

training_df, score_df = feature_engineering(df, 
                                            to_drop=to_drop, 
                                            to_aggregate=to_aggregate,
                                            to_dummy=True, 
                                            cardinality_ratio=1.)

numeric_fs = [c for c in training_df.columns if training_df[c].dtype.name != 'category' and c != 'Market_Share']
categoircal_fs = list(set(training_df.columns) - set(numeric_fs) - {'Market_Share'})

all data shape after features addition: (1721, 198)
all data shape after features deletion: (1721, 193)
all data shape after one-hot-encoding: (1721, 517)
training data shape: (1692, 517), score data shape: (29, 516)


**Train - Test split: we will keep 15% of data for final testing**

In [14]:
train, test = train_test_split(training_df, test_size=0.15, shuffle=True)
print(train.shape, test.shape)

(1438, 517) (254, 517)


In [15]:
# separating response variable
y_train, X_train = train['Market_Share'], train.drop(columns='Market_Share')
y_test, X_test = test['Market_Share'], test.drop(columns='Market_Share')

**Base models and their scores on 5 fold CV:**

As models we have selected 4 models: Lasso regression, Elastic Net, ExtraTress regression and Gradient tree boosting. First two models are more linear than other two. First 2 models are modifications of linear regession with additional constraints on regression coefficients. The last 2 models are more no-linear models based on decision trees.

In [24]:
all_models_scores = {}

* **LASSO Regression :**

This model may be very sensitive to outliers. So we need to made it more robust on them. For that we use the sklearn's **Robustscaler()** method on pipeline.

In [25]:
model_lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0003, max_iter=1e3, tol=1e-5, random_state=1))
score = cv(model_lasso, X_train, y_train)
all_models_scores['lasso'] = score
print('\nLasso score:\n')
print(score)


fold: 1

╒════════════════════╤════════════════╤══════════════════╕
│ metric_name        │   training_set │   validation_set │
╞════════════════════╪════════════════╪══════════════════╡
│ rmse               │          0.208 │            0.290 │
├────────────────────┼────────────────┼──────────────────┤
│ explained_variance │          0.566 │            0.186 │
├────────────────────┼────────────────┼──────────────────┤
│ r_2                │          0.566 │            0.184 │
├────────────────────┼────────────────┼──────────────────┤
│ accuracy           │          0.858 │            0.833 │
├────────────────────┼────────────────┼──────────────────┤
│ tpr                │          0.437 │            0.306 │
├────────────────────┼────────────────┼──────────────────┤
│ specificity        │          0.959 │            0.941 │
├────────────────────┼────────────────┼──────────────────┤
│ fpr                │          0.041 │            0.059 │
├────────────────────┼────────────────┼───────

* **Elastic Net Regression :**

again made robust to outliers

In [26]:
model_enet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=0.8, random_state=3))
score = cv(model_enet, X_train, y_train)
all_models_scores['elastic_net'] = score
print('\nElastic Net:\n')
print(score)


fold: 1

╒════════════════════╤════════════════╤══════════════════╕
│ metric_name        │   training_set │   validation_set │
╞════════════════════╪════════════════╪══════════════════╡
│ rmse               │          0.214 │            0.291 │
├────────────────────┼────────────────┼──────────────────┤
│ explained_variance │          0.541 │            0.182 │
├────────────────────┼────────────────┼──────────────────┤
│ r_2                │          0.541 │            0.181 │
├────────────────────┼────────────────┼──────────────────┤
│ accuracy           │          0.854 │            0.833 │
├────────────────────┼────────────────┼──────────────────┤
│ tpr                │          0.405 │            0.306 │
├────────────────────┼────────────────┼──────────────────┤
│ specificity        │          0.961 │            0.941 │
├────────────────────┼────────────────┼──────────────────┤
│ fpr                │          0.039 │            0.059 │
├────────────────────┼────────────────┼───────

* **Extra Trees Regressor:**

In [27]:
model_extratree = ExtraTreesRegressor(
    n_estimators=200,
    max_depth=50,
    min_samples_split=2, 
    min_samples_leaf=2,
    min_weight_fraction_leaf=0.0002,
    max_leaf_nodes=None,
    max_features='auto',
    min_impurity_decrease=0.00,
    n_jobs=2
    
)

score = cv(model_extratree, X_train, y_train)
all_models_scores['extra_tree'] = score
print('\nExtraTree:\n')
print(score)


fold: 1

╒════════════════════╤════════════════╤══════════════════╕
│ metric_name        │   training_set │   validation_set │
╞════════════════════╪════════════════╪══════════════════╡
│ rmse               │          0.064 │            0.255 │
├────────────────────┼────────────────┼──────────────────┤
│ explained_variance │          0.958 │            0.372 │
├────────────────────┼────────────────┼──────────────────┤
│ r_2                │          0.958 │            0.372 │
├────────────────────┼────────────────┼──────────────────┤
│ accuracy           │          0.977 │            0.844 │
├────────────────────┼────────────────┼──────────────────┤
│ tpr                │          0.928 │            0.306 │
├────────────────────┼────────────────┼──────────────────┤
│ specificity        │          0.989 │            0.954 │
├────────────────────┼────────────────┼──────────────────┤
│ fpr                │          0.011 │            0.046 │
├────────────────────┼────────────────┼───────

* **LightGBM :**

In [28]:
model_lgb = lgb.LGBMRegressor(
    boosting_type='gbdt',
    
    objective='rmse',
    n_estimators=5000,
    learning_rate=0.03,
    
    max_depth=4,             # Specify the max depth to which tree will grow. 
    num_leaves=15,             # Number of leaves in one tree

    min_child_weight=11,     # minimal sum hessian in one leaf
    min_data_in_leaf=20,       # Min number of data in one leaf.
    
    subsample=0.75,            # Specifies the fraction of data to be used for each iteration
    subsample_freq=5,
    bagging_seed=9,
    
    colsample_bytree=0.45,  # Specifies the fraction of features to be taken for each iteration
    feature_fraction_seed=7,
    
    min_gain_to_split=0.01,    # Min gain to perform splitting
    reg_alpha=0.00,
    reg_lambda=0.,
#     max_bin=55,
    drop_rate=0.3,
    max_drop=50,
    
    n_jobs=2,
    
)

score = cv(model_lgb, X_train, y_train)
all_models_scores['gradient_boosting'] = score
print('\nLGBM:\n')
print(score)


fold: 1

╒════════════════════╤════════════════╤══════════════════╕
│ metric_name        │   training_set │   validation_set │
╞════════════════════╪════════════════╪══════════════════╡
│ rmse               │          0.088 │            0.250 │
├────────────────────┼────────────────┼──────────────────┤
│ explained_variance │          0.923 │            0.393 │
├────────────────────┼────────────────┼──────────────────┤
│ r_2                │          0.923 │            0.393 │
├────────────────────┼────────────────┼──────────────────┤
│ accuracy           │          0.939 │            0.851 │
├────────────────────┼────────────────┼──────────────────┤
│ tpr                │          0.779 │            0.449 │
├────────────────────┼────────────────┼──────────────────┤
│ specificity        │          0.977 │            0.933 │
├────────────────────┼────────────────┼──────────────────┤
│ fpr                │          0.023 │            0.067 │
├────────────────────┼────────────────┼───────

**MEAN CV scores for each model:**

In [30]:
for k, v in all_models_scores.items():
    print('\n model: {} mean CV score'.format(k))
    print(v)
    print()


 model: lasso mean CV score
╒════════════════════╤═══════════════╤══════════════╤═══════════════╤══════════════╕
│ metric_name        │   train: mean │   train: std │   valid: mean │   valid: std │
╞════════════════════╪═══════════════╪══════════════╪═══════════════╪══════════════╡
│ rmse               │         0.212 │        0.004 │         0.271 │        0.015 │
├────────────────────┼───────────────┼──────────────┼───────────────┼──────────────┤
│ explained_variance │         0.551 │        0.012 │         0.267 │        0.078 │
├────────────────────┼───────────────┼──────────────┼───────────────┼──────────────┤
│ r_2                │         0.551 │        0.012 │         0.266 │        0.078 │
├────────────────────┼───────────────┼──────────────┼───────────────┼──────────────┤
│ accuracy           │         0.860 │        0.005 │         0.832 │        0.024 │
├────────────────────┼───────────────┼──────────────┼───────────────┼──────────────┤
│ tpr                │         0.398

Some insights from scores: 

* As excpected **Lasso** and **ElasticNet** performance are almost similar. Besides this their performance is also low compared to DecisionTree models, which is also expected as we investigated that our data mostly non linear.

* If we compare ExtraTrees and GradientBoosting then we can notice that their performances are comparable: mean validation RMSE is almost equal and AUC is almost equal. We see some difference for tpr (true positive ratio - When it is actually Y, how often does it predict Y?) and fpr (false positive ratio - When it is actually N, how often does it predict Y?) metrics for given 0.7% threshold. As we can see both are bigger for GradientBoosting based model (tpr: 0.448 vs 0.368, fpr: 0.074 vs 0.050). So what does it mean? We can assume that GradientBoosting based model learned more from data than ExtraTrees, so it can differentiate more succesful launches, but together with that the false positive ratio is a bit more 2.4%, which is not good from bussiness prospective. So GradientBoosting based models pays fpr increase cost (by 2.4%) for having higher ptr by 8.0%. 


In [1]:
evaluate_model(model_lgb, X_train, y_train, training=True)
feat_imp = pd.Series(model_lgb.feature_importances_, index=X_train.columns)
feat_imp.nlargest(30).plot(kind='barh', figsize=(8,10))

NameError: name 'evaluate_model' is not defined

In [None]:
evaluate_model(model_lgb, X_test, y_test, training=False)

In [None]:
for model in models:
    evaluate_train_test(model, X_train, y_train, X_test, y_test)
    print()

In [None]:
feat_imp = pd.Series(model_extratree.feature_importances_, index=X_train.columns)
feat_imp.nlargest(30).plot(kind='barh', figsize=(8,10))

In [None]:
feat_imp = pd.Series(model_lgb.feature_importances_, index=X_train.columns)
feat_imp.nlargest(30).plot(kind='barh', figsize=(8,10))

In [None]:
feat_imp = pd.Series(model_xgb.feature_importances_, index=X_train.columns)
feat_imp.nlargest(30).plot(kind='barh', figsize=(8,10));

In [None]:
from hyperopt import hp, tpe
from hyperopt.fmin import fmin

In [None]:
def objective(params):
    params = {
        'max_depth': int(params['max_depth']),
        'num_leaves': int(params['num_leaves']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'min_split_gain': '{:.4f}'.format(params['min_split_gain']),
        'drop_rate': '{:.4f}'.format(params['drop_rate'])
        
    }

    regressor = lgb.LGBMRegressor(
    boosting_type='dart',
    
    objective='rmse',
    n_estimators=5000,
    learning_rate=0.3,
    
#     max_depth=4,             # Specify the max depth to which tree will grow. 
#     num_leaves=10,             # Number of leaves in one tree

    min_child_weight=11,     # minimal sum hessian in one leaf
    min_data_in_leaf=20,       # Min number of data in one leaf.
    
    subsample=0.75,            # Specifies the fraction of data to be used for each iteration
    subsample_freq=5,
    bagging_seed=9,
    
#     colsample_bytree=0.45,  # Specifies the fraction of features to be taken for each iteration
    feature_fraction_seed=7,
    
#     min_gain_to_split=0.002,    # Min gain to perform splitting
    reg_alpha=0.001,
    reg_lambda=0.,
#     max_bin=55,
#     drop_rate=0.3,
    max_drop=50,
    
    n_jobs=2,
    **params
    )

    score = cv(regressor, X_train, y_train)
    
    print()
    print(params)
    print(score)
    return float(score.splitlines()[3].split()[-4])


space = {
    'max_depth': hp.quniform('max_depth', 4, 6, 1),
    'num_leaves': hp.quniform('num_leaves', 16, 64, 8),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 0.8),
    'min_split_gain': hp.uniform('min_split_gain', 0.0018, 0.0022),
    'drop_rate': hp.quniform('drop_rate', 0.2, 0.4, 0.05),
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=30)

In [None]:
float(score.splitlines()[3].split()[-4])

In [None]:
best

In [None]:
best['num_leaves'] = 128