In [8]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import math
from sklearn.model_selection import RandomizedSearchCV, KFold, StratifiedKFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
import prettytable
import lightgbm as lgbm
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
import datetime
import gc
from sklearn.pipeline import Pipeline

In [2]:
#https://www.kaggle.com/hiralmshah/reduce-memory-usage-trick-with-elo-merchant-data
def reduce_memory(dataframe, verbose =True):
    numerical = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    initial_memory = dataframe.memory_usage().sum()/1024**2
    for col in dataframe.columns:
        col_type = dataframe[col].dtypes
        if col_type == object:
            dataframe[col] = dataframe[col].astype('category')
        if col_type in numerical:
            c_min = dataframe[col].min()
            c_max = dataframe[col].max()
            if str(col_type)[:3] =='int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    dataframe[col] = dataframe[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    dataframe[col] = dataframe[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    dataframe[col] = dataframe[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    dataframe[col] = dataframe[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    dataframe[col] = dataframe[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    dataframe[col] = dataframe[col].astype(np.float32)
                elif c_min > np.finfo(np.float64).min and c_max < np.finfo(np.float64).max:
                    dataframe[col] = dataframe[col].astype(np.float64)
    final_memory = dataframe.memory_usage().sum()/ 1024**2
    if verbose:
        print('Memory usage reduced to {:5.2f} Mb({:.1f}% reduction)'.format(final_memory, (initial_memory-final_memory)/initial_memory*100))
    return dataframe

In [3]:
def train_featurization(train, test):
    #imputing missing value with mode
    test['first_active_month'].fillna(test['first_active_month'].mode()[0], inplace = True)
    #converting object datatype to datetime to extract more features
    train['first_active_month'] = pd.to_datetime(train['first_active_month'])
    test['first_active_month'] = pd.to_datetime(test['first_active_month'])
    #creating a new feature outlier based on loyalty score
    train['outliers'] = 0
    train.loc[train['target'] < -30, 'outliers'] =1
    #mean encoding categorical features
    for feature in ['feature_1', 'feature_2', 'feature_3']:
        mean_value = train.groupby(feature)['target'].mean()
        train[feature] = train[feature].map(mean_value)
        test[feature] = test[feature].map(mean_value)
    train['date_elapsed'] = (datetime.datetime.today() - train['first_active_month']).dt.days
    train['first_month'] = train['first_active_month'].dt.month
    train['first_year'] = train['first_active_month'].dt.year
    train['first_week'] = train['first_active_month'].dt.dayofweek
    train['first_quarter'] = train['first_active_month'].dt.quarter
    train['date_elapsed_feature_1'] = train['date_elapsed'] * train['feature_1']
    train['date_elapsed_feature_2'] = train['date_elapsed'] * train['feature_2']
    train['date_elapsed_feature_3'] = train['date_elapsed'] * train['feature_3']

    train['date_elapsed_feature_1_ratio'] = train['date_elapsed'] / train['feature_1']
    train['date_elapsed_feature_2_ratio'] = train['date_elapsed'] / train['feature_2']
    train['date_elapsed_feature_3_ratio'] = train['date_elapsed'] / train['feature_3']

    train['feature_sum'] = train['feature_1'] + train['feature_2'] + train['feature_3']
    train['feature_mean'] = train[['feature_1', 'feature_2', 'feature_3']].mean(axis = 1)
    train['feature_max'] = train[['feature_1', 'feature_2', 'feature_3']].max(axis = 1)
    train['feature_min'] = train[['feature_1', 'feature_2', 'feature_3']].min(axis = 1)
    train['feature_var'] = train[['feature_1', 'feature_2', 'feature_3']].std(axis = 1)
    
    test['date_elapsed'] = (datetime.datetime.today() - test['first_active_month']).dt.days
    test['first_month'] = test['first_active_month'].dt.month
    test['first_year'] = test['first_active_month'].dt.year
    test['first_week'] = test['first_active_month'].dt.dayofweek
    test['first_quarter'] = test['first_active_month'].dt.quarter
    
    test['date_elapsed_feature_1'] = test['date_elapsed'] * test['feature_1']
    test['date_elapsed_feature_2'] = test['date_elapsed'] * test['feature_2']
    test['date_elapsed_feature_3'] = test['date_elapsed'] * test['feature_3']

    test['date_elapsed_feature_1_ratio'] = test['date_elapsed'] / test['feature_1']
    test['date_elapsed_feature_2_ratio'] = test['date_elapsed'] / test['feature_2']
    test['date_elapsed_feature_3_ratio'] = test['date_elapsed'] / test['feature_3']

    test['feature_sum'] = test['feature_1'] + test['feature_2'] + test['feature_3']
    test['feature_mean'] = test[['feature_1', 'feature_2', 'feature_3']].mean(axis = 1)
    test['feature_max'] = test[['feature_1', 'feature_2', 'feature_3']].max(axis = 1)
    test['feature_min'] = test[['feature_1', 'feature_2', 'feature_3']].min(axis = 1)
    test['feature_var'] = test[['feature_1', 'feature_2', 'feature_3']].std(axis = 1)
    
    gc.collect()
    return train,test

In [4]:
def historical_transactions_featurization(historical_transactions):
    # imputing missing values in historical_transactions
    historical_transactions['category_3'].fillna(historical_transactions['category_3'].mode()[0], inplace = True)
    historical_transactions['merchant_id'].fillna(historical_transactions['merchant_id'].mode()[0], inplace = True)
    historical_transactions['category_2'].fillna(historical_transactions['category_2'].mode()[0], inplace = True)
    
    #encoding categorical features
    historical_transactions['category_1'] = historical_transactions['category_1'].map({'Y':1, 'N':0}).astype(int)
    historical_transactions['category_3'] =historical_transactions['category_3'].map({'A':0, 'B':1, 'C':2}).astype(int)
    historical_transactions['authorized_flag'] = historical_transactions['authorized_flag'].map({'Y':1, 'N':0}).astype(int)
    
    historical_transactions['installments'].replace(-1, np.nan, inplace = True)
    historical_transactions['installments'].replace(999, np.nan, inplace = True)
    historical_transactions['installments'].fillna(historical_transactions['installments'].mode()[0], inplace = True)
    
    historical_transactions['purchase_date'] = pd.to_datetime(historical_transactions['purchase_date'])
    historical_transactions['purchase_year'] = historical_transactions['purchase_date'].dt.year
    historical_transactions['purchase_day'] = historical_transactions['purchase_date'].dt.day
    historical_transactions['purchase_month'] = historical_transactions['purchase_date'].dt.month
    historical_transactions['purchase_week'] = historical_transactions['purchase_date'].dt.week
    historical_transactions['purchase_day'] = historical_transactions['purchase_date'].dt.day
    historical_transactions['purchase_dayofweek'] = historical_transactions['purchase_date'].dt.dayofweek
    historical_transactions['purchase_dayofyear'] = historical_transactions['purchase_date'].dt.dayofyear
    historical_transactions['purchase_hour'] = historical_transactions['purchase_date'].dt.hour
    historical_transactions['purchase_minute'] = historical_transactions['purchase_date'].dt.minute
    historical_transactions['purchase_second'] = historical_transactions['purchase_date'].dt.second
    historical_transactions['purchase_month_end'] = historical_transactions['purchase_date'].dt.is_month_end.astype(int)
    historical_transactions['purchase_month_start'] = historical_transactions['purchase_date'].dt.is_month_start.astype(int)
    historical_transactions['purchase_quarter_start'] = historical_transactions['purchase_date'].dt.is_quarter_start.astype(int)
    historical_transactions['purchase_quarter_end'] = historical_transactions['purchase_date'].dt.is_quarter_end.astype(int)
    historical_transactions['purchase_year_start'] = historical_transactions['purchase_date'].dt.is_year_start.astype(int)
    historical_transactions['purchase_year_end'] = historical_transactions['purchase_date'].dt.is_year_end.astype(int)
    
    historical_transactions['purchase_is_weekend'] = (historical_transactions.purchase_dayofweek >=5).astype(int)
    historical_transactions['purchase_is_weekday'] = (historical_transactions.purchase_dayofweek <5).astype(int)
    historical_transactions['month_difference'] = ((datetime.datetime.today() - historical_transactions['purchase_date']).dt.days)//30
    historical_transactions['month_difference'] += historical_transactions['month_lag']
    #denormalizing purchase amount
    historical_transactions['purchase_amount'] = np.round(historical_transactions['purchase_amount']/0.00150265118 + 497.06 , 2)
    
    historical_transactions = reduce_memory(historical_transactions)
    
    #aggregation with grouping by card id
    aggregation_dict = {'card_id' : ['size'],
                    'city_id': ['nunique'],
                   'category_1': ['max', 'min', 'sum', 'mean'],
                   'installments' : ['sum', 'max', 'min', 'mean', 'var','skew'],
                   'category_3' : ['sum', 'mean'],
                   'merchant_category_id': ['nunique'],
                   'merchant_id': ['nunique'],
                   'month_lag': ['sum', 'max', 'min', 'var', 'mean', 'skew'],
                   'purchase_amount' : ['sum', 'mean', 'var', 'max', 'min', 'skew'],
                   'purchase_date' : ['max', 'min'],
                   'category_2': ['sum', 'mean'],
                   'state_id' : ['nunique'],
                   'subsector_id' : ['nunique'],
                   'purchase_year' : ['max', 'min', 'nunique'],
                   'purchase_day' : ['max', 'min', 'nunique'],
                   'purchase_month' : ['max', 'min', 'nunique'],
                   'purchase_week' : ['max', 'min', 'nunique'],
                   'purchase_dayofweek' : ['max', 'min', 'nunique','mean'],
                   'purchase_dayofyear' : ['max', 'min', 'nunique', 'mean'],
                   'purchase_hour' : ['max', 'min', 'nunique', 'mean'],
                   'purchase_minute': ['max', 'min', 'nunique', 'mean'],
                   'purchase_second': ['max', 'min', 'nunique', 'mean'],
                   'purchase_month_end' : ['nunique', 'mean'],
                   'purchase_month_start' : ['nunique', 'mean'],
                   'purchase_quarter_start' : ['nunique', 'mean'],
                   'purchase_quarter_end' : ['nunique', 'mean'],
                   'purchase_year_start': ['nunique', 'mean'],
                   'purchase_year_end': ['nunique', 'mean'],
                   'purchase_is_weekday' : ['nunique', 'mean'],
                   'purchase_is_weekend' : ['nunique', 'mean'],
                   'month_difference' : ['max', 'min', 'mean', 'var', 'skew']}
    historical_transactions_aggregated = historical_transactions.groupby('card_id').agg(aggregation_dict)
    historical_transactions_aggregated.columns = ['trans_' + '_'.join(col) for col in historical_transactions_aggregated.columns.values]
    historical_transactions_aggregated.reset_index(inplace = True)
    
    historical_transactions_aggregated['trans_purchase_max_min'] = (historical_transactions_aggregated['trans_purchase_date_max'] - historical_transactions_aggregated['trans_purchase_date_min']).dt.days
    historical_transactions_aggregated['trans_purchase_date_uptomax'] = (datetime.datetime.today() - historical_transactions_aggregated['trans_purchase_date_max']).dt.days
    historical_transactions_aggregated['trans_purchase_date_uptomin'] = (datetime.datetime.today() - historical_transactions_aggregated['trans_purchase_date_min']).dt.days
    
    gc.collect()
    return historical_transactions_aggregated

In [5]:
def new_merchants_featurization(new_merchant_trans):
    # imputing missing values in new_merchant_transactions
    new_merchant_trans['category_3'].fillna(new_merchant_trans['category_3'].mode()[0], inplace = True)
    new_merchant_trans['merchant_id'].fillna(new_merchant_trans['merchant_id'].mode()[0], inplace = True)
    new_merchant_trans['category_2'].fillna(new_merchant_trans['category_2'].mode()[0], inplace = True)

    #encoding categorical features
    new_merchant_trans['category_1'] = new_merchant_trans['category_1'].map({'Y':1, 'N':0}).astype(int)
    new_merchant_trans['category_3'] = new_merchant_trans['category_3'].map({'A':0, 'B':1, 'C':2}).astype(int)
    new_merchant_trans['authorized_flag'] = new_merchant_trans['authorized_flag'].map({'Y':1, 'N':0}).astype(int)
    
    new_merchant_trans['installments'].replace(-1, np.nan, inplace = True)
    new_merchant_trans['installments'].replace(999, np.nan, inplace = True)
    new_merchant_trans['installments'].fillna(new_merchant_trans['installments'].mode()[0], inplace = True)
    
    new_merchant_trans['purchase_date'] = pd.to_datetime(new_merchant_trans['purchase_date'])
    new_merchant_trans['purchase_year'] = new_merchant_trans['purchase_date'].dt.year
    new_merchant_trans['purchase_day'] = new_merchant_trans['purchase_date'].dt.day
    new_merchant_trans['purchase_month'] = new_merchant_trans['purchase_date'].dt.month
    new_merchant_trans['purchase_week'] = new_merchant_trans['purchase_date'].dt.week
    new_merchant_trans['purchase_day'] = new_merchant_trans['purchase_date'].dt.day
    new_merchant_trans['purchase_dayofweek'] = new_merchant_trans['purchase_date'].dt.dayofweek
    new_merchant_trans['purchase_dayofyear'] = new_merchant_trans['purchase_date'].dt.dayofyear
    new_merchant_trans['purchase_hour'] = new_merchant_trans['purchase_date'].dt.hour
    new_merchant_trans['purchase_minute'] = new_merchant_trans['purchase_date'].dt.minute
    new_merchant_trans['purchase_second'] = new_merchant_trans['purchase_date'].dt.second
    new_merchant_trans['purchase_month_end'] = new_merchant_trans['purchase_date'].dt.is_month_end.astype(int)
    new_merchant_trans['purchase_month_start'] = new_merchant_trans['purchase_date'].dt.is_month_start.astype(int)
    new_merchant_trans['purchase_quarter_start'] = new_merchant_trans['purchase_date'].dt.is_quarter_start.astype(int)
    new_merchant_trans['purchase_quarter_end'] =new_merchant_trans['purchase_date'].dt.is_quarter_end.astype(int)
    new_merchant_trans['purchase_year_start'] = new_merchant_trans['purchase_date'].dt.is_year_start.astype(int)
    new_merchant_trans['purchase_year_end'] = new_merchant_trans['purchase_date'].dt.is_year_end.astype(int)
    
    new_merchant_trans['purchase_is_weekend'] = (new_merchant_trans.purchase_dayofweek >=5).astype(int)
    new_merchant_trans['purchase_is_weekday'] = (new_merchant_trans.purchase_dayofweek <5).astype(int)
    new_merchant_trans['month_difference'] = ((datetime.datetime.today() - new_merchant_trans['purchase_date']).dt.days)//30
    new_merchant_trans['month_difference'] += new_merchant_trans['month_lag']
    # denormalizing purchase amount
    new_merchant_trans['purchase_amount'] = np.round(new_merchant_trans['purchase_amount']/0.00150265118 + 497.06, 2)
    
    new_merchant_trans = reduce_memory(new_merchant_trans)
    
    #aggregating historical_transactions by card_id
    aggregation_dict = {'card_id' : ['size'],
                    'city_id': ['nunique'],
                   'category_1': ['max', 'min', 'sum', 'mean'],
                   'installments' : ['sum', 'max', 'min', 'mean', 'var','skew'],
                   'category_3' : ['sum', 'mean'],
                   'merchant_category_id': ['nunique'],
                   'merchant_id': ['nunique'],
                   'month_lag': ['sum', 'max', 'min', 'var', 'mean', 'skew'],
                   'purchase_amount' : ['sum', 'mean', 'var', 'max', 'min', 'skew'],
                   'purchase_date' : ['max', 'min'],
                   'category_2': ['sum', 'mean'],
                   'state_id' : ['nunique'],
                   'subsector_id' : ['nunique'],
                   'purchase_year' : ['max', 'min', 'nunique'],
                   'purchase_day' : ['max', 'min', 'nunique'],
                   'purchase_month' : ['max', 'min', 'nunique'],
                   'purchase_week' : ['max', 'min', 'nunique'],
                   'purchase_dayofweek' : ['max', 'min', 'nunique','mean'],
                   'purchase_dayofyear' : ['max', 'min', 'nunique', 'mean'],
                   'purchase_hour' : ['max', 'min', 'nunique', 'mean'],
                   'purchase_minute': ['max', 'min', 'nunique', 'mean'],
                   'purchase_second': ['max', 'min', 'nunique', 'mean'],
                   'purchase_month_end' : ['nunique', 'mean'],
                   'purchase_month_start' : ['nunique', 'mean'],
                   'purchase_quarter_start' : ['nunique', 'mean'],
                   'purchase_quarter_end' : ['nunique', 'mean'],
                   'purchase_year_start': ['nunique', 'mean'],
                   'purchase_year_end': ['nunique', 'mean'],
                   'purchase_is_weekday' : ['nunique', 'mean'],
                   'purchase_is_weekend' : ['nunique', 'mean'],
                   'month_difference' : ['max', 'min', 'mean', 'var', 'skew']}
    new_merchant_trans_aggregated =  new_merchant_trans.groupby('card_id').agg(aggregation_dict)
    new_merchant_trans_aggregated.columns = ['new_' + '_'.join(col) for col in new_merchant_trans_aggregated.columns.values]
    new_merchant_trans_aggregated.reset_index(inplace = True)
    new_merchant_trans_aggregated['new_purchase_max_min'] = (new_merchant_trans_aggregated['new_purchase_date_max'] - new_merchant_trans_aggregated['new_purchase_date_min']).dt.days
    new_merchant_trans_aggregated['new_purchase_date_uptomax'] = (datetime.datetime.today() - new_merchant_trans_aggregated['new_purchase_date_max']).dt.days
    new_merchant_trans_aggregated['new_purchase_date_uptomin'] = (datetime.datetime.today() - new_merchant_trans_aggregated['new_purchase_date_min']).dt.days
    
    gc.collect
    return new_merchant_trans_aggregated

In [6]:
def final_featurization(train):
    print('loading test data....')
    test = pd.read_csv('test.csv')
    print('loading historical data....')
    historical_transactions = reduce_memory(pd.read_csv('historical_transactions.csv'))
    print('loading new merchants data....')
    new_merchant_trans = reduce_memory(pd.read_csv('new_merchant_transactions.csv'))
    print('Featurizing train and test....')
    train_feat, test_feat = train_featurization(train, test)
    print('Featurizing historical transactions....')
    hist_aggregated = historical_transactions_featurization(historical_transactions)
    print('Featurizing new merchant transactions....')
    new_aggregated = new_merchants_featurization(new_merchant_trans)
    print('Merging data....')
    # merging train and aggregated historical transactions
    train_hist = pd.merge(train, hist_aggregated, how = 'left', on = 'card_id')
    test_hist = pd.merge(test, hist_aggregated, how = 'left', on = 'card_id')
    # merging again with aggregated new merchants data
    train_trans = pd.merge(train_hist, new_aggregated, how = 'left', on = 'card_id')
    test_trans = pd.merge(test_hist, new_aggregated, how = 'left', on = 'card_id')
    print('Featurizing....')
    train_trans['trans_purchase_date_max'] = pd.to_datetime(train_trans['trans_purchase_date_max'])
    train_trans['trans_purchase_date_min'] = pd.to_datetime(train_trans['trans_purchase_date_min'])
    train_trans['new_purchase_date_max'] = pd.to_datetime(train_trans['new_purchase_date_max'])
    train_trans['new_purchase_date_min'] = pd.to_datetime(train_trans['new_purchase_date_min'])

    test_trans['trans_purchase_date_max'] = pd.to_datetime(test_trans['trans_purchase_date_max'])
    test_trans['trans_purchase_date_min'] = pd.to_datetime(test_trans['trans_purchase_date_min'])
    test_trans['new_purchase_date_max'] = pd.to_datetime(test_trans['new_purchase_date_max'])
    test_trans['new_purchase_date_min'] = pd.to_datetime(test_trans['new_purchase_date_min'])
    
    #https://www.kaggle.com/chauhuynh/my-first-kernel-3-699
    train_trans['trans_first_buy'] = train_trans['trans_purchase_date_min'] - train_trans['first_active_month']
    train_trans['new_first_buy'] = train_trans['new_purchase_date_min'] - train_trans['first_active_month']

    train_trans['card_id_total'] = train_trans['trans_card_id_size'] + train_trans['new_card_id_size']
    train_trans['card_id_ratio'] = train_trans['trans_card_id_size'] / train_trans['new_card_id_size']

    train_trans['purchase_amount_total'] = train_trans['trans_purchase_amount_sum'] + train_trans['new_purchase_amount_sum']
    train_trans['purchase_amount_mean'] = train_trans['trans_purchase_amount_mean'] + train_trans['new_purchase_amount_mean']
    train_trans['purchase_amount_var'] = train_trans['trans_purchase_amount_var'] + train_trans['new_purchase_amount_var']
    train_trans['purchase_amount_max'] = train_trans['trans_purchase_amount_max'] + train_trans['new_purchase_amount_max']
    train_trans['purchase_amount_min'] = train_trans['trans_purchase_amount_min'] + train_trans['new_purchase_amount_min']
    train_trans['purchase_amount_skew'] = train_trans['trans_purchase_amount_skew'] + train_trans['new_purchase_amount_skew']

    train_trans['installments_total'] = train_trans['trans_installments_sum'] + train_trans['new_installments_sum']
    train_trans['installments_max'] = train_trans['trans_installments_max'] + train_trans['new_installments_max']
    train_trans['installments_min'] = train_trans['trans_installments_min'] + train_trans['new_installments_min']
    train_trans['installments_mean'] = train_trans['trans_installments_mean'] + train_trans['new_installments_mean']
    train_trans['installments_var'] = train_trans['trans_installments_var'] + train_trans['new_installments_var']
    train_trans['installments_skew'] = train_trans['trans_installments_skew'] + train_trans['new_installments_skew']

    train_trans['month_lag_total'] = train_trans['trans_month_lag_sum'] + train_trans['new_month_lag_sum']
    train_trans['month_lag_max'] = train_trans['trans_month_lag_max'] + train_trans['new_month_lag_max']
    train_trans['month_lag_min'] = train_trans['trans_month_lag_min'] + train_trans['new_month_lag_min']
    train_trans['month_lag_mean'] = train_trans['trans_month_lag_mean'] + train_trans['new_month_lag_mean']
    train_trans['month_lag_var'] = train_trans['trans_month_lag_var'] + train_trans['new_month_lag_var']
    train_trans['month_lag_skew'] = train_trans['trans_month_lag_skew'] + train_trans['new_month_lag_skew']

    train_trans['month_diff_max'] = train_trans['trans_month_difference_max'] + train_trans['new_month_difference_max']
    train_trans['month_diff_min'] = train_trans['trans_month_difference_min'] + train_trans['new_month_difference_min']
    train_trans['month_diff_mean'] = train_trans['trans_month_difference_mean'] + train_trans['new_month_difference_mean']
    train_trans['month_diff_var'] = train_trans['trans_month_difference_var'] + train_trans['new_month_difference_var']
    train_trans['month_diff_skew'] = train_trans['trans_month_difference_skew'] + train_trans['new_month_difference_skew']
    
    test_trans['trans_first_buy'] = test_trans['trans_purchase_date_min'] - test_trans['first_active_month']
    test_trans['new_first_buy'] = test_trans['new_purchase_date_min'] - test_trans['first_active_month']

    test_trans['card_id_total'] = test_trans['trans_card_id_size'] + test_trans['new_card_id_size']
    test_trans['card_id_ratio'] = test_trans['trans_card_id_size'] / test_trans['new_card_id_size']

    test_trans['purchase_amount_total'] = test_trans['trans_purchase_amount_sum'] + test_trans['new_purchase_amount_sum']
    test_trans['purchase_amount_mean'] = test_trans['trans_purchase_amount_mean'] + test_trans['new_purchase_amount_mean']
    test_trans['purchase_amount_var'] = test_trans['trans_purchase_amount_var'] + test_trans['new_purchase_amount_var']
    test_trans['purchase_amount_max'] = test_trans['trans_purchase_amount_max'] + test_trans['new_purchase_amount_max']
    test_trans['purchase_amount_min'] = test_trans['trans_purchase_amount_min'] + test_trans['new_purchase_amount_min']
    test_trans['purchase_amount_skew'] = test_trans['trans_purchase_amount_skew'] + test_trans['new_purchase_amount_skew']

    test_trans['installments_total'] = test_trans['trans_installments_sum'] + test_trans['new_installments_sum']
    test_trans['installments_max'] = test_trans['trans_installments_max'] + test_trans['new_installments_max']
    test_trans['installments_min'] = test_trans['trans_installments_min'] + test_trans['new_installments_min']
    test_trans['installments_mean'] = test_trans['trans_installments_mean'] + test_trans['new_installments_mean']
    test_trans['installments_var'] = test_trans['trans_installments_var'] + test_trans['new_installments_var']
    test_trans['installments_skew'] = test_trans['trans_installments_skew'] + test_trans['new_installments_skew']

    test_trans['month_lag_total'] = test_trans['trans_month_lag_sum'] + test_trans['new_month_lag_sum']
    test_trans['month_lag_max'] = test_trans['trans_month_lag_max'] + test_trans['new_month_lag_max']
    test_trans['month_lag_min'] = test_trans['trans_month_lag_min'] + test_trans['new_month_lag_min']
    test_trans['month_lag_mean'] = test_trans['trans_month_lag_mean'] + test_trans['new_month_lag_mean']
    test_trans['month_lag_var'] = test_trans['trans_month_lag_var'] + test_trans['new_month_lag_var']
    test_trans['month_lag_skew'] = test_trans['trans_month_lag_skew'] + test_trans['new_month_lag_skew']

    test_trans['month_diff_max'] = test_trans['trans_month_difference_max'] + test_trans['new_month_difference_max']
    test_trans['month_diff_min'] = test_trans['trans_month_difference_min'] + test_trans['new_month_difference_min']
    test_trans['month_diff_mean'] = test_trans['trans_month_difference_mean'] + test_trans['new_month_difference_mean']
    test_trans['month_diff_var'] = test_trans['trans_month_difference_var'] + test_trans['new_month_difference_var']
    test_trans['month_diff_skew'] = test_trans['trans_month_difference_skew'] + test_trans['new_month_difference_skew']
    # replacing inf values
    train_trans.replace([-np.inf, np.inf], np.nan, inplace = True)
    test_trans.replace([-np.inf, np.inf], np.nan, inplace = True)
    # imputing missing values with mode
    train_na = train_trans.columns[train_trans.isna().any()]
    test_na = test_trans.columns[test_trans.isna().any()]
    for i in range(len(train_na)):
        train_trans[train_na[i]].fillna(train_trans[train_na[i]].mode()[0], inplace = True)
        test_trans[train_na[i]].fillna(test_trans[train_na[i]].mode()[0], inplace = True)
    s = train_trans.select_dtypes(include = ['datetime64[ns]']).columns
    train_trans = train_trans.drop(s, axis = 1)
    test_trans = test_trans.drop(s, axis = 1)
    ns = train_trans.select_dtypes(include = ['timedelta64[ns]']).columns
    for n in ns:
        train_trans[n] = train_trans[n].astype(int)
        test_trans[n] = test_trans[n].astype(int)
    cols = [col for col in train_trans.columns if col not in ['card_id', 'outliers', 'target']]
    return train_trans[cols]

In [10]:
pipe = Pipeline([('preprocessing', final_featurization(da)), ('model', pd.read_pickle('best_model.pkl'))])
pipe

NameError: name 'data' is not defined

In [7]:
def predict_function(train):
    features = final_featurization(train)
    model = pd.read_pickle('best_model.pkl')
    predictions = model.predict(features)
    print('Predicted loyalty score is : ', predictions)

In [20]:
train = pd.read_csv('train.csv')

In [21]:
predict_function(train, train['target'])

loading test data....
loading historical data....
Memory usage reduced to 1622.97 Mb(47.8% reduction)
loading new merchants data....
Memory usage reduced to 169.08 Mb(19.4% reduction)
Featurizing train and test....
Featurizing historical transactions....




Memory usage reduced to 1524.21 Mb(73.8% reduction)
Featurizing new merchant transactions....


  result = self._values.round(decimals)


Memory usage reduced to 121.29 Mb(70.0% reduction)
Merging data....
Featurizing....
Predicted loyalty score is :  [-0.31623899 -0.71026829  0.50407203 ... -0.42776232 -2.03740717
 -0.47288799]
rmse is :  3.4913335284219307
