In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import lightgbm as lgb
import os
import time
import warnings
import gc
gc.collect()
import os
from six.moves import urllib
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
warnings.filterwarnings('ignore')
%matplotlib inline
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the old names; use whichever this install has.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
from scipy.stats import norm, skew
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedKFold
from sklearn.metrics import mean_squared_error
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
#Reduce the memory usage - Thanks to Panchajanya Banerjee
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype holding their range.

    Columns whose dtype is int16/int32/int64 or float16/float32/float64 are
    converted in place; every other column is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Float columns whose range fits are cast to float16, which keeps only about
    three significant decimal digits, so their values are rounded.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Try the integer types from narrowest to widest. The bounds are
            # inclusive: a column spanning exactly [-128, 127] fits in int8.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (or min/max are NaN/inf): use float64.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame that reports zero bytes does not divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Load the historical transactions, then downcast their numeric columns.
hist_transactions = pd.read_csv('../input/historical_transactions.csv')
hist_transactions = reduce_mem_usage(hist_transactions)
gc.collect()


In [None]:
# Actually run the collector: a bare `gc.collect` only names the function.
gc.collect()
# Keep the preview as the last expression so the notebook renders it.
hist_transactions.head()

In [None]:
# Fill missing categorical values with fixed defaults.
hist_transactions['category_3'] = hist_transactions['category_3'].replace(np.nan, 'A')
hist_transactions['merchant_id'] = hist_transactions['merchant_id'].replace(np.nan, 'M_ID_00a6ca8a8a')
hist_transactions['category_2'] = hist_transactions['category_2'].replace(np.nan, 1.0)

# Encode the Y/N flags and the A/B/C category as integers.
# NOTE: running this cell twice maps the already-encoded values to NaN.
hist_transactions['authorized_flag'] = hist_transactions['authorized_flag'].map({'Y': 1, 'N': 0})
hist_transactions['category_1'] = hist_transactions['category_1'].map({'Y': 1, 'N': 0})
hist_transactions['category_3'] = hist_transactions['category_3'].map({'A':0, 'B':1, 'C':2})

# Calendar features derived from the purchase timestamp.
hist_transactions['purchase_date'] = pd.to_datetime(hist_transactions['purchase_date'])
hist_transactions['year'] = hist_transactions['purchase_date'].dt.year
# Series.dt.weekofyear was removed in pandas 2.0; use isocalendar() when the
# installed pandas has it and fall back to the legacy accessor otherwise.
# The column is only fed to `nunique`, so the differing integer dtype is harmless.
if hasattr(hist_transactions['purchase_date'].dt, 'isocalendar'):
    hist_transactions['weekofyear'] = hist_transactions['purchase_date'].dt.isocalendar().week
else:
    hist_transactions['weekofyear'] = hist_transactions['purchase_date'].dt.weekofyear
hist_transactions['month'] = hist_transactions['purchase_date'].dt.month
hist_transactions['dayofweek'] = hist_transactions['purchase_date'].dt.dayofweek
hist_transactions['weekend'] = (hist_transactions.purchase_date.dt.weekday >=5).astype(int)
hist_transactions['hour'] = hist_transactions['purchase_date'].dt.hour
hist_transactions['quarter'] = hist_transactions['purchase_date'].dt.quarter
hist_transactions['is_month_start'] = hist_transactions['purchase_date'].dt.is_month_start

# Months elapsed between each purchase and a reference date, shifted by month_lag.
# The reference is the latest purchase in this data rather than
# datetime.today(), so the feature no longer changes with the day the
# notebook happens to be run.
hist_reference_date = hist_transactions['purchase_date'].max()
hist_transactions['month_diff'] = ((hist_reference_date - hist_transactions['purchase_date']).dt.days)//30
hist_transactions['month_diff'] += hist_transactions['month_lag']

# Per-card aggregation spec: column name -> list of statistics to compute.
# Key order is kept as-is because it determines the order of the output columns.
agg_values = dict([
    ('authorized_flag', ['nunique']),
    ('city_id', ['nunique']),
    ('category_1', ['nunique']),
    ('installments', ['mean', 'max', 'min', 'var', 'std', 'sum']),
    ('category_3', ['nunique']),
    ('merchant_category_id', ['nunique']),
    ('month_lag', ['mean', 'max', 'min', 'var', 'std', 'sum']),
    ('purchase_amount', ['mean', 'max', 'min', 'var', 'std', 'sum']),
    ('category_2', ['nunique']),
    ('state_id', ['nunique']),
    ('subsector_id', ['nunique']),
    ('year', ['nunique']),
    ('weekofyear', ['nunique']),
    ('month', ['nunique']),
    ('dayofweek', ['nunique']),
    ('weekend', ['nunique']),
    ('hour', ['nunique']),
    ('quarter', ['nunique']),
    ('is_month_start', ['nunique']),
    ('month_diff', ['mean', 'max', 'min', 'var', 'std', 'nunique']),
    ('purchase_date', ['max', 'min']),
    ('card_id', ['count']),
])

# Collapse the historical transactions to one row per card_id.
hist_aggregate = (
    hist_transactions
    .groupby('card_id')
    .agg(agg_values)
    .reset_index()
)

# The row-level transactions are not used past this point; release them.
del hist_transactions
gc.collect()

# Flatten the (column, statistic) header into 'col_<column>_<statistic>' names.
hist_aggregate.columns = [
    'col_' + '_'.join(header).strip()
    for header in hist_aggregate.columns.values
]

# The grouping key flattens to 'col_card_id_'; give it back its plain name
# so the later merges can join on 'card_id'.
hist_aggregate = hist_aggregate.rename(columns={'col_card_id_': 'card_id'})


In [None]:
# Load train and test with first_active_month parsed as a datetime, then
# downcast the numeric columns.
# NOTE(review): reduce_mem_usage casts float columns that fit to float16, which
# presumably includes 'target' -- confirm the rounding of the label is acceptable.
train = pd.read_csv('../input/train.csv', parse_dates=["first_active_month"])
train = reduce_mem_usage(train)
test = pd.read_csv('../input/test.csv', parse_dates=["first_active_month"])
test = reduce_mem_usage(test)

# Histogram of the target to inspect its distribution and spot outliers.
train['target'].plot.hist()

In [None]:
# Flag (rather than drop) extreme targets: outliers is 1 where target < -30, else 0.
train['outliers'] = (train['target'] < -30).astype('int64')
train['outliers'].value_counts()

In [None]:
def add_first_active_features(frame):
    """Add calendar features derived from ``first_active_month`` to ``frame``.

    The columns ``month``, ``year``, ``week``, ``dayofweek``, ``days``,
    ``quarter`` and ``is_month_start`` are written in place, in that order.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with a datetime64 column named ``first_active_month``.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object that was passed in.
    """
    first_active = frame['first_active_month']

    frame['month'] = first_active.dt.month
    frame['year'] = first_active.dt.year

    # Series.dt.weekofyear was removed in pandas 2.0. isocalendar() returns a
    # nullable integer column, so convert it to what the legacy accessor
    # produced: int64 when every date is present, float64 with NaN otherwise.
    if hasattr(first_active.dt, 'isocalendar'):
        iso_week = first_active.dt.isocalendar().week
        if iso_week.isna().any():
            frame['week'] = iso_week.astype('float64')
        else:
            frame['week'] = iso_week.astype('int64')
    else:
        frame['week'] = first_active.dt.weekofyear

    frame['dayofweek'] = first_active.dt.dayofweek
    # Whole days between the activation date and 2018-02-01. Timestamp
    # arithmetic on the normalized (midnight) dates gives the same day counts
    # as subtracting date objects, without going through an object column.
    frame['days'] = (pd.Timestamp('2018-02-01') - first_active.dt.normalize()).dt.days
    frame['quarter'] = first_active.dt.quarter
    frame['is_month_start'] = first_active.dt.is_month_start
    return frame


# Apply the same feature set to both frames so they cannot drift apart.
add_first_active_features(train)
add_first_active_features(test)

In [None]:
# Left-join the per-card historical aggregates onto train and test.
# Re-running this cell merges the same columns again and yields _x/_y suffixes.
train = train.merge(hist_aggregate, how='left', on='card_id')
test = test.merge(hist_aggregate, how='left', on='card_id')

In [None]:
# Load the new-merchant transactions, then downcast their numeric columns.
transactions = pd.read_csv('../input/new_merchant_transactions.csv')
transactions = reduce_mem_usage(transactions)

In [None]:
# Preview the first rows of the new-merchant transactions.
transactions.head()

In [None]:
# Fill missing categorical values with fixed defaults.
transactions['category_3'] = transactions['category_3'].replace(np.nan, 'A')
transactions['merchant_id'] = transactions['merchant_id'].replace(np.nan, 'M_ID_00a6ca8a8a')
transactions['category_2'] = transactions['category_2'].replace(np.nan, 1.0)

# Encode the Y/N flags and the A/B/C category as integers.
# NOTE: running this cell twice maps the already-encoded values to NaN.
transactions['authorized_flag'] = transactions['authorized_flag'].map({'Y': 1, 'N': 0})
transactions['category_1'] = transactions['category_1'].map({'Y': 1, 'N': 0})
transactions['category_3'] = transactions['category_3'].map({'A':0, 'B':1, 'C':2})

# Calendar features derived from the purchase timestamp.
transactions['purchase_date'] = pd.to_datetime(transactions['purchase_date'])
transactions['year'] = transactions['purchase_date'].dt.year
# Series.dt.weekofyear was removed in pandas 2.0; use isocalendar() when the
# installed pandas has it and fall back to the legacy accessor otherwise.
# The column is only fed to `nunique`, so the differing integer dtype is harmless.
if hasattr(transactions['purchase_date'].dt, 'isocalendar'):
    transactions['weekofyear'] = transactions['purchase_date'].dt.isocalendar().week
else:
    transactions['weekofyear'] = transactions['purchase_date'].dt.weekofyear
transactions['month'] = transactions['purchase_date'].dt.month
transactions['dayofweek'] = transactions['purchase_date'].dt.dayofweek
transactions['weekend'] = (transactions.purchase_date.dt.weekday >=5).astype(int)
transactions['hour'] = transactions['purchase_date'].dt.hour
transactions['quarter'] = transactions['purchase_date'].dt.quarter
transactions['is_month_start'] = transactions['purchase_date'].dt.is_month_start

# Months elapsed between each purchase and a reference date, shifted by month_lag.
# The reference is the latest purchase in this data rather than
# datetime.today(), so the feature no longer changes with the day the
# notebook happens to be run.
trans_reference_date = transactions['purchase_date'].max()
transactions['month_diff'] = ((trans_reference_date - transactions['purchase_date']).dt.days)//30
transactions['month_diff'] += transactions['month_lag']

# Per-card aggregation spec for the new-merchant transactions:
# column name -> list of statistics to compute.
# Key order is kept as-is because it determines the order of the output columns.
agg_values = dict([
    ('authorized_flag', ['nunique']),
    ('city_id', ['nunique']),
    ('category_1', ['nunique']),
    ('installments', ['mean', 'max', 'min', 'var', 'std', 'sum']),
    ('category_3', ['nunique']),
    ('merchant_category_id', ['nunique']),
    ('month_lag', ['mean', 'max', 'min', 'var', 'std', 'sum']),
    ('purchase_amount', ['mean', 'max', 'min', 'var', 'std', 'sum']),
    ('category_2', ['nunique']),
    ('state_id', ['nunique']),
    ('subsector_id', ['nunique']),
    ('year', ['nunique']),
    ('weekofyear', ['nunique']),
    ('month', ['nunique']),
    ('dayofweek', ['nunique']),
    ('weekend', ['nunique']),
    ('hour', ['nunique']),
    ('quarter', ['nunique']),
    ('is_month_start', ['nunique']),
    ('month_diff', ['mean', 'max', 'min', 'var', 'std', 'nunique']),
    ('purchase_date', ['max', 'min']),
    ('card_id', ['count']),
])

# Collapse the new-merchant transactions to one row per card_id.
trans_aggregate = (
    transactions
    .groupby('card_id')
    .agg(agg_values)
    .reset_index()
)

In [None]:
# Flatten the (column, statistic) header into 'trans_<column>_<statistic>' names.
trans_aggregate.columns = [
    'trans_' + '_'.join(header).strip()
    for header in trans_aggregate.columns.values
]

# The grouping key flattens to 'trans_card_id_'; give it back its plain name
# so the merge below can join on 'card_id'.
trans_aggregate = trans_aggregate.rename(columns={'trans_card_id_': 'card_id'})

In [None]:
# Left-join the per-card new-merchant aggregates onto train and test.
# Re-running this cell merges the same columns again and yields _x/_y suffixes.
train = train.merge(trans_aggregate, how='left', on='card_id')
test = test.merge(trans_aggregate, how='left', on='card_id')

In [None]:
# The aggregates were merged into train and test above, so the standalone
# frame can be dropped and its memory reclaimed.
del trans_aggregate
gc.collect()

In [None]:
# Columns that must never be passed to the model as features.
# NOTE(review): the 'hist_*' and '*_x' / '*_y' names do not match any column
# built by the cells above (prefixes there are 'col_' and 'trans_'); they look
# like leftovers from an older prefix and from re-running a merge cell.
# 'col_card_id_count' is not listed although 'trans_card_id_count' is --
# confirm that is intended.
FEATS_EXCLUDED = [
    # identifiers, label and helper columns
    'first_active_month',
    'target',
    'card_id',
    'outliers',
    # legacy 'hist_' prefix
    'hist_purchase_date_max',
    'hist_purchase_date_min',
    'hist_card_id_count',
    # new-merchant aggregates
    'trans_purchase_date_max',
    'trans_purchase_date_min',
    'trans_card_id_count',
    # suffixed duplicates of the historical aggregates
    'col_purchase_date_max_y',
    'col_purchase_date_min_y',
    'col_card_id_count_y',
    'col_purchase_date_max_x',
    'col_purchase_date_min_x',
    # historical aggregates (datetime columns)
    'col_purchase_date_max',
    'col_purchase_date_min',
]

In [None]:
# Preview the training frame after both aggregate merges.
train.head()

In [None]:
# Cross-validation scheme: 5 folds repeated twice (10 fits in total).
# The author found this scored better than the earlier
# StratifiedKFold(n_splits=10, shuffle=True, random_state=326).
folds = RepeatedKFold(
    n_splits=5,
    n_repeats=2,
    random_state=4950,
)

In [None]:
print("Starting LightGBM. Train shape: {}, test shape: {}".format(train.shape, test.shape))
feats = [f for f in train.columns if f not in FEATS_EXCLUDED]

# Take the averaging weights from the splitter itself instead of a hard-coded
# 5*2, so they stay correct when `folds` is reconfigured.
n_models = folds.get_n_splits()              # total number of fitted models
n_repeats = getattr(folds, 'n_repeats', 1)   # how often each row is held out

oof_preds = np.zeros(train.shape[0])
sub_preds = np.zeros(test.shape[0])
fold_importances = []

# Hyper-parameters shared by every fold; only the seeds change per fold.
base_params = {
    'task': 'train',
    'boosting': 'goss',
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.01,
    'subsample': 0.9855232997390695,
    'max_depth': 7,
    'top_rate': 0.9064148448434349,
    'num_leaves': 63,
    'min_child_weight': 41.9612869171337,
    'other_rate': 0.0721768246018207,
    'reg_alpha': 9.677537745007898,
    'colsample_bytree': 0.5665320670155495,
    'min_split_gain': 9.820197773625843,
    'reg_lambda': 8.2532317400459,
    'min_data_in_leaf': 21,
    'verbose': -1,
}

# train['outliers'] is passed as y so a stratified splitter could be swapped in.
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train[feats], train['outliers'])):
    train_x, train_y = train[feats].iloc[train_idx], train['target'].iloc[train_idx]
    valid_x, valid_y = train[feats].iloc[valid_idx], train['target'].iloc[valid_idx]

    # set data structure
    lgb_train = lgb.Dataset(train_x,
                            label=train_y,
                            free_raw_data=False)
    lgb_test = lgb.Dataset(valid_x,
                           label=valid_y,
                           free_raw_data=False)

    fold_seed = int(2**n_fold)
    params = dict(base_params,
                  seed=fold_seed,
                  bagging_seed=fold_seed,
                  drop_seed=fold_seed)

    # NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments
    # of older lgb.train signatures -- confirm against the installed LightGBM.
    reg = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_test],
        valid_names=['train', 'test'],
        num_boost_round=10000,
        early_stopping_rounds=200,
        verbose_eval=100
    )

    # With repeated CV every row is held out n_repeats times; average those
    # predictions instead of letting the last repeat overwrite the earlier ones.
    oof_preds[valid_idx] += reg.predict(valid_x, num_iteration=reg.best_iteration) / n_repeats
    sub_preds += reg.predict(test[feats], num_iteration=reg.best_iteration) / n_models

    fold_importances.append(pd.DataFrame({
        "feature": feats,
        "importance": np.log1p(reg.feature_importance(importance_type='gain', iteration=reg.best_iteration)),
        "fold": n_fold + 1,
    }))

    del reg, train_x, train_y, valid_x, valid_y, lgb_train, lgb_test
    gc.collect()

# Concatenate once after the loop instead of growing a frame fold by fold.
feature_importance_df = pd.concat(fold_importances, axis=0)

# Out-of-fold RMSE over the whole training set.
cv_rmse = np.sqrt(np.mean((train['target'].values.astype('float64') - oof_preds) ** 2))
print("CV RMSE: {:.5f}".format(cv_rmse))

In [None]:
# File name the submission cell writes the test predictions to (relative to
# the current working directory).
submission_file_name= "submission_v4.csv"

In [None]:
def display_importances(feature_importance_df_):
    """Plot the 40 features with the highest mean importance and save the figure.

    Parameters
    ----------
    feature_importance_df_ : pandas.DataFrame
        Frame with at least the columns ``feature`` and ``importance``; a
        feature may appear on several rows (e.g. one per fold).

    Side effects
    ------------
    Draws an 8x10 inch bar chart on a new figure and writes it to
    ``lgbm_importances.png`` in the current working directory.
    """
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
    )
    top_features = mean_importance[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(top_features)]

    plt.figure(figsize=(8, 10))
    # Order the bars by the same mean importance that selected them. Sorting
    # the raw rows instead would rank each feature by its single largest value.
    sns.barplot(x="importance", y="feature", data=best_features, order=list(top_features))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances.png')

In [None]:
# Plot and save the importances collected across the cross-validation folds.
display_importances(feature_importance_df)

In [None]:
# Attach the averaged test predictions and write the two submission columns.
# The former `test = test.reset_index()` is dropped: it only added a stray
# 'index' column and made this cell raise when it was run a second time.
test['target'] = sub_preds
test[['card_id', 'target']].to_csv(submission_file_name, index=False)