# Load packages

In [1]:
import os
import gc

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

#import lightgbm as lgb
import xgboost as xgb

import warnings
warnings.simplefilter('ignore', FutureWarning)

from xgboost import XGBRegressor

# Load datasets

In [2]:
train = pd.read_csv('train.csv', parse_dates=["first_active_month"])
test = pd.read_csv('test.csv', parse_dates=["first_active_month"])
sample_submission = pd.read_csv('sample_submission.csv')

In [7]:
train.shape, test.shape, sample_submission.shape

((201917, 6), (123623, 5), (123623, 2))

In [8]:
train.head(10)

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target
0,2017-06-01,C_ID_92a2005557,5,2,1,-0.820283
1,2017-01-01,C_ID_3d0044924f,4,1,0,0.392913
2,2016-08-01,C_ID_d639edf6cd,2,2,0,0.688056
3,2017-09-01,C_ID_186d6a6901,4,3,0,0.142495
4,2017-11-01,C_ID_cdbd2c0db2,1,3,0,-0.159749
5,2016-09-01,C_ID_0894217f2f,4,2,0,0.871585
6,2016-12-01,C_ID_7e63323c00,3,2,1,0.230129
7,2017-09-01,C_ID_dfa21fc124,3,2,1,2.13585
8,2017-08-01,C_ID_fe0fdac8ea,2,1,0,-0.065406
9,2016-08-01,C_ID_bf62c0b49d,2,2,0,0.300062


In [9]:
test.head(10)

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3
0,2017-04-01,C_ID_0ab67a22ab,3,3,1
1,2017-01-01,C_ID_130fd0cbdd,2,3,0
2,2017-08-01,C_ID_b709037bc5,5,1,1
3,2017-12-01,C_ID_d27d835a9f,2,1,0
4,2015-12-01,C_ID_2b5e3df5c2,5,1,1
5,2017-07-01,C_ID_5814b4f13c,5,1,1
6,2017-10-01,C_ID_a1b3c75277,1,2,0
7,2017-02-01,C_ID_f7cada36d3,3,1,1
8,2017-07-01,C_ID_9d2bc8dfc4,4,3,0
9,2016-03-01,C_ID_6d8dba8475,5,1,1


# Preprocessing

In [3]:
def missing_impute(df):
    for i in df.columns:
        if df[i].dtype == "object":
            df[i] = df[i].fillna("other")
        elif (df[i].dtype == "int64" or df[i].dtype == "float64"):
            df[i] = df[i].fillna(df[i].mean())
        else:
            pass
    return df

In [4]:
def datetime_extract(df, dt_col='first_active_month'):
    df['date'] = df[dt_col].dt.date 
    df['day'] = df[dt_col].dt.day 
    df['dayofweek'] = df[dt_col].dt.dayofweek
    df['dayofyear'] = df[dt_col].dt.dayofyear
    df['days_in_month'] = df[dt_col].dt.days_in_month
    df['daysinmonth'] = df[dt_col].dt.daysinmonth 
    df['month'] = df[dt_col].dt.month
    df['week'] = df[dt_col].dt.week 
    df['weekday'] = df[dt_col].dt.weekday
    df['weekofyear'] = df[dt_col].dt.weekofyear
    df['year'] = train[dt_col].dt.year

    return df

In [5]:
train.isnull().sum()

first_active_month    0
card_id               0
feature_1             0
feature_2             0
feature_3             0
target                0
dtype: int64

In [7]:
# Do extract datetime values
train = datetime_extract(train, dt_col='first_active_month')
test = datetime_extract(test, dt_col='first_active_month')

In [8]:
train.shape, test.shape

((201917, 17), (123623, 16))

**Merge train and test with historical transactions**

In [9]:
excluded_features = ['first_active_month', 'card_id', 'target', 'date']
train_features = [c for c in train.columns if c not in excluded_features]

In [10]:
for f in train_features:
    print(f)

feature_1
feature_2
feature_3
day
dayofweek
dayofyear
days_in_month
daysinmonth
month
week
weekday
weekofyear
year


In [11]:
train.isnull().sum()

first_active_month    0
card_id               0
feature_1             0
feature_2             0
feature_3             0
target                0
date                  0
day                   0
dayofweek             0
dayofyear             0
days_in_month         0
daysinmonth           0
month                 0
week                  0
weekday               0
weekofyear            0
year                  0
dtype: int64

In [20]:
train.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target,date,day,dayofweek,dayofyear,days_in_month,daysinmonth,month,week,weekday,weekofyear,year
0,2017-06-01,C_ID_92a2005557,5,2,1,-0.820283,2017-06-01,1,3,152,30,30,6,22,3,22,2017
1,2017-01-01,C_ID_3d0044924f,4,1,0,0.392913,2017-01-01,1,6,1,31,31,1,52,6,52,2017
2,2016-08-01,C_ID_d639edf6cd,2,2,0,0.688056,2016-08-01,1,0,214,31,31,8,31,0,31,2016
3,2017-09-01,C_ID_186d6a6901,4,3,0,0.142495,2017-09-01,1,4,244,30,30,9,35,4,35,2017
4,2017-11-01,C_ID_cdbd2c0db2,1,3,0,-0.159749,2017-11-01,1,2,305,30,30,11,44,2,44,2017


--> Still missing values. So need to fill NA again

In [12]:
for col in train_features:
    for df in [train, test]:
        if df[col].dtype == "float64":
            df[col] = df[col].fillna(df[col].mean())

In [21]:
X = train.drop(['target', 'first_active_month','card_id','date'], axis = 1)
y = train['target']


from sklearn.cross_validation import train_test_split


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
model = XGBRegressor(  num_leaves = 31,
        learning_rate = 0.03,
        n_estimators = 1000,
        subsample = .9,
        colsample_bytree = .9,
        random_state = 1)


In [24]:
model.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.9, gamma=0, learning_rate=0.03, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, num_leaves=31, objective='reg:linear',
       random_state=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=0.9)

# Modeling with LightGBM

In [35]:
oof_preds= model.predict(X_test)

test_preds = model.predict(test.drop([ 'first_active_month','card_id','date'], axis = 1)

SyntaxError: unexpected EOF while parsing (<ipython-input-35-95c9cfb376f5>, line 2)

# Make submission

In [36]:
test_preds

array([-0.55086863, -0.9590353 , -0.83738804, ..., -0.15823603,
       -0.4271899 , -0.0520798 ], dtype=float32)

In [38]:
# Make submission 1
sample_submission['target'] = test_preds
sample_submission.to_csv("xgb1.csv", index=False)
sample_submission.head()

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,-0.550869
1,C_ID_130fd0cbdd,-0.959035
2,C_ID_b709037bc5,-0.837388
3,C_ID_d27d835a9f,-0.06564
4,C_ID_2b5e3df5c2,0.183542


In [39]:
# Make submission 2 - revert from submission 1
sample_submission['target'] = -1*test_preds
sample_submission.to_csv("submission2.csv", index=False)
sample_submission.head()

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,0.550869
1,C_ID_130fd0cbdd,0.959035
2,C_ID_b709037bc5,0.837388
3,C_ID_d27d835a9f,0.06564
4,C_ID_2b5e3df5c2,-0.183542


In [41]:
# Length of submission
len(test_preds)

123623

In [42]:
# How many positive target values
np.sum(np.array(sub_preds) >= 0, axis=0)


NameError: name 'sub_preds' is not defined

In [43]:
sample_submission

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,0.550869
1,C_ID_130fd0cbdd,0.959035
2,C_ID_b709037bc5,0.837388
3,C_ID_d27d835a9f,0.065640
4,C_ID_2b5e3df5c2,-0.183542
5,C_ID_5814b4f13c,0.911021
6,C_ID_a1b3c75277,0.443834
7,C_ID_f7cada36d3,0.713410
8,C_ID_9d2bc8dfc4,0.384305
9,C_ID_6d8dba8475,1.034051


In [44]:
abs(-2)

2