In [2]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
%matplotlib notebook


In [10]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
submission=pd.read_csv("sample_submission.csv")

In [11]:
train.describe()

Unnamed: 0,feature_1,feature_2,feature_3,target
count,201917.0,201917.0,201917.0,201917.0
mean,3.105311,1.74541,0.565569,-0.393636
std,1.18616,0.751362,0.495683,3.8505
min,1.0,1.0,0.0,-33.219281
25%,2.0,1.0,0.0,-0.88311
50%,3.0,2.0,1.0,-0.023437
75%,4.0,2.0,1.0,0.765453
max,5.0,3.0,1.0,17.965068


In [12]:
train.groupby('feature_1').std()

Unnamed: 0_level_0,feature_2,feature_3,target
feature_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.81511,0.0,3.407106
2,0.679055,0.0,3.756032
3,0.817395,0.0,3.834499
4,0.759881,0.0,3.817954
5,0.489636,0.0,4.134833


In [13]:
plt.figure()
train.target.hist()
plt.yscale('log')
plt.xlabel("target")
plt.title("Target distribution")
plt.show()
#plt.close()


<IPython.core.display.Javascript object>

## Let's check how target is dependent on features

#### Unique values in each feature

In [14]:

train.feature_1.unique(),train.feature_2.unique(),train.feature_3.unique()

(array([5, 4, 2, 1, 3]), array([2, 1, 3]), array([1, 0]))

In [15]:
g = sns.pairplot(train)

<IPython.core.display.Javascript object>

In [16]:
features=test.columns[2:]
for feature in features:
    plt.figure()
    x=[i for i in range(len(train.groupby(feature).mean()))]
    #error=train.groupby(feature).std().target
    plt.bar(x,train.groupby(feature).mean().target, align='center')
    plt.title(feature)
    plt.ylabel(train.columns[-1])
    plt.gca().invert_yaxis()
    plt.show()
#plt.close()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Now let's check with standart deviation

In [17]:

features=test.columns[2:]
for feature in features:
    plt.figure()
    features=train
    x=[i for i in range(len(train.groupby(feature).mean()))]
    error=train.groupby(feature).std().target
    plt.bar(x,train.groupby(feature).mean().target, yerr=error,align='center')
    plt.title(feature)
    plt.ylabel(train.columns[-1])
    plt.gca().invert_yaxis()
    plt.show()
    
#plt.close()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Standart deviation is really high

## Let's do simple linear regression using features to estimate target

In [18]:
from sklearn import  linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

### Let's define error function

In [19]:
def rmse(y_pred,y_test):
    return np.sqrt(mean_squared_error(y_pred,y_test))

In [20]:
X_train, X_test, y_train, y_test = train_test_split( train.iloc[:,2:-1], train.iloc[:,-1], test_size=0.20, random_state=42)

In [21]:
lgr=linear_model.LinearRegression()
lgr.fit(X_train,y_train)
y_pred_train=lgr.predict(X_train)
y_pred_test=lgr.predict(X_test)
print("RMSE of X_train:")
print(rmse(y_pred_train,y_train))
print("RMSE of X_test:")
print(rmse(y_pred_test,y_test))

RMSE of X_train:
3.843534556150223
RMSE of X_test:
3.8755988760162343


In [22]:
# Coefficients of linear regression
lgr.coef_

array([-0.05187719, -0.04955379,  0.01412528])

### Let's make first benchmark submission

In [23]:
lgr.fit(train.iloc[:,2:-1],train.target)
y_pred_train=lgr.predict(train.iloc[:,2:-1])
y_pred_test=lgr.predict(test.iloc[:,2:])
print("RMSE of train:")
print(rmse(y_pred_train,train.target))



RMSE of train:
3.849965430724074


In [24]:
#Save to csv:
def save_csv(name,y_pred_test):
    ans=pd.DataFrame({'target':y_pred_test,'card_id':test.card_id})
    ans=ans[['card_id','target']]
    ans.to_csv(name,index=False)
    print("file saved: first five rows")
    return ans.head()             

In [25]:
save_csv('trail.csv',y_pred_test)

file saved: first five rows


Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,-0.435923
1,C_ID_130fd0cbdd,-0.398107
2,C_ID_b709037bc5,-0.455989
3,C_ID_d27d835a9f,-0.31061
4,C_ID_2b5e3df5c2,-0.455989


In [26]:
train.columns

Index(['first_active_month', 'card_id', 'feature_1', 'feature_2', 'feature_3',
       'target'],
      dtype='object')

### Let's look at 'first_active_month' and see if there is any correlation. Let's seperate year and month

In [27]:
train_dt=train.copy()
test_dt=test.copy()

train_dt.first_active_month=pd.to_datetime(train.first_active_month)
test_dt.first_active_month=pd.to_datetime(test.first_active_month)

train_dt['year']=train_dt.first_active_month.dt.year
test_dt['year']=test_dt.first_active_month.dt.year

train_dt['month']=train_dt.first_active_month.dt.month
test_dt['month']=test_dt.first_active_month.dt.month

### Plotting month and year versus target

In [28]:
plt.figure()
plt.plot(train_dt.groupby('month').mean().index,train_dt.groupby('month').mean().target)
plt.xlabel("Month")
plt.ylabel("Target")
plt.show()

plt.figure()
plt.plot(train_dt.groupby('year').mean().index,train_dt.groupby('year').mean().target)
plt.xlabel("year")
plt.ylabel("Target")
plt.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Now let's plot with error bars with deviation. It seems variance is very large

In [29]:
plt.figure()
err_month=train_dt.groupby('month').std().target
plt.errorbar(train_dt.groupby('month').mean().index,train_dt.groupby('month').mean().target,yerr=err_month)
plt.xlabel("Month")
plt.ylabel("Target")
plt.show()

plt.figure()
err_year=train_dt.groupby('year').std().target
plt.errorbar(train_dt.groupby('year').mean().index,train_dt.groupby('year').mean().target,yerr=err_year)
plt.xlabel("year")
plt.ylabel("Target")
plt.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Let's try to use month and year for our linear regression. First, we have convert it to one hot encoding which can be done by get_dummies pandas function.

In [30]:
train_month=pd.get_dummies(train_dt.month)
train_year=pd.get_dummies(train_dt.year)

test_month=pd.get_dummies(test_dt.month)
test_year=pd.get_dummies(test_dt.year)

In [31]:
train_one_hot=pd.concat([train_dt,train_month,train_year],axis=1)
test_one_hot=pd.concat([test_dt,test_month,test_year],axis=1)

#drop unnesesary columns for train data
train_one_hot=train_one_hot.drop(['year','month','card_id','first_active_month','target'],axis=1)
#drop unnesesary columns for test data
test_one_hot=test_one_hot.drop(['year','month','card_id','first_active_month'],axis=1)




In [32]:
lgr.fit(train_one_hot,train.target)
y_pred_train=lgr.predict(train_one_hot)
y_pred_test=lgr.predict(test_one_hot)
print("RMSE of train:")
print(rmse(y_pred_train,train.target))
print("RMSE without dates: ")
print("3.849965430724074")

RMSE of train:
3.843518504042536
RMSE without dates: 
3.849965430724074


### Very small improvement in overall performance. Linear regression coefficients seems to be very small 

In [33]:
lgr.coef_

array([-3.86032159e-02,  1.02695695e-03,  6.31321380e-02, -1.65264528e+10,
       -1.65264528e+10, -1.65264528e+10, -1.65264528e+10, -1.65264528e+10,
       -1.65264528e+10, -1.65264528e+10, -1.65264528e+10, -1.65264528e+10,
       -1.65264528e+10, -1.65264528e+10, -1.65264528e+10, -5.56882662e+10,
       -5.56882662e+10, -5.56882662e+10, -5.56882662e+10, -5.56882662e+10,
       -5.56882662e+10, -5.56882662e+10, -5.56882662e+10])

In [34]:
#uncomment to save the file
#save_csv('trail2_date.csv',y_pred_test)

file saved: first five rows


Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,-0.359314
1,C_ID_130fd0cbdd,-0.563522
2,C_ID_b709037bc5,-0.270615
3,C_ID_d27d835a9f,-0.169006
4,C_ID_2b5e3df5c2,-0.615555


## Now let's move on historical data

In [35]:
!ls '/Users/nus/Desktop/ELO'

[31mData_Dictionary.xlsx[m[m          [31msample_submission.csv[m[m
[31mhistorical_transactions.csv[m[m   [31mtest.csv[m[m
[31mmerchants.csv[m[m                 [31mtrain.csv[m[m
[31mnew_merchant_transactions.csv[m[m


In [36]:
history=pd.read_csv('/Users/nus/Desktop/ELO/historical_transactions.csv')

In [37]:
history.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16
2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37
3,Y,C_ID_4e6213e9bc,88,N,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34
4,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37


### Number of historical transaction of  each client

In [141]:
hgb = history.groupby("card_id")
hgb = hgb["purchase_amount"].size().reset_index()
hgb.columns = ["card_id", "num_hist_transactions"]

train_h = pd.merge(train.iloc[:,:-1], hgb, on="card_id", how="left")
test_h = pd.merge(test, hgb, on="card_id", how="left")
train_h['target']=train.target

In [57]:
plt.figure()
plt.scatter(train_h.groupby('num_hist_transactions').mean().index,train_h.groupby('num_hist_transactions').mean().target)
plt.ylabel('target')
plt.xlabel('num_hist_transactions')

<IPython.core.display.Javascript object>

<matplotlib.text.Text at 0x12e0632b0>

In [86]:
plt.figure()
train_h.groupby('num_hist_transactions').mean().target.plot()
#plt.xscale('log')
#plt.yscale('log')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1380f6438>

In [75]:
train_h.groupby('num_hist_transactions').mean().index

Int64Index([   2,    3,    4,    5,    6,    7,    8,    9,   10,   11,
            ...
            1546, 1550, 1593, 1634, 1647, 1665, 1786, 2066, 2143, 2912],
           dtype='int64', name='num_hist_transactions', length=1023)

### Let's use bins to visualize. Seaborn boxplot can be used

In [122]:
plt.figure()
bins = [0, 10, 20, 30, 40, 50, 75, 100, 150, 200, 500, 10000]
train_h['binned_num_hist_transactions'] = pd.cut(train_h['num_hist_transactions'], bins)
cnt_srs = train_h.groupby("binned_num_hist_transactions").mean().target

#plt.figure(figsize=(12,8))
sns.boxplot(x="binned_num_hist_transactions", y='target', data=train_h, showfliers=False)
plt.xticks(rotation='vertical')
plt.xlabel('binned_num_hist_transactions', fontsize=12)
plt.ylabel('Loyalty score', fontsize=12)
plt.title("binned_num_hist_transactions distribution")
plt.show()
plt.tight_layout()

<IPython.core.display.Javascript object>

### Let's look at purchase amount

In [142]:
gdf = history.groupby("card_id")
gdf = gdf["purchase_amount"].agg(['sum', 'mean', 'std', 'min', 'max']).reset_index()
gdf.columns = ["card_id", "sum_hist_trans", "mean_hist_trans", "std_hist_trans", "min_hist_trans", "max_hist_trans"]
train_h = pd.merge(train_h, gdf, on="card_id", how="left")
test_h = pd.merge(test_h, gdf, on="card_id", how="left")

In [103]:
# make categorical features using percentile
bins = np.percentile(train_h["sum_hist_trans"], range(0,101,10))
train_h['binned_sum_hist_trans'] = pd.cut(train_h['sum_hist_trans'], bins)
#cnt_srs = train_df.groupby("binned_sum_hist_trans")[target_col].mean()

plt.figure()
sns.boxplot(x="binned_sum_hist_trans", y='target', data=train_h, showfliers=False)
plt.xticks(rotation='vertical')
plt.xlabel('binned_sum_hist_trans', fontsize=12)
plt.ylabel('Loyalty score', fontsize=12)
plt.title("Sum of historical transaction value (Binned) distribution")
plt.show()
plt.tight_layout()

<IPython.core.display.Javascript object>

### Mean transaction dependance 

In [124]:
bins = np.percentile(train_h["mean_hist_trans"], range(0,101,10))
train_h['binned_mean_hist_trans'] = pd.cut(train_h['mean_hist_trans'], bins)
#cnt_srs = train_df.groupby("binned_mean_hist_trans")[target_col].mean()

plt.figure()
sns.boxplot(x="binned_mean_hist_trans", y='target', data=train_h, showfliers=False)
plt.xticks(rotation='vertical')
plt.xlabel('Binned Mean Historical Transactions', fontsize=12)
plt.ylabel('Loyalty score', fontsize=12)
plt.title("Mean of historical transaction value (Binned) distribution")
plt.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

## Let's do the same for new merchant transaction

In [125]:
new_merchant=pd.read_csv('/Users/nus/Desktop/ELO/new_merchant_transactions.csv')

In [306]:
nm = new_merchant.groupby("card_id")
nm = nm["purchase_amount"].size().reset_index()
nm.columns = ["card_id", "num_new_transactions"]

train_h = pd.merge(train_h, nm, on="card_id", how="left")
test_h = pd.merge(test_h, nm, on="card_id", how="left")

### There will be NaN values since not all customers have new purchases. Actually, we can convert NaN values to 0, since NaN means that customer didn't do any new purchases

In [307]:
# NaN values in historical transactions
print("Number of NaN values in historical purchases in train data")
print(train_h.num_hist_transactions.isnull().sum())
print("Number of NaN values in historical purchases in train data")
print(test_h.num_hist_transactions.isnull().sum())

Number of NaN values in historical purchases in train data
0
Number of NaN values in historical purchases in train data
0


In [308]:
# There will be NaN values 
print("Number of NaN values in new purchases in train data")
print(train_h['num_new_transactions'].isnull().sum())

print("Number of NaN values in new purchases in test data")
print(test_h['num_new_transactions'].isnull().sum())



Number of NaN values in new purchases in train data
21931
Number of NaN values in new purchases in test data
13608


### Convert NaN values to zero

In [309]:
def convert(x):
    if np.isnan(x):
        x=0
    return x
train_h.num_new_transactions=train_h.num_new_transactions.apply(convert)


In [310]:
plt.figure()
bins = np.percentile(train_h['num_new_transactions'],range(0,101,20))
train_h['bin_num_new_transactions'] = pd.cut(train_h['num_new_transactions'], bins)
#cnt_srs = train_h.groupby("num_new_transactions").mean().target

#plt.figure(figsize=(12,8))
sns.boxplot(x="bin_num_new_transactions", y='target', data=train_h, showfliers=False)
plt.xticks(rotation='vertical')
plt.xlabel('Binned_num_new_transactions', fontsize=12)
plt.ylabel('Loyalty score', fontsize=12)
plt.title("binned_num_new_transactions distribution")
plt.show()
plt.tight_layout()

<IPython.core.display.Javascript object>

### Now let's check by amount new transactions

In [313]:
gdf = new_merchant.groupby("card_id")
gdf = gdf["purchase_amount"].agg(['sum', 'mean', 'std', 'min', 'max']).reset_index()
gdf.columns = ["card_id", "sum_new_trans", "mean_new_trans", "std_new_trans", "min_new_trans", "max_new_trans"]
train_h = pd.merge(train_h, gdf, on="card_id", how="left")
test_h = pd.merge(test_h, gdf, on="card_id", how="left")

### Now let's convert NaN values to zero since NaN values will appear when no new purchases were made. Also, NaN values will appear in std_new_trans (standart deviation). I will change it to zero as well since there is no variance ( I am not sure if it right thing to do)

In [314]:
gdf.columns[1:]

Index(['sum_new_trans', 'mean_new_trans', 'std_new_trans', 'min_new_trans',
       'max_new_trans'],
      dtype='object')

In [315]:
train_h[gdf.columns[1:]].isnull().sum()

sum_new_trans     21931
mean_new_trans    21931
std_new_trans     48718
min_new_trans     21931
max_new_trans     21931
dtype: int64

In [316]:
train_h[gdf.columns[1:]]=train_h[gdf.columns[1:]].fillna(0)
test_h[gdf.columns[1:]]=test_h[gdf.columns[1:]].fillna(0)

In [317]:
train_h[gdf.columns[1:]].isnull().sum()

sum_new_trans     0
mean_new_trans    0
std_new_trans     0
min_new_trans     0
max_new_trans     0
dtype: int64

In [318]:
test_h[gdf.columns[1:]].isnull().sum()

sum_new_trans     0
mean_new_trans    0
std_new_trans     0
min_new_trans     0
max_new_trans     0
dtype: int64

### Let's plot sum_new_transaction dependence 

In [319]:
# make categorical features using percentile
bins = np.percentile(train_h["sum_new_trans"], range(0,101,10))
train_h['binned_sum_new_trans'] = pd.cut(train_h['sum_new_trans'], bins)
#cnt_srs = train_df.groupby("binned_sum_hist_trans")[target_col].mean()
plt.figure()
sns.boxplot(x="binned_sum_new_trans", y='target', data=train_h, showfliers=False)
plt.xticks(rotation='vertical')
plt.xlabel('binned_sum_hist_trans', fontsize=12)
plt.ylabel('Loyalty score', fontsize=12)
plt.title("Sum of new transaction value (Binned) distribution")
plt.show()
plt.tight_layout()

<IPython.core.display.Javascript object>

### Mean transaction dependance 

In [320]:
bins = np.percentile(train_h["mean_new_trans"], range(0,101,10))
train_h['binned_mean_new_trans'] = pd.cut(train_h['mean_new_trans'], bins)
#cnt_srs = train_df.groupby("binned_mean_hist_trans")[target_col].mean()

plt.figure()
sns.boxplot(x="binned_mean_new_trans", y='target', data=train_h, showfliers=False)
plt.xticks(rotation='vertical')
plt.xlabel('Binned Mean new Transactions', fontsize=12)
plt.ylabel('Loyalty score', fontsize=12)
plt.title("Mean of new transaction value (Binned) distribution")
plt.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

### Extract years and month

In [325]:
train_h["first_active_month"]=pd.to_datetime(train_h["first_active_month"])
test_h["first_active_month"]=pd.to_datetime(test_h["first_active_month"])

train_h["year"]=train_h["first_active_month"].dt.year
train_h["month"]=train_h["first_active_month"].dt.month

test_h["year"]=test_h["first_active_month"].dt.year
test_h["month"]=test_h["first_active_month"].dt.month

### Baseline gradient boosted model

In [411]:
train_col=['feature_1', 'feature_2', 'feature_3',
       'num_hist_transactions', 'sum_hist_trans', 'mean_hist_trans',
       'std_hist_trans', 'min_hist_trans', 'max_hist_trans', 'year',
       'month','sum_new_trans', 'mean_new_trans',
       'std_new_trans', 'min_new_trans', 'max_new_trans']

In [412]:
from xgboost.sklearn import XGBRegressor

In [413]:
xgb=XGBRegressor()

In [414]:
xgb.fit(train_h[train_col],train_h.target)
y_pred_xgb=xgb.predict(train_h[train_col])
print("RMSE of train with xgboost")
print(rmse(y_pred_xgb,train.target))

lgr.fit(train_h[train_col],train_h.target)
y_pred_lgr=lgr.predict(train_h[train_col])
print("RMSE of train with linear regression")
print(rmse(y_pred_lgr,train.target))

RMSE of train with xgboost
3.794010454945805
RMSE of train with linear regression
3.8308271993660026


In [420]:
from xgboost import plot_importance
plot_importance(xgb)
plt.tight_layout()
plt.title("Feature importance for xgboost")

<IPython.core.display.Javascript object>

<matplotlib.text.Text at 0x132ca1630>

In [354]:
y_pred=xgb.predict(test_h[train_col])
save_csv("xgboost_hist_new.csv",y_pred)

file saved: first five rows


Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,-0.65341
1,C_ID_130fd0cbdd,-0.912396
2,C_ID_b709037bc5,-0.861913
3,C_ID_d27d835a9f,-0.203816
4,C_ID_2b5e3df5c2,-1.332566


### Year and month is actually category let's change it to one hot matrix

In [405]:
train_hd=pd.concat([train_h,train_year,train_month],axis=1)
test_hd=pd.concat([test_h,test_year,test_month],axis=1)

In [406]:
train_col.remove('year')
train_col.remove('month')

In [407]:
train_col_one=train_col+list(train_hd.columns[-20::])

In [408]:
print(train_col_one)

['feature_1', 'feature_2', 'feature_3', 'num_hist_transactions', 'sum_hist_trans', 'mean_hist_trans', 'std_hist_trans', 'min_hist_trans', 'max_hist_trans', 'sum_new_trans', 'mean_new_trans', 'std_new_trans', 'min_new_trans', 'max_new_trans', 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]


In [409]:
xgb.fit(train_hd[train_col_one],train_h.target)
y_pred_xgb=xgb.predict(train_hd[train_col_one])
print("RMSE of train with xgboost")
print(rmse(y_pred_xgb,train.target))

lgr.fit(train_hd[train_col_one],train_hd.target)
y_pred_lgr=lgr.predict(train_hd[train_col_one])
print("RMSE of train with linear regression")
print(rmse(y_pred_lgr,train.target))

RMSE of train with xgboost
3.795880238800937
RMSE of train with linear regression
3.8293405227573887
