In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
import datetime
# Show which files are present in ../input; the CSVs read later in this notebook come from there.
print(os.listdir("../input"))
# Any results you write to the current directory are saved as output.

In [None]:
# Columns read from the two transaction tables (passed as `usecols`, so all
# other transaction columns are skipped at load time).
features = [
    'card_id',
    'purchase_date',
    'subsector_id',
    'merchant_id',
    'merchant_category_id',
    'state_id',
    'city_id',
    'purchase_amount',
]

# Card-level train/test tables are read in full.
df_train = pd.read_csv('../input/train.csv')
df_test = pd.read_csv('../input/test.csv')

# Transaction-level tables, restricted to the columns listed above.
df_hist_trans = pd.read_csv(
    '../input/historical_transactions.csv',
    usecols=features,
)
df_new_merchant_trans = pd.read_csv(
    '../input/new_merchant_transactions.csv',
    usecols=features,
)

In [None]:
# Expand `purchase_date` into calendar features on both transaction tables,
# then drop the raw timestamp column. The frames are modified in place, so
# this cell cannot be re-run without reloading the data first.
for df in [df_hist_trans, df_new_merchant_trans]:
    purchase_ts = pd.to_datetime(df['purchase_date'])
    df['year'] = purchase_ts.dt.year
    # `Series.dt.weekofyear` was removed in pandas 2.0; `isocalendar().week`
    # is the replacement. It returns a nullable UInt32 column, so cast to a
    # plain float column (missing timestamps, if any, become NaN).
    df['weekofyear'] = purchase_ts.dt.isocalendar().week.astype('float64')
    df['month'] = purchase_ts.dt.month
    df['dayofweek'] = purchase_ts.dt.dayofweek
    # weekday is 0 for Monday, so values >= 5 are Saturday and Sunday.
    df['weekend'] = (purchase_ts.dt.weekday >= 5).astype(int)
    df['hour'] = purchase_ts.dt.hour
    del df['purchase_date']

In [None]:
def get_new_columns(name,aggs):
    """Build flat column names of the form ``<name>_<column>_<aggregation>``.

    Parameters
    ----------
    name : str
        Prefix placed in front of every generated column name.
    aggs : dict
        Maps a column name to the list of aggregation names applied to it.

    Returns
    -------
    list of str
        One name per (column, aggregation) pair, in the dict's iteration
        order and, within a column, in the order of its aggregation list.
    """
    # Equivalent to a nested comprehension: outer loop over columns,
    # inner loop over that column's aggregations.
    flat_names = []
    for column, functions in aggs.items():
        for function_name in functions:
            flat_names.append('_'.join([name, column, function_name]))
    return flat_names

In [None]:
import gc
import time

# Per-card aggregation spec. It is the same for both transaction tables, so
# it is built once: distinct-value counts for the calendar / id columns and
# sum + mean of the purchase amount.
aggs = {
    col: ['nunique']
    for col in ['month', 'hour', 'weekofyear', 'dayofweek', 'year',
                'subsector_id', 'merchant_id', 'merchant_category_id',
                'state_id', 'city_id']
}
aggs['purchase_amount'] = ['sum', 'mean']

# Each table is paired explicitly with its column prefix rather than relying
# on a position counter. New-merchant transactions are merged first, then the
# historical ones, so the resulting column order is unchanged.
for prefix, df in [('new_hist', df_new_merchant_trans), ('hist', df_hist_trans)]:
    new_columns = get_new_columns(prefix, aggs)
    df_hist_trans_group = df.groupby('card_id').agg(aggs)
    # Flatten the (column, aggregation) header into prefixed flat names.
    df_hist_trans_group.columns = new_columns
    df_hist_trans_group.reset_index(drop=False, inplace=True)
    # Left merge keeps every train/test card; cards that are absent from the
    # transaction table get NaN in the new columns.
    df_train = df_train.merge(df_hist_trans_group, on='card_id', how='left')
    df_test = df_test.merge(df_hist_trans_group, on='card_id', how='left')
    # Drop the intermediate frame before the next (large) groupby.
    del df_hist_trans_group
    gc.collect()

In [None]:
# Derive calendar features and card age from `first_active_month` on both the
# train and test frames (modified in place).
for df in [df_train, df_test]:
    df['first_active_month'] = pd.to_datetime(df['first_active_month'])
    df['dayofweek'] = df['first_active_month'].dt.dayofweek
    # `Series.dt.weekofyear` was removed in pandas 2.0; `isocalendar().week`
    # is the replacement. It returns a nullable UInt32 column, so cast to a
    # plain float column (a missing date, if any, becomes NaN).
    df['weekofyear'] = df['first_active_month'].dt.isocalendar().week.astype('float64')
    df['month'] = df['first_active_month'].dt.month
    # Days between activation and the fixed reference date 2018-02-01.
    df['elapsed_time'] = (datetime.datetime(2018, 2, 1) - df['first_active_month']).dt.days

In [None]:
# Columns that must not be used as model inputs: the card identifier, the raw
# activation date, and the label columns.
exclude_features = ['card_id', 'first_active_month', 'target', 'outliers']

# Every remaining column of the training frame, in its existing order.
df_train_columns = [
    column_name
    for column_name in df_train.columns
    if column_name not in exclude_features
]
target = df_train['target']

In [None]:
# Number of columns in the training frame.
df_train.shape[1]

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Replace every missing value with 0 in both frames, in place.
for frame in (df_train, df_test):
    frame.fillna(0, inplace=True)

# Display a single test row as a spot check.
df_test.iloc[1000]

In [None]:
from sklearn import preprocessing

# Scale each feature using the min/max learned from the training set only.
min_max_scaler = preprocessing.MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(df_train[df_train_columns])
# The test set must go through the scaler already fitted on train: calling
# `fit_transform` here would re-fit on the test distribution and put train and
# test in different feature spaces, which would distort the nearest-neighbour
# distances computed between them later. Test values outside the training
# range therefore map outside [0, 1].
X_test_minmax = min_max_scaler.transform(df_test[df_train_columns])

In [None]:
# Wrap the scaled arrays back into frames and re-attach the identifier (and,
# for train, the label) from the unscaled frames.
train = pd.DataFrame(X_train_minmax, columns=df_train_columns).assign(
    card_id=df_train['card_id'],
    target=df_train['target'],
)
test = pd.DataFrame(X_test_minmax, columns=df_train_columns).assign(
    card_id=df_test['card_id'],
)
train.head()

In [None]:
from sklearn.neighbors import NearestNeighbors

# Index the scaled training features for neighbour look-ups
# (500 neighbours per query, 4 parallel jobs).
knn = NearestNeighbors(n_neighbors=500, n_jobs=4).fit(train[df_train_columns])
knn

In [None]:
# Neighbour indices (positions in the fitted training matrix) for every row.
#
# For the training rows, the query matrix is deliberately omitted: scikit-learn
# then returns the neighbours of each fitted point WITHOUT counting the point
# as its own neighbour. Passing the training matrix explicitly would make each
# row its own nearest neighbour (distance 0), so the row's own target would
# leak into the neighbour-target mean computed from these indices.
train_neigh = knn.kneighbors(return_distance=False)
# Test rows are not part of the index, so they are queried normally.
test_neigh = knn.kneighbors(test[df_train_columns], return_distance=False)
test_neigh[1000]

In [None]:
# Spot check: the target value stored in the first row of `train`.
train.iloc[0].target

In [None]:
# One slot per training row, initialised to zero.
train_target = np.zeros(df_train.shape[0])

In [None]:
# Mean target of each training row's neighbours.
#
# The neighbour indices are positions into `train`, so the targets are pulled
# from a plain array with fancy indexing instead of one `train.iloc[...]` row
# look-up per (row, neighbour) pair. Accumulating one neighbour rank at a time
# keeps memory at O(rows) and adds the values in the same order as a per-row
# Python loop would.
target_values = train['target'].values
n_rows, n_neighbors = train_neigh.shape
neighbor_sum = np.zeros(n_rows)
for rank in range(n_neighbors):
    neighbor_sum += target_values[train_neigh[:, rank]]
train_target = neighbor_sum / n_neighbors

# Single unnamed column (label 0) holding the means, plus the card id.
df_train_target = pd.DataFrame(train_target)
df_train_target['card_id'] = df_train.card_id

In [None]:
# Mean training target of each test row's neighbours.
#
# `test_neigh` holds positions into `train`, so the targets are pulled from a
# plain array with fancy indexing instead of one `train.iloc[...]` row look-up
# per (row, neighbour) pair. Accumulating one neighbour rank at a time keeps
# memory at O(rows) and adds the values in the same order as a per-row Python
# loop would.
target_values = train['target'].values
n_rows, n_neighbors = test_neigh.shape
neighbor_sum = np.zeros(n_rows)
for rank in range(n_neighbors):
    neighbor_sum += target_values[test_neigh[:, rank]]
test_target = neighbor_sum / n_neighbors

# Single unnamed column (label 0) holding the means, plus the card id.
df_test_target = pd.DataFrame(test_target)
df_test_target['card_id'] = df_test.card_id

In [None]:
# Name the neighbour-mean column on the frames that are actually saved below.
# The rename previously targeted `df_train_50_neastneighbor` /
# `df_test_50_neastneighbor`, which are not defined anywhere in this notebook,
# so the cell failed with NameError on a fresh kernel.
# NOTE(review): the column and file names say "50" while the KNN model is
# built with n_neighbors=500; names are left unchanged in case downstream
# code depends on them. Confirm which value is intended.
df_train_target.columns = ['50nearstneighbor','card_id']
df_test_target.columns = ['50nearstneighbor','card_id']
df_train_target.to_pickle('train_target_50nearst_neighbor.pickle')
df_test_target.to_pickle('test_target_50nearst_neighbor.pickle')