In [8]:
import pandas as pd
import numpy as np
import gc
import joblib
import datetime
import itertools
import os.path as path
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, NMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [9]:
import matplotlib.font_manager as font_manager
import matplotlib.pyplot as plt
%matplotlib inline

font_dirs = ['/home/workspace/user-workspace/font']
font_files = font_manager.findSystemFonts(fontpaths=font_dirs)

for font_file in font_files:
    font_manager.fontManager.addfont(font_file)
    
plt.rcParams['font.family'] = 'NanumGothic'

In [10]:
train_path = '/home/workspace/user-workspace/slim_train.parquet'
test_path ='/home/workspace/user-workspace/slim_test.parquet'
encoder = '/home/workspace/user-workspace/cat_encoder.json'
decoder = '/home/workspace/user-workspace/inverse_cat_encoder.json'
data_dir = '/home/workspace/user-workspace/junheon/data/task150/'

In [11]:
negative_ratio = 1
bagging_size = 5

In [12]:
train_df = pd.read_parquet(train_path).reset_index().rename(columns={"index": "id"})

In [13]:
test_df = pd.read_parquet(test_path).reset_index().rename(columns={"index": "id"})

In [14]:
train_df.head()

Unnamed: 0,id,TRD_NO,REQ_DD,CP_CD,CP_NM,GODS_NM,PAYR_SEQ,MPHN_NO,COMMC_CLF,AC_PAY_AMT,...,MAX_NPAY_CNT_24M,TRD_CNT_6M,REAL_TRD_CNT_6M,NIGHT_TRD_RT_6M,AVG_AMT_6M,MAX_LMT_3M_RT,NPAY_CNT_24M,NPAY_CNT_12MNTS,NPAY_AMT_60M,target
0,0,9iSJiZ+6F/ojR81Swb0CU5oBNWIuJuSsmXsb7aPoWro=,20190701,270,308,394324,945077,940112,0,49900,...,0,5,1,0.6429,111716.0,0.499,1,7,0.0,0
1,1,UStyI9p3TkoNswhaARv+Dzznl7NW4o49XlSEv/jy3/U=,20190701,837,173,735708,1850868,919223,1,14300,...,0,5,5,0.4444,30776.0,0.21767,0,1,0.0,0
2,2,PgOykMEKX3so4zIpsNcU+zNt+Nj4VQdwjDB+NlVIJN4=,20190702,785,539,499519,415916,71968,2,440000,...,0,6,0,0.1111,177392.0,1.0,0,2,0.0,0
3,3,9ngELbCK8cqbPY53oe0eUY+tPvTR/OU7KIMg0pDdh4w=,20190701,1198,7,272349,2477403,568377,1,9907,...,0,6,6,0.0645,123318.0,0.52973,0,0,0.0,0
4,4,sT/zlLzq7AK9QkTSH51L5+LSNY+zkfwrr7PlR4EOMtI=,20190702,1198,7,674245,3452051,190416,0,47600,...,0,3,3,0.1429,90333.0,0.331,1,5,0.0,0


# Create Indices

In [9]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-09 09:31:19.311746


In [10]:
positive = train_df[train_df['target']==1]
negative = train_df[train_df['target']==0]

In [11]:
print(len(positive))
print(len(negative))

274713
7591835


In [12]:
for seed in range(bagging_size):
    negative_sample = negative.sample(n=(len(positive)*negative_ratio), random_state=seed)
    id_list = positive.id.tolist() + negative_sample.id.tolist()
    joblib.dump(id_list, f"{data_dir}indices_{seed}.pkl")

# Feture Engineering

## Basic features

In [13]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-07 14:17:35.867743


In [14]:
FILE_NAME = "basic_feature"

In [15]:
basic_features = {
    "id": "uint32",
    "AC_PAY_AMT": "uint32",
    "AGE": "uint8",
    "SMS_RE_SND_CNT": "uint8",
    "ACUM_RCPT_AMT": "int32",
    "MAX_NPAY_CNT_24M": "uint8",
    "TRD_CNT_6M": "uint8",
    "REAL_TRD_CNT_6M": "uint8",
    "NPAY_CNT_24M": "uint8",
    "NPAY_CNT_12MNTS": "uint8",
    "MM_LMT_AMT": "float32",
    "REMD_LMT_AMT": "float32",
    "NPAY_AMT_24M": "float32",
    "NIGHT_TRD_RT_6M": "float32",
    "AVG_AMT_6M": "float32",
    "MAX_LMT_3M_RT": "float32",
    "NPAY_AMT_60M": "float32"
}

In [16]:
category_features = [
    "COMMC_CLF", "NPAY_YN", "PAY_MTHD_CD", "ARS_AUTHTI_YN", "GNDR", "FOREI_YN",  "AUTHTI_CLF_FLG", 
    "SVC_CLF_NM", "CP_M_CLF_NM", "CP_S_CLF_NM" 
]

In [17]:
df_train = train_df[list(basic_features.keys()) + category_features + ["target"]].astype(basic_features)
df_test = test_df[list(basic_features.keys()) + category_features].astype(basic_features)

In [18]:
df_train['NPAY_YN'] = [-99 if x==2 else x for x in df_train['NPAY_YN']]
df_test['NPAY_YN'] = [-99 if x==2 else x for x in df_test['NPAY_YN']]

In [19]:
df_train['PAY_MTHD_CD'] = [-99 if x==0 else x for x in df_train['PAY_MTHD_CD']]
df_test['PAY_MTHD_CD'] = [-99 if x==0 else x for x in df_test['PAY_MTHD_CD']]

In [20]:
df_train['MM_LMT_AMT'] = [1000000 if np.isnan(x) else x for x in df_train['MM_LMT_AMT']]
df_test['MM_LMT_AMT'] = [1000000 if np.isnan(x) else x for x in df_test['MM_LMT_AMT']]

In [21]:
df_train['REMD_LMT_AMT'] = [1000000 if np.isnan(x) else x for x in df_train['REMD_LMT_AMT']]
df_test['REMD_LMT_AMT'] = [1000000 if np.isnan(x) else x for x in df_test['REMD_LMT_AMT']]

In [22]:
df_train['ARS_AUTHTI_YN'] = [-99 if x==2 else x for x in df_train['ARS_AUTHTI_YN']]
df_test['ARS_AUTHTI_YN'] = [-99 if x==2 else x for x in df_test['ARS_AUTHTI_YN']]

In [23]:
df_train['CP_M_CLF_NM'] = [-99 if x==5 else x for x in df_train['CP_M_CLF_NM']]
df_test['CP_M_CLF_NM'] = [-99 if x==5 else x for x in df_test['CP_M_CLF_NM']]

In [24]:
df_train['CP_S_CLF_NM'] = [-99 if x==34 else x for x in df_train['CP_S_CLF_NM']]
df_test['CP_S_CLF_NM'] = [-99 if x==34 else x for x in df_test['CP_S_CLF_NM']]

In [25]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [26]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [27]:
print(f"Ends at {datetime.datetime.now()}")

Ends at 2020-11-07 14:18:49.328103


## High Cardinality Basic features

In [28]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-07 14:18:49.337561


In [29]:
FILE_NAME = "high_cardi_basic_feature"

In [30]:
basic_features = ["id", "CP_CD", "GODS_NM", "PAYR_SEQ", "PAYR_IP"]

In [31]:
df_train = train_df[basic_features]
df_test = test_df[basic_features]

In [32]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [33]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [34]:
print(f"Ends at {datetime.datetime.now()}")

Ends at 2020-11-07 14:18:54.871364


## month, day (hour은 없음)

In [35]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-07 14:18:54.880416


In [36]:
FILE_NAME = "month_day"

In [37]:
df_train = train_df[['id', 'REQ_DD']].astype({"REQ_DD": "str"})
df_test = test_df[['id', 'REQ_DD']].astype({"REQ_DD": "str"})

In [38]:
df_train['month'] = df_train.REQ_DD.astype("str").str[4:6].astype("uint8")
df_train['day'] = df_train.REQ_DD.astype("str").str[6:8].astype("uint8")
df_test['month'] = df_test.REQ_DD.astype("str").str[4:6].astype("uint8")
df_test['day'] = df_test.REQ_DD.astype("str").str[6:8].astype("uint8")

In [39]:
df_train = df_train[['id', 'month', 'day']]
df_test = df_test[['id', 'month', 'day']]

In [40]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [41]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [42]:
print(f"Ends at {datetime.datetime.now()}")

Ends at 2020-11-07 14:19:21.921631


## 거래금액 (AC_PAY_AMT) 49000원, 11000원

49000원: 롤 rp 충전 최고 금액

11000원: 아프리카 별풍선 100

In [43]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-07 14:19:21.926522


In [44]:
FILE_NAME = "pay_amt"

In [45]:
df_train = train_df[['id', 'AC_PAY_AMT']]
df_test = test_df[['id', 'AC_PAY_AMT']]

In [46]:
df_train["is_49900_PAY_AMT"] = (df_train['AC_PAY_AMT']==49900)
df_train["is_11000_PAY_AMT"] = (df_train['AC_PAY_AMT']==11000)
df_train["is_11000s_PAY_AMT"] = (df_train['AC_PAY_AMT']%11000==0)
df_test["is_49900_PAY_AMT"] = (df_test['AC_PAY_AMT']==49900)
df_test["is_11000_PAY_AMT"] = (df_test['AC_PAY_AMT']==11000)
df_test["is_11000s_PAY_AMT"] = (df_test['AC_PAY_AMT']%11000==0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["is_49900_PAY_AMT"] = (df_train['AC_PAY_AMT']==49900)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["is_11000_PAY_AMT"] = (df_train['AC_PAY_AMT']==11000)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["is_11000s_PAY_AMT"] = (df_train['AC_PAY_AMT']%11000==0)
A value is tr

In [47]:
df_train = df_train[['id', 'is_49900_PAY_AMT', 'is_11000_PAY_AMT', 'is_11000s_PAY_AMT']]
df_test = df_test[['id', 'is_49900_PAY_AMT', 'is_11000_PAY_AMT', 'is_11000s_PAY_AMT']]

In [48]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [49]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [50]:
print(f"Ends at {datetime.datetime.now()}")

Ends at 2020-11-07 14:19:26.036267


In [28]:
pd.read_parquet(f"{data_dir}month_day_0.parquet")

Unnamed: 0,id,month,day
7,7,7,2
62,62,7,2
64,64,7,1
65,65,7,1
66,66,7,1
...,...,...,...
3849511,3849511,8,29
7782626,7782626,10,31
6626310,6626310,10,12
820200,820200,7,10


## PCA Features
"CP_CD", "GODS_NM", "PAYR_SEQ", "MPHN_NO", "PAYR_IP"

LDA는 리소스 부족으로 

### PCA 5

In [62]:
column_prefix = "PCA5"

In [63]:
FILE_NAME = "PCA5"

In [64]:
def get_column_pairs(feature_list):
    return [(col1, col2) for col1, col2 in itertools.product(feature_list, repeat=2) if col1!=col2]

In [65]:
def get_documents(col1, col2):
    document_path = f"{data_dir}word_list_{col1}_{col2}.pkl"
    if not path.isfile(document_path):
        df = train_df[[col1, col2]]
        document_list_size = df[col1].max() + 1
        documents = [[] for _ in range(document_list_size)]
        for document, word in zip(df[col1], df[col2]):
            documents[document].append(word)
        documents = [' '.join(map(str, words)) for words in documents]
        joblib.dump(documents, document_path)
        return documents
    with open(document_path, 'rb') as f:
        documents = joblib.load(f)
    return documents

In [66]:
def get_document_term_matrix(col1, col2):
    documents = get_documents(col1, col2)
    vectorizer = TfidfVectorizer(min_df=2, dtype=np.float32)
    return vectorizer.fit_transform(documents)

In [72]:
def get_latent_vector(col1, col2, n_topics):
    latent_path = f"{data_dir}{column_prefix}_{col1}_{col2}.pkl"
    if not path.isfile(latent_path):
        document_term_matrix = get_document_term_matrix(col1, col2)
        decomposer = TruncatedSVD(n_components=n_topics, random_state=77)
        latent_vector = decomposer.fit_transform(document_term_matrix)
        joblib.dump(latent_vector, latent_path)
        return latent_vector
    with open(latent_path, 'rb') as f:
        latent_vector = joblib.load(f)
    return latent_vector

In [74]:
def get_latent_df(column_pairs, df, n_topics):
    latent_vectors = []
    for pair in column_pairs:
        latent_vector = get_latent_vector(pair[0], pair[1], n_topics)
        latent_vectors.append(latent_vector.astype(np.float32))
    n_columns = n_topics * len(column_pairs)
    features = np.zeros(shape=(len(df), n_columns), dtype=np.float32)
    columns = []
    
    for i, pair in enumerate(column_pairs):
        offset = i * n_topics
        for j in range(n_topics):
            columns.append(f"{column_prefix}-{pair[0]}-{pair[1]}-{j}")
        
        for j, value in enumerate(df[pair[0]]):
            features[j, offset:offset + n_topics] = latent_vectors[i][value]
            
    result_df = pd.DataFrame(data=features, columns=columns, index=df.id)
    
    return result_df    

In [69]:
basic_features = ["id", "CP_CD", "GODS_NM", "PAYR_SEQ"]

In [70]:
column_pair = get_column_pairs(basic_features[1:])

In [75]:
latent_df = get_latent_df(column_pair, train_df[basic_features], 5)
latent_df.to_parquet(f"{data_dir}{FILE_NAME}.parquet")

In [76]:
df_train = latent_df.reset_index()

In [77]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [78]:
test_df = test_df[['id', 'CP_CD', 'GODS_NM', 'PAYR_SEQ']]
for pair in column_pair:
    columns = [f"{column_prefix}-{pair[0]}-{pair[1]}-{i}" for i in range(5)]
    temp_df = latent_df[columns].reset_index()
    temp_df[pair[0]] = train_df[pair[0]]
    temp_df[pair[1]] = train_df[pair[1]]
    temp_df = temp_df.drop_duplicates(subset=[pair[0], pair[1]])
    temp_df = temp_df.drop(columns=['id'])
    test_df = test_df.merge(temp_df, on=[pair[0], pair[1]], how='left')

In [79]:
test_df.drop(columns=['CP_CD', 'GODS_NM', 'PAYR_SEQ']).to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [80]:
print(f"Ends at {datetime.datetime.now()}")

Ends at 2020-11-09 11:18:02.586315


## NMF Features
"CP_CD", "GODS_NM", "PAYR_SEQ", "MPHN_NO", "PAYR_IP"

LDA는 리소스 부족으로 

### NMF 5

In [25]:
FILE_NAME = "NMF5"

In [23]:
column_prefix = "NMF5"

In [8]:
def get_column_pairs(feature_list):
    return [(col1, col2) for col1, col2 in itertools.product(feature_list, repeat=2) if col1!=col2]

In [9]:
def get_documents(col1, col2):
    document_path = f"{data_dir}word_list_{col1}_{col2}.pkl"
    if not path.isfile(document_path):
        df = train_df[[col1, col2]]
        document_list_size = df[col1].max() + 1
        documents = [[] for _ in range(document_list_size)]
        for document, word in zip(df[col1], df[col2]):
            documents[document].append(word)
        documents = [' '.join(map(str, words)) for words in documents]
        joblib.dump(documents, document_path)
        return documents
    with open(document_path, 'rb') as f:
        documents = joblib.load(f)
    return documents

In [10]:
def get_document_term_matrix(col1, col2):
    documents = get_documents(col1, col2)
    vectorizer = TfidfVectorizer(min_df=2, dtype=np.float32)
    return vectorizer.fit_transform(documents)

In [11]:
def get_latent_vector(col1, col2, n_topics):
    latent_path = f"{data_dir}{column_prefix}_{col1}_{col2}.pkl"
    if not path.isfile(latent_path):
        document_term_matrix = get_document_term_matrix(col1, col2)
        decomposer = NMF(n_components=n_topics, random_state=77)
        latent_vector = decomposer.fit_transform(document_term_matrix)
        joblib.dump(latent_vector, latent_path)
        return latent_vector
    with open(latent_path, 'rb') as f:
        latent_vector = joblib.load(f)
    return latent_vector

In [16]:
def get_latent_df(column_pairs, df, n_topics):
    latent_vectors = []
    for pair in column_pairs:
        latent_vector = get_latent_vector(pair[0], pair[1], n_topics)
        latent_vectors.append(latent_vector.astype(np.float32))
    n_columns = n_topics * len(column_pairs)
    features = np.zeros(shape=(len(df), n_columns), dtype=np.float32)
    columns = []
    
    for i, pair in enumerate(column_pairs):
        offset = i * n_topics
        for j in range(n_topics):
            columns.append(f"{column_prefix}-{pair[0]}-{pair[1]}-{j}")
        
        for j, value in enumerate(df[pair[0]]):
            features[j, offset:offset + n_topics] = latent_vectors[i][value]
            
    result_df = pd.DataFrame(data=features, columns=columns, index=df.id)
    
    return result_df    

In [30]:
basic_features = ["id", "CP_CD", "GODS_NM", "PAYR_SEQ"]

In [31]:
column_pair = get_column_pairs(basic_features[1:])

In [19]:
latent_df = get_latent_df(column_pair, train_df[basic_features], 5)
latent_df.to_parquet(f"{data_dir}{FILE_NAME}.parquet")

In [41]:
df_train = latent_df.reset_index()

In [43]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [57]:
test_df = test_df[['id', 'CP_CD', 'GODS_NM', 'PAYR_SEQ']]
for pair in column_pair:
    columns = [f"{column_prefix}-{pair[0]}-{pair[1]}-{i}" for i in range(5)]
    temp_df = latent_df[columns].reset_index()
    temp_df[pair[0]] = train_df[pair[0]]
    temp_df[pair[1]] = train_df[pair[1]]
    temp_df = temp_df.drop_duplicates(subset=[pair[0], pair[1]])
    temp_df = temp_df.drop(columns=['id'])
    test_df = test_df.merge(temp_df, on=[pair[0], pair[1]], how='left')

In [60]:
test_df.drop(columns=['CP_CD', 'GODS_NM', 'PAYR_SEQ']).to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [27]:
print(f"Ends at {datetime.datetime.now()}")

Ends at 2020-11-07 14:18:49.328103


## Age 구간 

categorical feature

In [7]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-09 11:20:30.512780


In [8]:
FILE_NAME = "age_bin"

In [9]:
df_train = train_df[['id', 'AGE']]
df_test = test_df[['id', 'AGE']]

In [10]:
df_train['AGE_bin'] = [0 if x<25 else x for x in df_train['AGE']]
df_train['AGE_bin'] = [1 if (x>=25 & x<30) else x for x in df_train['AGE']]
df_train['AGE_bin'] = [2 if (x>=30 & x<35) else x for x in df_train['AGE']]
df_train['AGE_bin'] = [3 if (x>=35 & x<40) else x for x in df_train['AGE']]
df_train['AGE_bin'] = [4 if (x>=40 & x<50) else x for x in df_train['AGE']]
df_train['AGE_bin'] = [5 if (x>=50 & x<60) else x for x in df_train['AGE']]
df_train['AGE_bin'] = [6 if x>=60 else x for x in df_train['AGE']]

df_test['AGE_bin'] = [0 if x<25 else x for x in df_test['AGE']]
df_test['AGE_bin'] = [1 if (x>=25 & x<30) else x for x in df_test['AGE']]
df_test['AGE_bin'] = [2 if (x>=30 & x<35) else x for x in df_test['AGE']]
df_test['AGE_bin'] = [3 if (x>=35 & x<40) else x for x in df_test['AGE']]
df_test['AGE_bin'] = [4 if (x>=40 & x<50) else x for x in df_test['AGE']]
df_test['AGE_bin'] = [5 if (x>=50 & x<60) else x for x in df_test['AGE']]
df_test['AGE_bin'] = [6 if x>=60 else x for x in df_test['AGE']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['AGE_bin'] = [0 if x<25 else x for x in df_train['AGE']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['AGE_bin'] = [1 if (x>=25 & x<30) else x for x in df_train['AGE']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['AGE_bin'] = [2 if (x>=30 & x<35) else x for x in df_t

In [None]:
df_train = df_train[['id', 'AGE_bin']]
df_test = df_test[['id', 'AGE_bin']]

In [11]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [12]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [13]:
print(f"Ends at {datetime.datetime.now()}")

Ends at 2020-11-09 11:21:16.070979


## phone count

In [18]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-09 11:21:25.117400


In [19]:
FILE_NAME = "phone_cnt_wrt_seq"

In [20]:
df_train = train_df[['id', 'MPHN_NO', 'PAYR_SEQ']]
df_test = test_df[['id', 'PAYR_SEQ']]

In [21]:
count_df = df_train[['MPHN_NO', 'PAYR_SEQ']].groupby(by=["PAYR_SEQ"]).agg(['nunique']).reset_index()
count_df.columns = ["PAYR_SEQ", "phone_cnt_wrt_seq"]
df_train = df_train.merge(count_df, on=['PAYR_SEQ'], how="left")[['id', 'phone_cnt_wrt_seq']]
df_test = df_test.merge(count_df, on=['PAYR_SEQ'], how="left")[['id', 'phone_cnt_wrt_seq']]

In [22]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [23]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [24]:
print(f"Ends at {datetime.datetime.now()}")

Ends at 2020-11-09 11:21:42.805007


## seq count

In [25]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-09 11:21:53.409736


In [26]:
FILE_NAME = "seq_count"

In [27]:
df_train = train_df[['id', 'PAYR_SEQ']]
df_test = test_df[['id', 'PAYR_SEQ']]

In [28]:
count_df = df_train[['id', 'PAYR_SEQ']].groupby(by=["PAYR_SEQ"]).count().reset_index()
count_df.columns = ["PAYR_SEQ", "seq_count"]
df_train = df_train.merge(count_df, on=['PAYR_SEQ'], how="left")[['id', 'seq_count']]
df_test = df_test.merge(count_df, on=['PAYR_SEQ'], how="left")[['id', 'seq_count']]

In [29]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [30]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [31]:
print(f"Ends at {datetime.datetime.now()}")

Ends at 2020-11-09 11:22:06.456821


# is_targeted

In [15]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-09 11:50:09.976364


In [16]:
FILE_NAME = "is_targeted"

In [17]:
df_train = train_df[['id', 'MPHN_NO', 'target']]
df_train['cumsum'] = df_train[['MPHN_NO', 'target']].groupby(['MPHN_NO'])['target'].cumsum()
df_train['is_targeted'] = [0 if x<=1 else 1 for x in df_train['cumsum']]

is_targeted = df_train[['MPHN_NO', 'is_targeted']].drop_duplicates(subset=['MPHN_NO'], keep='last')
df_test = test_df[['id', 'MPHN_NO']].merge(is_targeted, on=['MPHN_NO'], how='left')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['cumsum'] = df_train[['MPHN_NO', 'target']].groupby(['MPHN_NO'])['target'].cumsum()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['is_targeted'] = [0 if x<=1 else 1 for x in df_train['cumsum']]


In [18]:
df_train = df_train[['id', 'is_targeted']]
df_test = df_test[['id', 'is_targeted']]

In [19]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [20]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [21]:
print(f"Ends at {datetime.datetime.now()}")

Ends at 2020-11-09 11:50:30.315963
