In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing ,metrics
from sklearn.svm import SVR
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_validate, GridSearchCV
import xgboost as xgb
import lightgbm.sklearn as lgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
from sklearn.utils.testing import all_estimators
import sys
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
from scipy import sparse as sp
# Any results you write to the current directory are saved as output.
from scipy.special import lambertw
from scipy.stats import kurtosis, norm, rankdata, boxcox, zscore
from scipy.optimize import fmin  # TODO: Explore efficacy of other opt. methods
import gc
import psutil
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
class wrapped_logger():
    """Wrapper around a ``logging`` logger that stamps every message.

    Each message is prefixed with the seconds elapsed since this wrapper was
    created and the system-wide used memory in MiB (``psutil.virtual_memory().used``).
    Records of level DEBUG and above go to the console stream and to
    ``log_file_name``.

    ``getLogger(__name__)`` hands back the same logger object on every call, so
    creating the wrapper again (for example by re-running the notebook cell)
    would otherwise stack a second pair of handlers and duplicate every line.
    Handlers installed by an earlier wrapper are therefore removed and closed
    before the new ones are attached.

    Parameters
    ----------
    log_file_name : str
        Path of the log file that receives a copy of every record.
    """

    def __init__(self,log_file_name = 'debug.log'):
        from logging import getLogger, StreamHandler,DEBUG,Formatter, FileHandler
        import psutil
        import time
        self.time=time.time
        self.logger = getLogger(__name__)
        self.START_TIME=self.time()
        self.psutil=psutil

        # Drop handlers a previous wrapped_logger attached to this shared logger;
        # handlers added by anyone else are left untouched.
        for old_handler in list(self.logger.handlers):
            if getattr(old_handler, '_wrapped_logger_owned', False):
                self.logger.removeHandler(old_handler)
                old_handler.close()

        handler_format = Formatter('%(asctime)s -  %(name)s - %(levelname)s - %(message)s')

        handler = StreamHandler()
        file_handler = FileHandler(log_file_name)
        for new_handler in (handler, file_handler):
            new_handler.setFormatter(handler_format)
            new_handler.setLevel(DEBUG)
            # Marker used by the clean-up loop above on the next instantiation.
            new_handler._wrapped_logger_owned = True

        self.logger.setLevel(DEBUG)
        self.logger.addHandler(handler)
        self.logger.addHandler(file_handler)

    def info(self,message):
        """Log ``message`` at INFO level with the elapsed-time/memory prefix."""
        self.logger.info(self.__message(message))

    def debug(self,message):
        """Log ``message`` at DEBUG level with the elapsed-time/memory prefix."""
        self.logger.debug(self.__message(message))

    def warning(self,message):
        """Log ``message`` at WARNING level with the elapsed-time/memory prefix."""
        self.logger.warning(self.__message(message))

    def __message(self,message):
        """Return ``message`` prefixed with elapsed seconds and used memory (MiB)."""
        return f'{self.time()-self.START_TIME}s - mem usage:{self.psutil.virtual_memory().used/1024/1024} - {message}'

In [None]:
# Notebook-wide logger; with the default argument it also writes to 'debug.log'.
logger=wrapped_logger()
logger.info('start logging')


In [None]:
# Sampling fraction handed to read_csv(); the sampling code there is commented out,
# so this value currently has no effect on the loaded data.
FRAC = 0.01
# Directory of previously exported artefacts (SVD feature CSVs, model weights, fold ids).
MY_DATASET='../input/avito-demand-prediction-challenge-private-dataset'
# Directory of the competition's train.csv / test.csv.
DATASET='../input/avito-demand-prediction'
def read_csv(frac,fanc):
    """Return the already-loaded object ``fanc`` unchanged.

    ``frac`` is accepted but not used: the row-limiting and sampling steps that
    would consume it are commented out in the original, so callers always get
    back exactly what they passed in.
    """
    # Pure pass-through (no copy, no sampling).
    return fanc

In [None]:
# Load the raw competition data, parsing activation_date as a datetime column.
# read_csv here is the notebook's own pass-through wrapper; pd.read_csv does the actual parsing.
train = read_csv(FRAC,pd.read_csv(f'{DATASET}/train.csv', parse_dates=["activation_date"]))
test = read_csv(FRAC,pd.read_csv(f'{DATASET}/test.csv', parse_dates=["activation_date"]))

In [None]:
# Number of distinct values per training column.
train.nunique()

In [None]:
# Preview the first rows of the freshly loaded training frame.
train.head()

In [None]:
def fill_na(col_names,train,test,fill_what):
    """Replace missing values in the listed columns of both frames.

    Parameters
    ----------
    col_names : iterable of str
        Columns to process; each must exist in ``train`` and ``test``.
    train, test : pandas.DataFrame
        Frames to update. They are modified in place and also returned.
    fill_what : scalar
        Value written wherever a listed column is missing.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The same ``train`` and ``test`` objects that were passed in.
    """
    for col_name in col_names:
        # Assign the filled column back instead of calling
        # ``frame[col].fillna(..., inplace=True)``: that form works on a selected
        # column (chained assignment) and is not guaranteed to update the frame.
        train[col_name] = train[col_name].fillna(fill_what)
        test[col_name] = test[col_name].fillna(fill_what)
    return train, test

In [None]:
def get_len(col_names):
    """Add whitespace-token counts for the given text columns.

    For every ``col`` in ``col_names`` a new column ``f'{col}_len'`` is written
    to the notebook-level ``train`` and ``test`` frames (read from module
    globals, not passed in). Each value is ``len(value.split())``, so every
    cell of the source column must support ``.split()``.

    Returns
    -------
    tuple
        The global ``train`` and ``test`` frames.
    """
    def _token_count(text):
        return len(text.split())

    for source_col in col_names:
        target_col = f'{source_col}_len'
        # train is processed before test for each column.
        for frame in (train, test):
            frame[target_col] = frame[source_col].apply(_token_count)
    return train, test

In [None]:
# Summary (dtypes, non-null counts, memory) of the training frame.
train.info()

In [None]:
def get_zscore(train,test,col_names):
    """Standardise columns using statistics pooled over train and test.

    For each column the mean and the sample standard deviation (pandas default,
    ``ddof=1``; missing values are skipped) are computed over the train rows
    followed by the test rows, and the column is replaced by
    ``(value - mean) / std`` in both frames.

    Only the column being processed is concatenated, rather than both complete
    frames, so unrelated (e.g. text) columns are never copied. Results are
    written back by position, so the outcome does not depend on the frames'
    index labels.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames to update; modified in place and returned.
    col_names : iterable of str
        Numeric columns present in both frames.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The same ``train`` and ``test`` objects that were passed in.
    """
    n_train = train.shape[0]
    for col in col_names:
        pooled = pd.concat([train[col], test[col]], axis = 0)
        scaled = ((pooled - pooled.mean()) / pooled.std()).values
        # First n_train entries belong to train, the remainder to test.
        train[col] = scaled[:n_train]
        test[col] = scaled[n_train:]
    return train, test
        

In [None]:
# Columns whose missing entries are replaced by the literal string 'NaN'.
col_names_fillna = [
    'description',
    'title',
    "param_1", 
    "param_2", 
    "param_3",
    'city',
    "region",
    "parent_category_name", 
    "category_name", 
    "user_type",
    'user_id',
    'activation_date'
]
train,test=fill_na(col_names_fillna,train,test,'NaN')

# 1 when the 'image' cell holds a string, 0 otherwise (e.g. a missing value).
train['image_bool'] = train['image'].apply(lambda x: 1 if str == type(x) else 0)
test['image_bool'] = test['image'].apply(lambda x: 1 if str == type(x) else 0)

# Text columns for which a whitespace-token count '<col>_len' is added.
col_names_len = [
    'description',
    'title',
    "param_1", 
    "param_2", 
    "param_3"
]

train,test=get_len(col_names_len)

# NOTE: the name col_names_len is reused; from here on it lists the numeric
# columns that get z-scored (the new *_len counts plus price and item_seq_number).
col_names_len = [
    'description_len',
    'title_len',
    "param_1_len", 
    "param_2_len", 
    "param_3_len",
    'price',
    'item_seq_number'
]
train,test=get_zscore(train,test,col_names_len)

# Target as a 1-D NumPy array. `.values` replaces the deprecated `Series.ravel()`
# and yields the same array on old and new pandas versions.
y_train = train["deal_probability"].values

# Kept for the submission file written at the end of the notebook.
test_id = test["item_id"].values

# Remove identifiers / raw image reference from both frames and the target from train.
cols_to_drop = ["item_id", 'image']
train = train.drop(cols_to_drop + ["deal_probability"], axis = 1)
test = test.drop(cols_to_drop, axis = 1)


In [None]:
def get_svd(fit_data, transform_data, n_comp, col_name):
    """Fit a truncated SVD on one matrix and project another with it.

    Parameters
    ----------
    fit_data :
        Matrix the ``TruncatedSVD`` (``algorithm='arpack'``) is fitted on.
    transform_data :
        Matrix projected onto the fitted components.
    n_comp : int
        Number of components to keep.
    col_name : str
        Used to label the output columns ``svd_<col_name>_1 .. svd_<col_name>_<n_comp>``.

    Returns
    -------
    pandas.DataFrame
        Projection of ``transform_data`` with one column per component.
    """
    reducer = TruncatedSVD(n_components=n_comp, algorithm='arpack')
    reducer.fit(fit_data)
    projected = reducer.transform(transform_data)

    component_labels = ['svd_' + col_name + '_' + str(k) for k in range(1, n_comp + 1)]
    df_svd = pd.DataFrame(projected, columns=component_labels)

    # Release the fitted model before returning.
    del reducer
    gc.collect()
    return df_svd

def get_tfidf_svd(train, test, col_name):
    """Build 10 SVD components of a TF-IDF representation of a text column.

    A unigram ``TfidfVectorizer`` (at most 100000 features) is fitted on the
    train rows followed by the test rows of ``col_name``. One ``TruncatedSVD``
    (``algorithm='arpack'``) is fitted on that combined matrix and used to
    project the train part and the test part.

    The vectoriser output and the SVD fit are each computed once and shared
    between train and test; the combined TF-IDF matrix is split by row
    position instead of re-vectorising each frame, and the SVD is not refitted
    for the second projection.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing the text column ``col_name``.
    col_name : str
        Name of the text column; also used in the output column labels.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Train and test projections with columns
        ``svd_<col_name>_1 .. svd_<col_name>_10`` and a default integer index.
    """
    n_comp = 10
    n_train = train.shape[0]

    #get tfidf
    tfidf_vec = TfidfVectorizer(ngram_range=(1,1), max_features=100000)
    full_tfidf = tfidf_vec.fit_transform(pd.concat([train[col_name], test[col_name]], axis=0))

    #get svd (fitted once on train+test)
    svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
    svd_obj.fit(full_tfidf)
    svd_columns = ['svd_' + col_name + '_' + str(i + 1) for i in range(n_comp)]

    # Rows [0, n_train) of the combined matrix are train, the rest are test.
    train_svd = pd.DataFrame(svd_obj.transform(full_tfidf[:n_train]), columns=svd_columns)
    test_svd = pd.DataFrame(svd_obj.transform(full_tfidf[n_train:]), columns=svd_columns)

    #clean
    del full_tfidf, tfidf_vec, svd_obj
    gc.collect()

    return train_svd, test_svd

def engineering_tfidf(how,col_name):
    """Obtain TF-IDF/SVD features for ``col_name``, from cache or by computing.

    Parameters
    ----------
    how : str
        ``'read'``  - load ``train_<col>_svd.csv`` / ``test_<col>_svd.csv`` from ``MY_DATASET``.
        ``'write'`` - compute from the global ``train``/``test`` frames and also save both
        CSVs (without index) in the working directory.
        anything else - compute only.
    col_name : str
        Text column to featurise.

    Returns
    -------
    tuple
        ``(train_svd, test_svd)`` data frames.
    """
    if how == 'read':
        logger.info(f'Read {col_name}')
        train_path = f'{MY_DATASET}/train_{col_name}_svd.csv'
        test_path = f'{MY_DATASET}/test_{col_name}_svd.csv'
        cached_train = read_csv(FRAC,pd.read_csv(train_path))
        cached_test = read_csv(FRAC,pd.read_csv(test_path))
        return cached_train, cached_test

    # Both remaining modes compute the features from the global frames.
    train_svd, test_svd = get_tfidf_svd(train = train, test = test, col_name = col_name)
    if how == 'write':
        train_svd.to_csv(f'train_{col_name}_svd.csv',index=False)
        test_svd.to_csv(f'test_{col_name}_svd.csv',index=False)
    return train_svd, test_svd


In [None]:
# Load the cached SVD features of the title and description text ('read' mode).
train_title_svd, test_title_svd = engineering_tfidf('read','title')
train_description_svd, test_description_svd = engineering_tfidf('read','description')
# Append the SVD columns side by side; axis=1 concat aligns rows on the index labels.
train = pd.concat([train, train_title_svd,train_description_svd], axis=1)
test = pd.concat([test, test_title_svd,test_description_svd], axis=1)
# The separate feature frames are no longer needed once merged.
del train_title_svd, test_title_svd, train_description_svd, test_description_svd
gc.collect()
# The raw text columns are dropped now that their SVD features are in the frames.
cols_to_drop = ['description', 'title']
train.drop(cols_to_drop, axis = 1, inplace=True)
test.drop(cols_to_drop, axis = 1, inplace=True)

In [None]:
# Preview the training frame after the SVD text features were appended.
train.head()

In [None]:
# Column summary of the training frame after the text columns were replaced.
train.info()

In [None]:
def num_to_cat(col_names,train,test,n_comp=-1):
    """One-hot encode columns of ``train``/``test`` (optionally SVD-reduced).

    For every column in ``col_names`` the values of both frames are cast to
    strings, label-encoded with one encoder fitted on train+test, and then
    one-hot encoded. If ``n_comp`` is -1 or the column has at most ``n_comp``
    distinct values, the one-hot columns are appended as ``le_<col>_<i>``;
    otherwise they are reduced to ``n_comp`` components via ``get_svd``.
    The original column is dropped afterwards.

    Returns the new ``(train, test)`` frames (the inputs are rebound, not
    returned as the same objects).

    NOTE(review): ``pd.SparseDataFrame`` is a legacy pandas class - confirm the
    pandas version this notebook is run with still provides it.
    """
    
    for col_name in col_names:
        full_col = pd.concat([train[col_name], test[col_name]])
        # Distinct values over train+test, counting missing as its own value.
        n_unique=full_col.nunique(dropna=False)
        le = preprocessing.LabelEncoder()
        oe = preprocessing.OneHotEncoder()
        le.fit(full_col.values.astype('str'))

        # Integer labels as column vectors, the shape OneHotEncoder is given here.
        train_le = le.transform(train[col_name].values.astype('str')).reshape(-1,1)
        test_le = le.transform(test[col_name].values.astype('str')).reshape(-1,1)
        full_le = np.append(train_le , test_le).reshape(-1,1)
        oe.fit(full_le)
        train_oe = oe.transform(train_le)
        test_oe = oe.transform(test_le)
        
        # Stacked train+test one-hot matrix; only used as the SVD fit data below.
        full_oe =sp.vstack((train_oe  ,test_oe))
        if n_unique <= n_comp or n_comp == -1: 
            # Column labels are generated from n_unique, so they assume the encoder
            # produced exactly n_unique one-hot columns.
            col_names_svd=[f'le_{col_name}_{i}' for i in range(1, n_unique+1) ]
            train_svd=pd.SparseDataFrame(train_oe, columns=col_names_svd)
            test_svd=pd.SparseDataFrame(test_oe, columns=col_names_svd)
        else:
            # get_svd fits a fresh SVD on full_oe in each of these two calls.
            train_svd=get_svd(full_oe, train_oe, n_comp, col_name)
            test_svd=get_svd(full_oe, test_oe, n_comp, col_name)
        train = pd.concat([train,train_svd], axis=1)
        test = pd.concat([test, test_svd], axis=1)
        # Progress/debug output: prints the frame head once per encoded column.
        print(train.head())
        train = train.drop(col_name, axis = 1)
        test = test.drop(col_name, axis = 1)

    return train,test

In [None]:
def num_to_label(col_names,train,test):
    """Replace columns by integer labels shared between train and test.

    Each column in ``col_names`` is cast to strings, a ``LabelEncoder`` is
    fitted on the train values followed by the test values, and the column in
    both frames is overwritten with the encoded labels.

    Returns
    -------
    tuple
        The same ``train`` and ``test`` frames, modified in place.
    """
    for col in col_names:
        train_text = list(train[col].values.astype('str'))
        test_text = list(test[col].values.astype('str'))

        encoder = preprocessing.LabelEncoder()
        encoder.fit(train_text + test_text)

        train[col] = encoder.transform(train_text)
        test[col] = encoder.transform(test_text)
    return train,test

In [None]:

# Calendar features derived from the activation date via the .dt accessor.
train["activation_weekday"] = train["activation_date"].dt.weekday
test["activation_weekday"] = test["activation_date"].dt.weekday
train["activation_month"] = train["activation_date"].dt.month
test["activation_month"] = test["activation_date"].dt.month
# The raw date and the user identifier are removed before encoding.
cols_to_drop = [
    "activation_date",
    'user_id'
]

train = train.drop(cols_to_drop, axis = 1)
test = test.drop(cols_to_drop, axis = 1)
# Columns that num_to_cat one-hot encodes (n_comp is left at its default of -1,
# so no SVD reduction is applied).
col_vars_cat = [
    #'user_id',
    "city",
    "region", 
    "parent_category_name", 
    "category_name", 
    "user_type", 
    "param_1", 
    "param_2", 
    "param_3",
    'image_top_1',
    'activation_weekday',
    'activation_month'
]
# Only referenced by the commented-out num_to_label call below; note that
# 'user_id' has already been dropped from both frames at this point.
col_vars_lb=[
    'user_id'
]
train,test=num_to_cat(col_vars_cat,train,test)
#train,test=num_to_label(col_vars_lb,train,test)

train.info()
# Convert both frames to pandas' sparse representation for the COO export in the next cell.
train = train.to_sparse()
test=test.to_sparse()


In [None]:
# Export the sparse frames to SciPy CSR matrices (row slicing is used on them later).
train = train.to_coo().tocsr()
test = test.to_coo().tocsr()
# X stacks the train rows first and the test rows after them; later cells split
# it again by the number of training targets.
X=sp.vstack((train,test))

# From here on `train` and `test` no longer exist; only X (and y_train) remain.
del train,test
gc.collect()


In [None]:
def RMSLE(y, pred):
    """Return the root of ``metrics.mean_squared_error(y, pred)``.

    Despite the name, no logarithm is applied to either argument: the value
    computed is the plain root-mean-squared error.
    """
    mean_sq_err = metrics.mean_squared_error(y, pred)
    return mean_sq_err ** 0.5


In [None]:
### train denoising autoencoder
from keras.layers import Input, Dense
from keras import Model
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping

def get_DAE():
    """Build and compile the autoencoder over the columns of the global ``X``.

    Architecture: input of width ``X.shape[1]`` -> three Dense(1500, relu)
    layers (the middle one is named ``"feature"``) -> Dense(``X.shape[1]``, relu).
    Compiled with the Adam optimiser and mean-squared-error loss.

    Returns
    -------
    keras Model
        The compiled, untrained model.
    """
    n_features = X.shape[1]

    net_in = Input((n_features,))
    hidden = Dense(1500, activation='relu')(net_in) # 1500 original
    # The "feature" layer is looked up by name when the downstream network is built.
    hidden = Dense(1500, activation='relu', name="feature")(hidden) # 1500 original
    hidden = Dense(1500, activation='relu')(hidden) # 1500 original
    net_out = Dense(n_features, activation='relu')(hidden)

    autoencoder = Model(inputs=net_in, outputs=net_out)
    autoencoder.compile(optimizer='adam', loss='mse')
    return autoencoder


def x_generator(x, batch_size, shuffle=True):
    """Yield row batches of ``x`` forever, one pass over the rows per epoch.

    Every epoch visits each row exactly once in batches of ``batch_size``
    (the final batch of an epoch is shorter when the row count is not a
    multiple of ``batch_size``). With ``shuffle=True`` a new random row order
    is drawn at the start of *every* epoch, including when the row count is an
    exact multiple of ``batch_size``.

    Parameters
    ----------
    x : array-like or sparse matrix
        Must expose ``shape[0]`` and support indexing with an integer array.
    batch_size : int
        Positive number of rows per batch.
    shuffle : bool
        Draw a fresh ``np.random.permutation`` per epoch instead of using
        the natural row order.

    Yields
    ------
    The selected rows ``x[row_indices]``.

    Raises
    ------
    ValueError
        On first iteration, if ``x`` has no rows or ``batch_size`` is not positive.
    """
    n = x.shape[0]
    if n == 0:
        raise ValueError('x_generator needs at least one row')
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')

    while True:
        # One ordering per epoch.
        index_array = np.random.permutation(n) if shuffle else np.arange(n)
        for current_index in range(0, n, batch_size):
            yield x[index_array[current_index: current_index + batch_size]]


def mix_generator(x, batch_size, swaprate=0.15, shuffle=True):
    """Yield ``(noisy_batch, clean_batch)`` pairs for denoising training.

    Two independent ``x_generator`` streams over ``x`` are advanced in
    lockstep. For each row of the first stream's batch, a random subset of
    ``int(x.shape[1] * swaprate)`` column positions (15% of the columns by
    default) is overwritten with the values found at the same positions in
    the corresponding row of the second stream's batch.

    The number of columns is taken from the ``x`` argument itself, so the
    generator does not depend on any notebook-level variable.

    Parameters
    ----------
    x : scipy sparse matrix
        Source rows; batches must support ``.copy().tolil()`` and ``.toarray()``.
    batch_size : int
        Rows per batch, forwarded to ``x_generator``.
    swaprate : float
        Fraction of columns swapped in every row.
    shuffle : bool
        Forwarded to both ``x_generator`` streams.

    Yields
    ------
    tuple of (numpy.ndarray, numpy.ndarray)
        Dense noisy input and dense original batch.
    """
    num_value = x.shape[1]
    num_swap = int(num_value * swaprate)
    gen1 = x_generator(x, batch_size, shuffle)
    gen2 = x_generator(x, batch_size, shuffle)
    while True:
        batch1 = next(gen1)
        batch2 = next(gen2)
        # LIL format is used for the row-wise item assignment below.
        new_batch = batch1.copy().tolil()
        for i in range(batch1.shape[0]):
            # A different set of columns is drawn for every row.
            swap_idx = np.random.choice(num_value, num_swap, replace=False)
            new_batch[i, swap_idx] = batch2[i, swap_idx]

        yield (new_batch.toarray(), batch1.toarray())
    
def get_callbacks(save_path):
    """Return ``[ModelCheckpoint, EarlyStopping]`` driven by the training loss.

    Both callbacks monitor ``'loss'``: the checkpoint keeps only the best
    weights at ``save_path``; early stopping uses patience 4 and a minimum
    improvement of 1e-4.

    NOTE: a second function with this same name is defined further down the
    notebook (monitoring ``'val_loss'``); once that cell has run it replaces
    this definition.
    """
    watched_metric = 'loss'
    checkpoint_cb = ModelCheckpoint(filepath=save_path, monitor=watched_metric, save_best_only=True)
    stopper_cb = EarlyStopping(
        monitor=watched_metric,
        patience=4,
        verbose=1,
        min_delta=1e-4,
        mode='min',
    )
    return [checkpoint_cb, stopper_cb]


In [None]:
# training
batch_size = 128
num_epoch = 15 # 1000 original
#gen = mix_generator(X, batch_size)
callbacks = get_callbacks('weight_dae.hdf5')
dae = get_DAE()
dae.load_weights(f'{MY_DATASET}/weight_dae.hdf5')
#dae.load_weights(f'weight_dae.hdf5')
'''
hist = dae.fit_generator(generator=gen,
                  steps_per_epoch=np.ceil(X.shape[0] / batch_size),
                  epochs=num_epoch,
                  callbacks=callbacks,
                  verbose=1,)
'''

In [None]:
#logger.debug(hist.history)

In [None]:
### train NN with feature of DAE
from keras.layers import Dropout
from  keras.regularizers import l2

def get_NN(DAE):
    """Build a regression head on top of the ``"feature"`` layer of ``DAE``.

    ``DAE.trainable`` is set to ``False`` and a new stack is attached to the
    output of the layer named ``"feature"``: Dropout(0.1), Dense(500),
    Dropout(0.5), Dense(100), Dropout(0.5), Dense(100), Dropout(0.5) and a
    final Dense(1). All Dense layers use ReLU and share one L2(0.05) kernel
    regulariser object. The model is compiled with MSE loss and Adam.

    The model passed in as ``DAE`` is the one that is used throughout (its
    ``"feature"`` layer and its input), rather than a notebook-level variable,
    so the function works for whichever autoencoder the caller supplies.

    Parameters
    ----------
    DAE : keras Model
        Autoencoder containing a layer named ``"feature"``.

    Returns
    -------
    keras Model
        Compiled model mapping ``DAE.input`` to a single prediction.
    """
    l2_loss = l2(0.05)
    DAE.trainable = False
    x = DAE.get_layer("feature").output
    x = Dropout(0.1)(x)
    x = Dense(500, activation='relu', kernel_regularizer=l2_loss)(x) # 4500 original
    x = Dropout(0.5)(x)
    x = Dense(100, activation='relu', kernel_regularizer=l2_loss)(x) # 1000 original
    x = Dropout(0.5)(x)
    x = Dense(100, activation='relu', kernel_regularizer=l2_loss)(x) # 1000 original
    x = Dropout(0.5)(x)
    predictions = Dense(1, activation='relu', kernel_regularizer=l2_loss)(x)

    model = Model(inputs=DAE.input, outputs=predictions)
    model.compile(loss='mse',optimizer='adam')

    return model


def train_generator(x, y, batch_size, shuffle=True):
    """Yield ``(dense_features, targets)`` batches forever.

    Every epoch visits each row exactly once in batches of ``batch_size``
    (the last batch of an epoch is shorter when the row count is not a
    multiple of ``batch_size``). With ``shuffle=True`` a new random row order
    is drawn at the start of *every* epoch, including when the row count is an
    exact multiple of ``batch_size``.

    Parameters
    ----------
    x : scipy sparse matrix
        Feature rows; the selected rows are densified with ``.toarray()``.
    y : numpy.ndarray
        Targets, indexed with the same row indices as ``x``.
    batch_size : int
        Positive number of rows per batch.
    shuffle : bool
        Draw a fresh ``np.random.permutation`` per epoch.

    Yields
    ------
    tuple of (numpy.ndarray, numpy.ndarray)

    Raises
    ------
    ValueError
        On first iteration, if ``x`` has no rows or ``batch_size`` is not positive.
    """
    n = x.shape[0]
    if n == 0:
        raise ValueError('train_generator needs at least one row')
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')

    while True:
        # One ordering per epoch.
        index_array = np.random.permutation(n) if shuffle else np.arange(n)
        for current_index in range(0, n, batch_size):
            batch_rows = index_array[current_index: current_index + batch_size]
            yield x[batch_rows].toarray(), y[batch_rows]


def test_generator(x, batch_size, shuffle=False):
    """Yield dense feature batches of ``x`` forever (no targets).

    Every epoch visits each row exactly once in batches of ``batch_size``
    (the last batch of an epoch is shorter when the row count is not a
    multiple of ``batch_size``). Rows are served in their natural order by
    default; with ``shuffle=True`` a new random order is drawn at the start
    of every epoch.

    Parameters
    ----------
    x : scipy sparse matrix
        Feature rows; the selected rows are densified with ``.toarray()``.
    batch_size : int
        Positive number of rows per batch.
    shuffle : bool
        Draw a fresh ``np.random.permutation`` per epoch.

    Yields
    ------
    numpy.ndarray

    Raises
    ------
    ValueError
        On first iteration, if ``x`` has no rows or ``batch_size`` is not positive.
    """
    n = x.shape[0]
    if n == 0:
        raise ValueError('test_generator needs at least one row')
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')

    while True:
        # One ordering per epoch.
        index_array = np.random.permutation(n) if shuffle else np.arange(n)
        for current_index in range(0, n, batch_size):
            batch_rows = index_array[current_index: current_index + batch_size]
            yield x[batch_rows].toarray()

        
    
def get_callbacks(save_path):
    """Return ``[ModelCheckpoint, EarlyStopping]`` driven by the validation loss.

    Both callbacks monitor ``'val_loss'``: the checkpoint keeps only the best
    weights at ``save_path``; early stopping uses patience 4 and a minimum
    improvement of 1e-4.

    NOTE: this redefines a function of the same name declared earlier in the
    notebook (which monitors ``'loss'``); after this cell runs, this version
    is the one in effect.
    """
    watched_metric = 'val_loss'
    checkpoint_cb = ModelCheckpoint(filepath=save_path, monitor=watched_metric, save_best_only=True)
    stopper_cb = EarlyStopping(
        monitor=watched_metric,
        patience=4,
        verbose=1,
        min_delta=1e-4,
        mode='min',
    )
    return [checkpoint_cb, stopper_cb]

In [None]:
# Display the training target array.
y_train

In [None]:
import types
def get_memory_use():
    """Print ``name|size`` for every public global that has a ``size`` attribute.

    Names starting with an underscore and module objects are skipped. The
    number printed is the object's ``.size`` attribute as-is (for arrays that
    is an element count, not a byte count).
    """
    for name, value in globals().items():
        if not hasattr(value, 'size'):
            continue
        if name.startswith('_'):
            continue
        if isinstance(value, types.ModuleType):
            continue
        print(f'{name}|{value.size}')

get_memory_use()

In [None]:

from sklearn.model_selection import KFold

# K-fold out-of-fold / test prediction with the DAE-feature network.
# As written, no training happens here: every `j==-1` branch below is unreachable
# (enumerate starts j at 0), so each fold only loads stored weights and predicts.
batch_size = 128
num_epoch = 5 # 150 original
num_fold = 5 # 5 original

# X holds the training rows first, then the test rows; split it back by the target length.
Y_train = y_train
X_train = X[:Y_train.shape[0]]
X_test = X[Y_train.shape[0]:]
del X
gc.collect()

# y_test: one row of test predictions per fold; y_valid: out-of-fold predictions for train.
y_test = np.zeros([num_fold, X_test.shape[0]])
y_valid = np.zeros([X_train.shape[0]])

# Fixed random_state so the splits are the same on every run.
folds = list(KFold(n_splits=num_fold, shuffle=True, random_state=42).split(X_train))
for j, (ids_train_split, ids_valid_split) in enumerate(folds):
    print("fold", j+1, "==================")
    model = get_NN(dae)

    # Fit model
    # `j==-1` never holds, so the else branch runs: only the EarlyStopping callback
    # (index 1 of the list) is kept, and it is not used further in this path.
    if  j==-1:
        ids_train_split=np.loadtxt(f'{MY_DATASET}/ids_train_split_{j}.csv',delimiter=',').astype('int32')
        ids_valid_split=np.loadtxt(f'{MY_DATASET}/ids_valid_split_{j}.csv',delimiter=',').astype('int32')
        callbacks = get_callbacks("weight" + str(j) + ".hdf5")
    else:
        #np.savetxt(f'ids_train_split_{j}.csv',ids_train_split,delimiter=',')
        #np.savetxt(f'ids_valid_split_{j}.csv',ids_valid_split,delimiter=',')
        #continue
        callbacks = [get_callbacks("weight" + str(j) + ".hdf5")[1]]

    # Training branch (disabled by the same `j==-1` toggle); the else branch loads
    # the per-fold weights stored under MY_DATASET.
    if  j==-1 :
        gen_train = train_generator(X_train[ids_train_split], Y_train[ids_train_split], batch_size)
        gen_val = train_generator(X_train[ids_valid_split], Y_train[ids_valid_split], batch_size, shuffle=False)
        callbacks = get_callbacks("weight" + str(j) + ".hdf5")
        hist = model.fit_generator(
            generator=gen_train,
            steps_per_epoch=np.ceil(ids_train_split.shape[0] / batch_size),
            epochs=num_epoch,
            verbose=1,
            callbacks=callbacks,
            validation_data=gen_val,
            validation_steps=np.ceil(ids_valid_split.shape[0] / batch_size),
        )
        logger.debug(hist.history)
        del gen_train,gen_val
        gc.collect()
    else:
        model.load_weights(f'{MY_DATASET}/weight{j}.hdf5') # load best epoch weight
        pass


    
    # Predict on train, val and test
    # shuffle=False keeps the generator's row order equal to the index order, so the
    # predictions line up with ids_valid_split / the test rows.
    gen_val_pred = test_generator(X_train[ids_valid_split], batch_size, shuffle=False)
    gen_test_pred = test_generator(X_test, batch_size, shuffle=False)
    
    # steps = ceil(rows / batch_size) covers every row exactly once; [:,0] takes the
    # single output column.
    y_valid[ids_valid_split] = model.predict_generator(generator=gen_val_pred,
                                        steps=np.ceil(ids_valid_split.shape[0] / batch_size))[:,0]
    y_test[j] = model.predict_generator(generator=gen_test_pred,
                                        steps=np.ceil(X_test.shape[0] / batch_size))[:,0]
    del gen_test_pred,gen_val_pred,model
    gc.collect()

# RMSLE() as defined in this notebook is the root of the mean squared error (no log).
score = RMSLE(y_valid, Y_train)
logger.info(f'valid score: {score}')
# Final test prediction = mean over the per-fold predictions.
y_test_mean = np.mean(y_test, axis=0)
# Out-of-fold and averaged test predictions are saved as single-column text files.
np.savetxt(f'dae_nn_train.csv',y_valid,delimiter=',')
np.savetxt(f'dae_nn_test.csv',y_test_mean,delimiter=',')
submission = pd.DataFrame({"item_id":test_id, 'deal_probability': y_test_mean})
submission.to_csv('submission_dae.csv', index=False)
