In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing ,metrics
from sklearn.svm import SVR
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_validate, GridSearchCV
import xgboost as xgb
import lightgbm.sklearn as lgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
from sklearn.utils.testing import all_estimators
import sys

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
from scipy import sparse as sp
# Any results you write to the current directory are saved as output.

In [None]:


class wrapped_logger():
    """Logger wrapper that prefixes every message with runtime diagnostics.

    Each message is prefixed with the seconds elapsed since the wrapper was
    created and the system memory currently in use (``psutil`` ``used`` bytes
    divided by 1024 twice), then emitted through a standard ``logging`` logger
    to both the console and ``log_file_name``.

    Re-creating the wrapper (e.g. re-running the notebook cell) replaces the
    handlers installed by the previous instance instead of stacking new ones,
    so each message is written once.
    """
    def __init__(self,log_file_name = 'debug.log'):
        """Configure the module-named logger with a stream and a file handler.

        Args:
            log_file_name: path of the log file the file handler writes to.
        """
        from logging import getLogger, StreamHandler,DEBUG,Formatter, FileHandler
        import psutil 
        import time
        self.time=time.time
        self.logger = getLogger(__name__)
        self.START_TIME=self.time()
        self.psutil=psutil

        # getLogger() returns the same object for the same name, so handlers
        # from an earlier instance are still attached. Remove only the ones
        # this class installed; foreign handlers are left untouched.
        for old_handler in list(self.logger.handlers):
            if getattr(old_handler, '_wrapped_logger_owned', False):
                self.logger.removeHandler(old_handler)
                old_handler.close()

        handler_format = Formatter('%(asctime)s -  %(name)s - %(levelname)s - %(message)s')

        # Console handler first, then file handler (same order as before).
        for new_handler in (StreamHandler(), FileHandler(log_file_name)):
            new_handler.setFormatter(handler_format)
            new_handler.setLevel(DEBUG)
            new_handler._wrapped_logger_owned = True
            self.logger.addHandler(new_handler)
        self.logger.setLevel(DEBUG)

    def info(self,message):
        """Log ``message`` at INFO level with the diagnostics prefix."""
        self.logger.info(self.__message(message))

    def debug(self,message):
        """Log ``message`` at DEBUG level with the diagnostics prefix."""
        self.logger.debug(self.__message(message))

    def warning(self,message):
        """Log ``message`` at WARNING level with the diagnostics prefix."""
        self.logger.warning(self.__message(message))

    def __message(self,message):
        """Return ``message`` prefixed with elapsed seconds and memory in use."""
        return f'{self.time()-self.START_TIME}s - mem usage:{self.psutil.virtual_memory().used/1024/1024} - {message}'

In [None]:
# Notebook-wide logger; writes to the console and to the default 'debug.log'.
logger=wrapped_logger()
logger.info('start logging')


In [None]:
# Notebook configuration: display limit, joblib scratch dir and dataset roots.
pd.set_option("display.max_rows",50)
# Point joblib's temporary folder at /tmp.
%env JOBLIB_TEMP_FOLDER=/tmp
# Directory holding the precomputed *_svd.csv and *_train/_test.csv files read
# by engineering_tfidf / engineering_oof.
MY_DATASET='../input/avito-demand-prediction-challenge-private-dataset'
# Directory holding the competition's train.csv / test.csv.
DATASET='../input/avito-demand-prediction'

**List of input files**

In [None]:
# Print the entries of ../input/ by shelling out to `ls` (POSIX only).
from subprocess import check_output
print(check_output(["ls", "../input/"]).decode("utf8"))

In [None]:
# Fraction of rows kept from every loaded table (1 keeps everything).
FRAC = 0.1
def read_csv(frac,fanc,random_state=0):
    """Return ``fanc`` down-sampled to a fraction ``frac`` of its rows.

    Despite the name this does not read a file: ``fanc`` is an already loaded
    DataFrame (callers pass the result of ``pd.read_csv``).

    Args:
        frac: fraction of rows to keep; ``1`` returns ``fanc`` unchanged
            (no shuffling, original index kept).
        fanc: the DataFrame to sample.
        random_state: seed for the row sampling. With a fixed seed, frames
            with the same number of rows are sampled at the same row
            positions, so row-aligned tables (raw data, precomputed SVD
            features, out-of-fold predictions) stay aligned after sampling.

    Returns:
        The sampled DataFrame with a fresh 0..n-1 index, or ``fanc`` itself
        when ``frac == 1``.
    """
    data = fanc
    # Use the argument, not the module-level FRAC, so callers control the size.
    if frac != 1:
        data = data.sample(frac=frac, random_state=random_state).reset_index(drop=True)
    return data

In [None]:
# Load the competition tables (activation_date parsed as datetime) and pass
# them through read_csv() for down-sampling.
train = read_csv(FRAC,pd.read_csv(f'{DATASET}/train.csv', parse_dates=["activation_date"]))
test = read_csv(FRAC,pd.read_csv(f'{DATASET}/test.csv', parse_dates=["activation_date"]))

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Number of distinct values per column.
train.nunique()

In [None]:
# Ten random rows for a quick visual check.
train.sample(10)


In [None]:
# Summary statistics of the numeric columns.
train.describe()

In [None]:
def fill_na(col_names,train,test,fill_what):
    """Replace missing values in the given columns of both frames.

    The columns are overwritten on the passed-in frames, which are also
    returned for convenience.

    Args:
        col_names: iterable of column names present in both frames.
        train: training DataFrame (modified in place).
        test: test DataFrame (modified in place).
        fill_what: value substituted for every missing entry.

    Returns:
        ``(train, test)`` - the same objects that were passed in.
    """
    for frame in (train, test):
        for col_name in col_names:
            # Assign the result back instead of `frame[col].fillna(inplace=True)`:
            # the in-place form mutates a temporary Series and silently does
            # nothing under pandas Copy-on-Write.
            frame[col_name] = frame[col_name].fillna(fill_what)
    return train, test

In [None]:
def get_len(col_names):
    """Add a ``<col>_len`` word-count column for each listed text column.

    Operates on the module-level ``train`` and ``test`` frames. The count is
    the number of whitespace-separated tokens, so every value in the listed
    columns must be a string (fill missing values first).

    Args:
        col_names: iterable of text column names present in both frames.
    """
    def _word_count(text):
        return len(text.split())

    for frame in (train, test):
        for col_name in col_names:
            frame[f'{col_name}_len'] = frame[col_name].apply(_word_count)


In [None]:
# Fill missing values, add word-count features, split off the target and drop
# identifier columns.
#train['city'] = train['city'] + "_" + train['region']
#test['city'] = test['city'] + "_" + test['region']
# Combined frame, used only to compute fill statistics over train + test.
full_data = pd.concat([train, test], axis = 0)
# Columns whose missing entries are replaced by the literal string 'NaN'.
col_names_fillna = [
    'description',
    'title',
    "param_1", 
    "param_2", 
    "param_3",
    'city',
    "region",
    "parent_category_name", 
    "category_name", 
    "user_type"
    
]
train,test=fill_na(col_names_fillna,train,test,'NaN')
# price -> mean over train + test; image_top_1 -> most frequent value.
train,test=fill_na(['price'],train,test,full_data["price"].mean())
train,test=fill_na(['image_top_1'],train,test,full_data["image_top_1"].mode()[0])

# Columns that get a '<col>_len' word-count feature (filled 'NaN' counts as one word).
col_names_len = [
    'description',
    'title',
    "param_1", 
    "param_2", 
    "param_3"
]

get_len(col_names_len)

# Regression target as a flat array.
y_train =train["deal_probability"].ravel()

# Kept for building the submission file later.
test_id = test["item_id"].values

cols_to_drop = ["item_id", 'image']
train = train.drop(cols_to_drop + ["deal_probability"], axis = 1)
test = test.drop(cols_to_drop, axis = 1)
del full_data


In [None]:
def get_svd(fit_data, transform_data, n_comp, col_name):
    """Project ``transform_data`` onto an SVD basis learned from ``fit_data``.

    Args:
        fit_data: matrix the ARPACK TruncatedSVD is fitted on.
        transform_data: matrix to project (same feature space as ``fit_data``).
        n_comp: number of components to keep.
        col_name: label embedded in the output column names.

    Returns:
        DataFrame with columns ``svd_<col_name>_1`` .. ``svd_<col_name>_<n_comp>``.
    """
    reducer = TruncatedSVD(n_components=n_comp, algorithm='arpack')
    reducer.fit(fit_data)
    projected = pd.DataFrame(reducer.transform(transform_data))
    component_names = []
    for component in range(1, n_comp + 1):
        component_names.append('svd_' + col_name + '_' + str(component))
    projected.columns = component_names
    return projected

In [None]:
def get_tfidf_svd(train, test, col_name):
    """Build 3-component TF-IDF + SVD features for one text column.

    A unigram TF-IDF vocabulary (at most 100000 terms) is learned on the
    train and test text combined, and a single ARPACK TruncatedSVD is fitted
    on that combined matrix. Train and test rows are then projected with the
    same fitted model.

    Args:
        train: training DataFrame containing ``col_name``.
        test: test DataFrame containing ``col_name``.
        col_name: name of the text column.

    Returns:
        ``(train_svd, test_svd)`` DataFrames with columns
        ``svd_<col_name>_1`` .. ``svd_<col_name>_3`` and a fresh 0..n-1 index.
    """
    n_train = train.shape[0]

    #get tfidf
    tfidf_vec = TfidfVectorizer(ngram_range=(1,1), max_features=100000)
    full_tfidf = tfidf_vec.fit_transform(pd.concat([train[col_name], test[col_name]], axis=0))
    # fit_transform already vectorised every row in train-then-test order;
    # slice instead of running the vectoriser again on each frame.
    train_tfidf = full_tfidf[:n_train]
    test_tfidf = full_tfidf[n_train:]

    #get svd - fit once, then project both splits with the same model
    n_comp = 3
    svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
    svd_obj.fit(full_tfidf)
    svd_cols = ['svd_' + col_name + '_' + str(i+1) for i in range(n_comp)]
    train_svd = pd.DataFrame(svd_obj.transform(train_tfidf))
    train_svd.columns = svd_cols
    test_svd = pd.DataFrame(svd_obj.transform(test_tfidf))
    test_svd.columns = svd_cols

    #clean
    del full_tfidf, train_tfidf, test_tfidf, tfidf_vec, svd_obj

    return train_svd, test_svd

In [None]:
def engineering_tfidf(how,col_name):
    """Load or compute the TF-IDF/SVD features for one text column.

    Args:
        how: ``'read'`` loads ``train_<col>_svd.csv`` / ``test_<col>_svd.csv``
            from ``MY_DATASET`` (passed through ``read_csv`` with ``FRAC``);
            any other value computes the features from the module-level
            ``train`` / ``test`` frames and writes both CSVs to the current
            directory.
        col_name: name of the text column.

    Returns:
        ``(train_svd, test_svd)`` DataFrames.
    """
    if how != 'read':
        # Compute from scratch and cache the result next to the notebook.
        train_svd, test_svd = get_tfidf_svd(train = train, test = test, col_name = col_name)
        train_svd.to_csv(f'train_{col_name}_svd.csv',index=False)
        test_svd.to_csv(f'test_{col_name}_svd.csv',index=False)
        return train_svd, test_svd

    logger.info(f'Read {col_name}')
    cached_train = pd.read_csv(f'{MY_DATASET}/train_{col_name}_svd.csv')
    train_svd = read_csv(FRAC, cached_train)
    cached_test = pd.read_csv(f'{MY_DATASET}/test_{col_name}_svd.csv')
    test_svd = read_csv(FRAC, cached_test)
    return train_svd, test_svd


In [None]:
# Load the precomputed TF-IDF/SVD features for title and description.
train_title_svd, test_title_svd = engineering_tfidf('read','title')
train_description_svd, test_description_svd = engineering_tfidf('read','description')

# Column-wise concat aligns on the index, so the SVD rows must be in the same
# order as the rows of train / test.
train = pd.concat([train, train_title_svd,train_description_svd], axis=1)
test = pd.concat([test, test_title_svd,test_description_svd], axis=1)

# The raw text is no longer needed once its features are attached.
cols_to_drop = ['description', 'title']
train = train.drop(cols_to_drop, axis = 1)
test = test.drop(cols_to_drop, axis = 1)


In [None]:
def num_to_cat(col_names,train,test,n_comp=-1):
    """Replace each listed column with one-hot (or SVD-reduced one-hot) features.

    For every column, the stringified values of train + test are label-encoded
    and then one-hot encoded. If ``n_comp`` is -1, or the column has at most
    ``n_comp`` distinct values, the one-hot matrix is attached as sparse
    columns named ``le_<col>_<i>``; otherwise it is reduced to ``n_comp``
    components with ``get_svd``. The original column is dropped.

    Args:
        col_names: columns to encode (present in both frames).
        train: training DataFrame.
        test: test DataFrame.
        n_comp: maximum width before falling back to SVD; -1 disables SVD.

    Returns:
        ``(train, test)`` - new frames with the encoded columns appended.
    """
    
    for col_name in col_names:
        full_col = pd.concat([train[col_name], test[col_name]])
        # Distinct values including NaN; used as the expected one-hot width.
        n_unique=full_col.nunique(dropna=False)
        le = preprocessing.LabelEncoder()
        oe = preprocessing.OneHotEncoder()
        le.fit(full_col.values.astype('str'))

        # Integer codes as column vectors, the input shape the one-hot encoder is given.
        train_le = le.transform(train[col_name].values.astype('str')).reshape(-1,1)
        test_le = le.transform(test[col_name].values.astype('str')).reshape(-1,1)
        full_le = np.append(train_le , test_le).reshape(-1,1)
        oe.fit(full_le)
        train_oe = oe.transform(train_le)
        test_oe = oe.transform(test_le)
        
        full_oe =sp.vstack((train_oe  ,test_oe))
        if n_unique <= n_comp or n_comp == -1: 
            col_names_svd=[f'le_{col_name}_{i}' for i in range(1, n_unique+1) ]
            # NOTE(review): pd.SparseDataFrame was removed in pandas 1.0, so
            # this branch only runs on older pandas versions.
            train_svd=pd.SparseDataFrame(train_oe, columns=col_names_svd)
            test_svd=pd.SparseDataFrame(test_oe, columns=col_names_svd)
            #train_svd,test_svd=fill_na(col_names_svd,train_svd,test_svd,0)
        else:
            # NOTE(review): get_svd fits a new SVD on full_oe for each call,
            # so the model is fitted twice per column.
            train_svd=get_svd(full_oe, train_oe, n_comp, col_name)
            test_svd=get_svd(full_oe, test_oe, n_comp, col_name)
        # Index-aligned concat: assumes train / test carry a default 0..n-1
        # index - TODO confirm at call sites.
        train = pd.concat([train,train_svd], axis=1)
        test = pd.concat([test, test_svd], axis=1)
        
        train = train.drop(col_name, axis = 1)
        test = test.drop(col_name, axis = 1)

    return train,test

In [None]:
def num_to_label(col_names,train,test):
    """Label-encode the listed columns consistently across train and test.

    For each column one ``LabelEncoder`` is fitted on the stringified values
    of both frames, and the column is overwritten with its integer codes.

    Args:
        col_names: columns to encode (present in both frames).
        train: training DataFrame (modified in place).
        test: test DataFrame (modified in place).

    Returns:
        ``(train, test)`` - the same objects that were passed in.
    """
    def _as_str_list(frame, column):
        return list(frame[column].values.astype('str'))

    for column in col_names:
        encoder = preprocessing.LabelEncoder()
        # Fit on the union so both frames share one code per value.
        encoder.fit(_as_str_list(train, column) + _as_str_list(test, column))
        train[column] = encoder.transform(_as_str_list(train, column))
        test[column] = encoder.transform(_as_str_list(test, column))
    return train,test

In [None]:
# Calendar features derived from the activation date.
train["activation_weekday"] = train["activation_date"].dt.weekday
test["activation_weekday"] = test["activation_date"].dt.weekday
train["activation_month"] = train["activation_date"].dt.month
test["activation_month"] = test["activation_date"].dt.month
# The raw date and the user id are not used as model inputs.
cols_to_drop = [
    "activation_date",
    'user_id'
]
train = train.drop(cols_to_drop, axis = 1)
test = test.drop(cols_to_drop, axis = 1)
#train.to_sparse(fill_value=0)
#test.to_sparse(fill_value=0)
# Categorical columns that are label-encoded below.
col_vars = [
    #'user_id',
    "region", 
    "city", 
    "parent_category_name", 
    "category_name", 
    "user_type", 
    "param_1", 
    "param_2", 
    "param_3"
]
#train = pd.get_dummies(train, columns = col_vars, dtype = 'int64')
#test = pd.get_dummies(test, columns = col_vars, dtype = 'int64')
#train,test=num_to_cat(col_vars,train,test)
train,test=num_to_label(col_vars,train,test)

train.info()
# NOTE(review): DataFrame.to_sparse was removed in pandas 1.0, so these two
# lines need an older pandas.
train = train.to_sparse(fill_value=0)
test=test.to_sparse(fill_value=0)


In [None]:
def get_zscore(train,test,col_names):
    """Standardise the listed columns using statistics of train + test combined.

    Each column becomes ``(x - mean) / std`` where mean and (sample) standard
    deviation are computed over the concatenation of the train and test
    values. Only the listed columns are concatenated, not the whole frames.

    A column whose combined standard deviation is zero or undefined (constant
    column, or a single row) is only centred, so it yields zeros rather than
    NaN/inf.

    Args:
        train: training DataFrame (modified in place).
        test: test DataFrame (modified in place).
        col_names: numeric columns to standardise.

    Returns:
        ``(train, test)`` - the same objects that were passed in.
    """
    for col in col_names:
        combined = pd.concat([train[col], test[col]], axis=0)
        mean = combined.mean()
        std = combined.std()
        # `not std > 0` is also true for NaN; fall back to centring only.
        if not std > 0:
            std = 1.0
        train[col] = (train[col] - mean) / std
        test[col] = (test[col] - mean) / std

    return train, test

In [None]:
# Numeric columns standardised with train + test statistics.
zscore_cols=[
    'price',

]
train,test=get_zscore(train,test,zscore_cols)

In [None]:
# Inspect the engineered feature frame.
train.head()

In [None]:
# Convert the feature frames to SciPy CSR matrices for the models.
# NOTE(review): to_coo() is called directly on the frame, which presumably
# relies on the sparse frames produced by to_sparse() earlier - confirm.
x_train = train.to_coo().tocsr()
x_test = test.to_coo().tocsr()

In [None]:
# Some useful parameters which will come in handy later on
ntrain = train.shape[0]
ntest = test.shape[0]
SEED = 0 # for reproducibility (passed to the individual models)
NSPLITS = 5 # set folds for out-of-fold prediction
# Contiguous, unshuffled folds are already deterministic. random_state is
# omitted because it is unused without shuffle=True and current scikit-learn
# raises ValueError when it is set with shuffle=False.
kf = KFold(n_splits=NSPLITS, shuffle=False)

# Class to extend the Sklearn classifier
class SklearnHelper(object):
    """Uniform train/predict wrapper around a scikit-learn style estimator.

    Args:
        clf: estimator class (not an instance); called as ``clf(**params)``.
        seed: value injected as ``random_state``.
        params: keyword arguments for the estimator; ``None`` means no extra
            arguments. The dict is copied, so the caller's object is never
            modified and can be reused for other helpers.
        no_seed: when true, ``random_state`` is not passed at all (any
            ``random_state`` present in ``params`` is dropped as well), for
            estimators that do not accept it.
    """
    def __init__(self, clf, seed=0, params=None, no_seed=False):
        # Copy so the seed injection/removal below never leaks into the
        # caller's dict; also covers the params=None default.
        estimator_params = dict(params) if params is not None else {}
        estimator_params['random_state'] = seed
        if no_seed:
            del estimator_params['random_state']
        self.clf = clf(**estimator_params)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator; returns nothing."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)

    def fit(self,x,y):
        """Fit the wrapped estimator and return whatever its ``fit`` returns."""
        return self.clf.fit(x,y)

    def feature_importances(self,x,y):
        """Refit on ``(x, y)`` and print the estimator's ``feature_importances_``."""
        print(self.clf.fit(x,y).feature_importances_)
    


In [None]:
def get_oof(clf, x_train, y_train, x_test):
    """Produce out-of-fold train predictions and fold-averaged test predictions.

    For every fold of the module-level ``kf`` splitter the model is trained on
    the in-fold rows, the held-out rows of ``x_train`` are predicted (filling
    the out-of-fold vector), and ``x_test`` is predicted. The test
    predictions are averaged over the folds.

    Sizes and fold indices are taken from the arguments themselves, so the
    function works for any ``x_train`` / ``x_test`` pair, not only the
    notebook's global ``train`` / ``test`` frames.

    Args:
        clf: object exposing ``train(x, y)`` and ``predict(x)``.
        x_train: training features, row-indexable with an index array.
        y_train: training targets, indexable with an index array.
        x_test: test features.

    Returns:
        ``(oof_train, oof_test)`` as column vectors of shape
        ``(n_train, 1)`` and ``(n_test, 1)``.
    """
    n_train = x_train.shape[0]
    n_test = x_test.shape[0]
    n_splits = kf.get_n_splits()

    oof_train = np.zeros((n_train,))
    oof_test_skf = np.empty((n_splits, n_test))

    # Split the matrix that is actually indexed below.
    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [None]:


# Parameters passed to lgb.LGBMRegressor (the `lgbm` helper), despite the
# dict's name.
enet_params = {
    'n_estimators': 200,
    'n_jobs': 4
}

# Parameters passed to RandomForestRegressor (the `rf` helper), despite the
# dict's name.
lasso_params = {
    'min_samples_leaf': 2, 
    'min_samples_split': 3, 
    'n_estimators': 15, 
    'n_jobs': 4
}

# Parameters passed to ExtraTreesRegressor (the `etr` helper), despite the
# dict's name.
ridge_params = {
    'min_samples_leaf': 2, 
    'min_samples_split': 2, 
    'n_estimators': 15, 
    'n_jobs': 4
}

# Parameters passed to BaggingRegressor (the `br` helper), despite the
# dict's name.
rf_params = {
    'max_features': 1.0, 
    'max_samples': 0.5, 
    'n_estimators': 15, 
    'n_jobs': 4
}
    

# AdaBoostRegressor (the `ada` helper): library defaults.
ada_params={
    
}
# Ridge (the `rd` helper): library defaults.
rd_params={
    
}

In [None]:
# First-level models for stacking. The estimators are referenced through
# their public import paths: the private modules used before
# (sklearn.ensemble.forest, .bagging, .weight_boosting,
# sklearn.linear_model.ridge) no longer exist in current scikit-learn, while
# the public paths resolve to the same classes in old and new versions.
# Note that the *_params names do not match the model they configure.
lgbm = SklearnHelper(clf=lgb.LGBMRegressor, seed=SEED, params=enet_params)
rf = SklearnHelper(clf=sklearn.ensemble.RandomForestRegressor, seed=SEED, params=lasso_params)
etr = SklearnHelper(clf=sklearn.ensemble.ExtraTreesRegressor, seed=SEED, params=ridge_params)
br = SklearnHelper(clf=sklearn.ensemble.BaggingRegressor, seed=SEED, params=rf_params)
ada = SklearnHelper(clf=sklearn.ensemble.AdaBoostRegressor, seed=SEED, params=ada_params)
rd = SklearnHelper(clf=sklearn.linear_model.Ridge, seed=SEED, params=rd_params)

 

In [None]:
def engineering_oof(how,model_name,func,*args):
    """Load or compute the out-of-fold predictions of one first-level model.

    Args:
        how: ``'read'`` loads ``<model_name>_train.csv`` /
            ``<model_name>_test.csv`` from ``MY_DATASET`` (passed through
            ``read_csv`` with ``FRAC``); any other value calls
            ``func(*args)`` to compute them.
        model_name: file-name stem of the cached predictions.
        func: callable returning ``(oof_train, oof_test)``; ignored in
            ``'read'`` mode.
        *args: positional arguments forwarded to ``func``.

    Returns:
        ``(oof_train, oof_test)`` as SciPy CSR matrices. Freshly computed
        predictions are written to the current directory only when
        ``FRAC == 1``.
    """
    if how != 'read':
        logger.info(f'start writing {model_name}*.csv')
        oof_train, oof_test = func(*args)
        if FRAC != 1:
            # Predictions from a sub-sample would not line up with the full
            # data, so they are not cached.
            logger.info(f'skip writing {model_name}*.csv  FRAC!=1')
        else:
            np.savetxt(f'{model_name}_train.csv',oof_train,delimiter=',')
            np.savetxt(f'{model_name}_test.csv',oof_test,delimiter=',')
            logger.info(f'finish writing {model_name}*.csv')
    else:
        logger.info(f'start reading {model_name}*.csv')
        cached_train = pd.DataFrame(np.loadtxt(f'{MY_DATASET}/{model_name}_train.csv',delimiter=','))
        oof_train = read_csv(FRAC,cached_train).values
        cached_test = pd.DataFrame(np.loadtxt(f'{MY_DATASET}/{model_name}_test.csv',delimiter=','))
        oof_test = read_csv(FRAC,cached_test).values
        logger.info(f'finish reading {model_name}*.csv')

    return  sp.csr_matrix(oof_train), sp.csr_matrix(oof_test)
        
        

In [None]:


# Create our OOF train and test predictions. These base results will be used as new features
# All models are loaded in 'read' mode, so the func/args passed here are not
# called and no training happens in this cell.
lgbm_oof_train,lgbm_oof_test = engineering_oof('read', 'LightGBMRegressor',get_oof,*(lgbm,x_train, y_train, x_test))
rf_oof_train,rf_oof_test = engineering_oof('read', 'RandomForestRegressor',get_oof,*(rf,x_train, y_train, x_test))
etr_oof_train,etr_oof_test = engineering_oof('read', 'ExtraTreesRegressor',get_oof,*(etr,x_train, y_train, x_test))
br_oof_train,br_oof_test = engineering_oof('read', 'BaggingRegressor',get_oof,*(br,x_train, y_train, x_test))
# NOTE(review): 'dae_nn' and 'GradientBoostingRegressor' are read-only here.
# `*(f'only reading')` unpacks the string into single characters (the
# parentheses do not make a tuple), and logger.debug returns None, so any
# mode other than 'read' would fail for these two entries.
nn_oof_train,nn_oof_test = engineering_oof('read', 'dae_nn',logger.debug,*(f'only reading'))
ada_oof_train,ada_oof_test = engineering_oof('read', 'AdaBoostRegressor',get_oof,*(ada,x_train, y_train, x_test))
rd_oof_train,rd_oof_test = engineering_oof('read', 'Ridge',get_oof,*(rd,x_train, y_train, x_test))
gd_oof_train,gd_oof_test =engineering_oof('read', 'GradientBoostingRegressor',logger.debug,*(f'only reading'))
print("Training is complete")



In [None]:

# Append the first-level predictions as extra feature columns.
# x_train / x_test are rebound, so re-running this cell without re-running
# the earlier ones appends the same columns again.
x_train = sp.hstack([
    x_train, 
    lgbm_oof_train, 
    rf_oof_train,
    etr_oof_train,
    br_oof_train,
    nn_oof_train,
    ada_oof_train,
    rd_oof_train,
    gd_oof_train
])
x_test = sp.hstack([
    x_test,
    lgbm_oof_test,
    rf_oof_test,
    etr_oof_test,
    br_oof_test,
    nn_oof_test,
    ada_oof_test,
    rd_oof_test,
    gd_oof_test
])


In [None]:
# sp.hstack returns COO matrices by default, and COO does not support the row
# indexing used in the CV loop. Convert both matrices once, so x_test is not
# handed to the model in COO form on every fold.
x_train=x_train.tocsr()
x_test=x_test.tocsr()

In [None]:
def RMSLE(y, pred):
    """Return the root of the mean squared error between ``y`` and ``pred``.

    Despite the name, no logarithm is applied: this is plain RMSE.
    """
    mean_sq_err = metrics.mean_squared_error(y, pred)
    return mean_sq_err ** 0.5

In [None]:
'''
from catboost import CatBoostRegressor 
num_fold=5
y_test = np.zeros([num_fold, x_test.shape[0]])
y_valid = np.zeros([x_train.shape[0]])
folds = list(KFold(n_splits=num_fold, shuffle=True, random_state=42).split(x_train))
for j, (ids_train_split, ids_valid_split) in enumerate(folds):
    print("fold", j+1, "==================")
    cbr=CatBoostRegressor(loss_function='RMSE')
    cat_cols=[0,1,2,3,4,5,6,8,9,10,36,37]
    cbr.fit(x_train[ids_train_split].toarray(),y_train[ids_train_split],cat_cols)

    
    # Predict on train, val and test
    y_valid[ids_valid_split] = cbr.predict(x_train[ids_valid_split].toarray())
    y_test[j] = cbr.predict(x_test.toarray())

score = RMSLE(y_valid, y_train)
logger.info(f'valid score: {score}')
y_test_mean = np.mean(y_test, axis=0)
y_test_mean[y_test_mean>1] = 1
y_test_mean[y_test_mean<0] = 0
Submission = pd.DataFrame({"item_id":test_id, 'deal_probability': y_test_mean})
np.savetxt(f'CatBoostRegressor_train.csv',y_valid,delimiter=',')
np.savetxt(f'CatBoostRegressor_test.csv',y_test_mean,delimiter=',')

Submission.to_csv('Submission.csv', index=False)
logger.info(f'finished')
'''

In [None]:
# Second-level model: 10-fold LightGBM on the stacked features, producing
# out-of-fold validation predictions and fold-averaged test predictions.
import lightgbm 
num_fold=10
# One row of test predictions per fold; averaged after the loop.
y_test = np.zeros([num_fold, x_test.shape[0]])
# Out-of-fold predictions for every training row.
y_valid = np.zeros([x_train.shape[0]])
folds = list(KFold(n_splits=num_fold, shuffle=True, random_state=42).split(x_train))
for j, (ids_train_split, ids_valid_split) in enumerate(folds):
    print("fold", j+1, "==================")
    # NOTE(review): categorical_feature holds hard-coded column positions of
    # x_train; verify they still match the column order whenever features are
    # added or removed upstream.
    dat=lightgbm.Dataset(
        x_train[ids_train_split],
        label=y_train[ids_train_split],
        categorical_feature=[0,1,2,3,4,5,6,8,9,10,36,37]
    )
    params={
        #'max_bin':500,
        #'learning_rate': 0.05,
        #'num_iterations':200,
        'n_estimators': 200,
        'n_jobs': 4,
        #'boosting': 'dart'
    }
    # Rebinds `lgbm`, which earlier referred to the SklearnHelper wrapper.
    lgbm = lightgbm.train(params,dat)
    
    # Predict on train, val and test
    y_valid[ids_valid_split] = lgbm.predict(x_train[ids_valid_split])
    y_test[j] = lgbm.predict(x_test)

# Validation score over all out-of-fold predictions (RMSLE() computes RMSE).
score = RMSLE(y_valid, y_train)
logger.info(f'valid score: {score}')
y_test_mean = np.mean(y_test, axis=0)
# Clip the averaged predictions to [0, 1].
y_test_mean[y_test_mean>1] = 1
y_test_mean[y_test_mean<0] = 0
Submission = pd.DataFrame({"item_id":test_id, 'deal_probability': y_test_mean})
#np.savetxt(f'LightGBMRegressor_train.csv',y_valid,delimiter=',')
#np.savetxt(f'LightGBMRegressor_test.csv',y_test_mean,delimiter=',')

Submission.to_csv('Submission.csv', index=False)
logger.info(f'finished')


In [None]:

# First rows of the submission table.
Submission.head()

In [None]:
# Summary statistics of the submission's numeric columns.
Submission.describe()