In [1]:
import gc
import time
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split, cross_val_score
import lightgbm as lgb

In [2]:
# Maximum number of most-frequent brand names kept by cutting(); all other
# brands are collapsed into the 'missing' bucket.
NUM_BRANDS = 4004
# Intended cap on the number of most-frequent category names kept by cutting().
# NOTE(review): this constant is not referenced anywhere in the visible code.
NUM_CATEGORIES = 1001
# min_df passed to the CountVectorizer fitted on the 'name' column.
NAME_MIN_DF = 10
# max_features passed to the TfidfVectorizer fitted on 'item_description'.
# NOTE(review): a value of 3 keeps only three n-gram features in total --
# confirm this very small value is deliberate.
MAX_FEATURES_ITEM_DESCRIPTION = 3

In [3]:
def handle_missing_inplace(dataset):
    """Replace missing text fields with the literal string 'missing', in place.

    The columns ``category_name``, ``brand_name`` and ``item_description`` of
    ``dataset`` are modified; every other column is left untouched.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the three columns listed above.

    Returns
    -------
    None
        ``dataset`` is mutated in place.

    Raises
    ------
    KeyError
        If one of the three columns is absent.
    """
    for column in ('category_name', 'brand_name', 'item_description'):
        # Assign the filled column back instead of calling
        # ``dataset[column].fillna(..., inplace=True)``: that form operates on
        # a temporary column selection, which emits a chained-assignment
        # warning on recent pandas and does not update ``dataset`` at all when
        # copy-on-write is enabled.
        dataset[column] = dataset[column].fillna('missing')
    
def cutting(dataset, num_brands=None, num_categories=None):
    """Collapse rare brands and categories into the 'missing' bucket, in place.

    Only the most frequent values of ``brand_name`` and ``category_name`` are
    kept; every other value is overwritten with the string ``'missing'``.
    The ``'missing'`` value itself never counts towards either cap.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with ``brand_name`` and ``category_name`` columns.
    num_brands : int, optional
        Number of most frequent brands to keep. Defaults to the module-level
        ``NUM_BRANDS``.
    num_categories : int, optional
        Number of most frequent categories to keep. Defaults to the
        module-level ``NUM_CATEGORIES``.

    Returns
    -------
    None
        ``dataset`` is mutated in place.
    """
    # Resolve defaults at call time so later changes to the module constants
    # are honoured.
    if num_brands is None:
        num_brands = NUM_BRANDS
    if num_categories is None:
        # The original code reused NUM_BRANDS here, so the category cap was
        # silently the brand cap; use the dedicated constant instead.
        num_categories = NUM_CATEGORIES

    for column, limit in (('brand_name', num_brands),
                          ('category_name', num_categories)):
        # value_counts() is ordered by descending frequency, so slicing the
        # index keeps the `limit` most common real values.
        counts = dataset[column].value_counts()
        popular = counts.loc[counts.index != 'missing'].index[:limit]
        dataset.loc[~dataset[column].isin(popular), column] = 'missing'

def to_categorical(dataset):
    """Cast the categorical feature columns to pandas ``category`` dtype.

    ``category_name``, ``brand_name`` and ``item_condition_id`` are converted
    one after another; ``dataset`` is modified in place and nothing is
    returned.
    """
    for column in ('category_name', 'brand_name', 'item_condition_id'):
        as_category = dataset[column].astype('category')
        dataset[column] = as_category

In [4]:
def main():
    """Train a LightGBM + Ridge blend and write the submission CSV.

    Steps, each followed by a progress line showing elapsed seconds:

    1. Read ``train.tsv`` and ``test.tsv`` from the working directory.
    2. Concatenate both frames, fill missing text fields, collapse rare
       brands/categories and cast categorical columns.
    3. Vectorize ``name``, ``category_name``, ``item_description`` and
       ``brand_name``, one-hot encode ``item_condition_id`` and stack
       everything (plus ``shipping``) into one sparse matrix.
    4. Fit LightGBM and Ridge on ``log1p(price)`` and blend their test
       predictions with weights 0.57 and 0.43.
    5. Write ``submission_lgbm_ridge_5.csv`` with ``expm1`` of the blend.

    Returns
    -------
    None
    """
    start_time = time.time()
    train = pd.read_table('train.tsv', engine='c')
    test = pd.read_table('test.tsv', engine='c')
    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: {}'.format(train.shape))
    print('Test shape: {}'.format(test.shape))

    # Remember where train ends so the stacked matrix can be split again.
    nrow_train = train.shape[0]
    # Models are fitted on log1p(price); predictions are mapped back with expm1.
    y = np.log1p(train['price'])
    merge = pd.concat([train, test])
    # Take an explicit copy: the 'price' column is added to this frame at the
    # end, and writing into a bare column selection of `test` triggers
    # SettingWithCopyWarning and keeps `test` referenced after the `del` below.
    submission = test[['test_id']].copy()

    del train
    del test
    gc.collect()

    handle_missing_inplace(merge)
    print('[{}] Finished to handle missing'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Finished to cut'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Finished to convert categorical'.format(time.time() - start_time))

    name_vectorizer = CountVectorizer(min_df=NAME_MIN_DF)
    X_name = name_vectorizer.fit_transform(merge['name'])
    print('[{}] Finished count vectorize name'.format(time.time()-start_time))

    category_vectorizer = CountVectorizer()
    X_category = category_vectorizer.fit_transform(merge['category_name'])
    print('[{}] Finished count vectorize category name'.format(time.time()-start_time))

    description_vectorizer = TfidfVectorizer(
        max_features=MAX_FEATURES_ITEM_DESCRIPTION,
        ngram_range=(1, 3),
        stop_words='english',
    )
    X_description = description_vectorizer.fit_transform(merge['item_description'])
    print('[{}] Finished TFIDF vectorize item_description'.format(time.time() - start_time))

    brand_binarizer = LabelBinarizer(sparse_output=True)
    X_brand = brand_binarizer.fit_transform(merge['brand_name'])
    print('[{}] Finished label binarize brand name'.format(time.time()-start_time))

    # Pin the dummy dtype: pandas >= 2.0 defaults to bool dummies, and mixing
    # those with the integer 'shipping' column makes `.values` fall back to an
    # object array that csr_matrix cannot consume.
    X_dummies = csr_matrix(
        pd.get_dummies(
            merge[['item_condition_id', 'shipping']],
            sparse=True,
            dtype=np.uint8,
        ).values
    )
    print('[{}] Finished to get dummies on item_condition_id and shipping'.format(time.time()-start_time))

    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category, X_name)).tocsr()

    # Rows keep the concat order: train first, then test.
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_train:]

    d_train = lgb.Dataset(X, label=y)

    params = {'learning_rate': 0.75, 'application': 'regression', 'max_depth': 3,
              'num_leaves': 100, 'verbosity': -1, 'metric': 'RMSE'}
    # `verbose_eval` was dropped from lgb.train in LightGBM 4.0 and had no
    # effect here anyway because no validation set is supplied, so it is not
    # passed.
    lgb_model = lgb.train(params, train_set=d_train, num_boost_round=3200)
    # Blend weights: 0.57 for LightGBM, 0.43 for Ridge.
    preds = 0.57 * lgb_model.predict(X_test)

    ridge_model = Ridge(solver='sag', fit_intercept=True, random_state=205)
    ridge_model.fit(X, y)
    print('[{}] Finished to train ridge'.format(time.time() - start_time))
    preds += 0.43 * ridge_model.predict(X=X_test)
    print('[{}] Finished to predict ridge'.format(time.time() - start_time))

    # Undo the log1p applied to the target.
    submission['price'] = np.expm1(preds)
    submission.to_csv('submission_lgbm_ridge_5.csv', index=False)

# Entry point: run the whole pipeline when this code executes as the
# top-level module.
if __name__ == '__main__':
    main()

[7.074125528335571] Finished to load data
Train shape: (1482535, 8)
Test shape: (693359, 7)
[7.84805154800415] Finished to handle missing
[9.070848941802979] Finished to cut
[9.70813250541687] Finished to convert categorical
[21.569344758987427] Finished count vectorize name
[32.161080837249756] Finished count vectorize category name
[225.20961475372314] Finished TFIDF vectorize item_description
[322.9870517253876] Finished label binarize brand name
[326.21738505363464] Finished to get dummies on item_condition_id and shipping
[594.5283801555634] Finished to train ridge
[594.561375617981] Finished to predict ridge
