In [None]:
import gc
import time
from time import gmtime, strftime
import numpy as np
import pandas as pd
import scipy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_log_error
import lightgbm as lgb
import gensim
import nltk
import wordbatch

from wordbatch.extractors import WordBag, WordHash
from wordbatch.models import FTRL, FM_FTRL

from nltk.corpus import stopwords
import re

from subprocess import check_output
# Show what is available under ../input by shelling out to `ls`.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

In [None]:
# Switch for the cross-validation cells: the Ridge and LightGBM CV loops below
# are wrapped in `if testing:` and are skipped while this is False.
testing = False # keep False when running to produce the submission output

In [None]:
# Load the competition's tab-separated train and test files into DataFrames.
df_train = pd.read_table('../input/mercari-price-suggestion-challenge/train.tsv')
df_test = pd.read_table('../input/mercari-price-suggestion-challenge/test.tsv')

In [None]:
#params
# NUM_BRANDS / NUM_CAT cap how many distinct brand and category labels are kept
# during preprocessing. MAX_FEAT is only referenced by the disabled TF-IDF
# alternative in the vectorisation cell.
NUM_BRANDS, NUM_CAT, MAX_FEAT = 2000, 1000, 30000

# Wall-clock reference for the "[elapsed] ..." progress messages printed later.
start_time = time.time()
run_started_utc = strftime("%Y-%m-%d %H:%M:%S", gmtime())
print(run_started_utc)

In [None]:
# Stack train and test rows into one frame so the encoders fitted later see a
# single shared vocabulary. `axis` is passed by keyword: pandas deprecated
# positional arguments after `objs` in 1.3 and rejects them from 2.0 onwards,
# so the positional form `pd.concat([...], 0)` raises TypeError there.
df = pd.concat([df_train, df_test], axis=0)
nrow_train = df_train.shape[0]  # the first nrow_train rows of df are the training rows
y_train = np.log1p(df_train["price"])  # the target is modelled in log1p space

# The training rows now live in df; drop the standalone frame to free memory.
# (Re-running this cell therefore requires re-running the loading cell first.)
del df_train

# Placeholder labels for missing category / brand values.
df["category_name"] = df["category_name"].fillna("Other")
df["brand_name"] = df["brand_name"].fillna("unknown")

# Keep the NUM_BRANDS most frequent brands (the 'unknown' placeholder is excluded
# from the ranking); every other brand, including 'unknown', is relabelled "Other".
pop_brands = df["brand_name"].value_counts().loc[lambda x: x.index != 'unknown'].index[:NUM_BRANDS]
df.loc[~df["brand_name"].isin(pop_brands), "brand_name"] = "Other"

# Same treatment for categories: keep the NUM_CAT most frequent, pool the rest as 'Other'.
pop_cat = df['category_name'].value_counts().loc[lambda x: x.index != 'Other'].index[:NUM_CAT]
df.loc[~df['category_name'].isin(pop_cat), 'category_name'] = 'Other'

# Missing descriptions become the literal string "None"; the three label-like
# columns are stored with pandas' categorical dtype.
df["item_description"] = df["item_description"].fillna("None")
df["item_condition_id"] = df["item_condition_id"].astype("category")
df["brand_name"] = df["brand_name"].astype("category")
df["category_name"] = df["category_name"].astype("category")

gc.collect()
print('[{}] Filled na'.format(time.time() - start_time))

In [None]:
# English stop words, held under their own name. The previous version rebound
# the imported `stopwords` corpus object to a dict, so running this cell a second
# time failed with AttributeError on `stopwords.words`. A frozenset gives the
# same membership test without shadowing the import.
STOP_WORDS = frozenset(stopwords.words('english'))
# Any run of characters outside A-Z, a-z, 0-9 is treated as a token separator.
non_alphanums = re.compile(u'[^A-Za-z0-9]+')

def normalize_text(text):
    """Lower-case `text` and reduce it to space-separated informative tokens.

    Runs of non-alphanumeric characters are replaced by a single space, the
    result is lower-cased and split on whitespace, and tokens that are a single
    character long or appear in STOP_WORDS are dropped.

    Args:
        text: the raw string to normalise.

    Returns:
        The surviving tokens joined by single spaces (possibly the empty string).
    """
    tokens = non_alphanums.sub(' ', text).lower().split()
    return u" ".join(tok for tok in tokens if len(tok) > 1 and tok not in STOP_WORDS)

In [None]:
# Hashed bag-of-words features for item_description, built with WordBatch's
# WordBag extractor on text pre-processed by normalize_text.
wordbag_options = {
    "hash_ngrams": 2,
    "hash_ngrams_weights": [1.0, 1.0],
    "hash_size": 2 ** 29,
    "norm": "l2",
    "tf": 1.0,
    "idf": None,
}
wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, wordbag_options), procs=8)
wb.dictionary_freeze = True
desc = wb.fit_transform(df['item_description'])
del wb

# Keep only the columns that are non-zero in more than one row.
desc = desc[:, desc.getnnz(axis=0) > 1]
print('[{}] Finished wordbatch'.format(time.time() - start_time))

In [None]:
#transformations
# Token counts for the item name; terms seen in fewer than 10 rows are ignored.
count = CountVectorizer(min_df=10)
names = count.fit_transform(df["name"])

# Token counts for the category label.
count_cat = CountVectorizer()
cat = count_cat.fit_transform(df["category_name"])

# Disabled alternative: TF-IDF encoding of item_description (`desc` currently
# comes from the WordBatch cell instead).
# desc_tfidf = TfidfVectorizer(max_features = MAX_FEAT, 
#                               ngram_range = (1,3),
#                               token_pattern=r'\b\w+\b',
#                               stop_words = "english")
# desc = desc_tfidf.fit_transform(df["item_description"])

# Sparse one-hot encoding of the brand label.
brand_bin = LabelBinarizer(sparse_output=True)
brand = brand_bin.fit_transform(df["brand_name"])

# Dummy-encode the item condition / shipping columns and wrap the values as CSR.
condition_shipping = df[["item_condition_id", "shipping"]]
dummy_frame = pd.get_dummies(condition_shipping, sparse = True)
dummies = scipy.sparse.csr_matrix(dummy_frame.values)

# Column-wise concatenation of every feature group; rows stay in df order.
feature_blocks = [dummies, brand, desc, cat, names]
X = scipy.sparse.hstack(feature_blocks).tocsr()
print('[{}] Finished vectorization and sparse matrix stacking'.format(time.time() - start_time))

In [None]:
# Drop feature columns that are non-zero in at most one row, then split the
# stacked matrix back into its train and test parts by row position.
mask = X.getnnz(axis=0) > 1
X = X[:, mask]
X_train, X_test = X[:nrow_train], X[nrow_train:]

In [None]:
def get_rmsle(y_true, y_pred):
    """Return the RMSLE of price predictions that are given in log1p space.

    Both arguments hold log1p(price) values. The RMSLE of the prices is the
    plain RMSE of those log-space values, so it is computed directly instead of
    converting back with expm1 and calling mean_squared_log_error. The round
    trip raised ValueError whenever a log-space prediction was negative, because
    expm1 then yields a negative price, which mean_squared_log_error rejects.

    Args:
        y_true: array-like of true log1p(price) values.
        y_pred: array-like of predicted log1p(price) values, same shape.

    Returns:
        The root mean squared difference as a numpy float.

    Raises:
        ValueError: if the two inputs do not have the same shape.
    """
    # np.asarray discards any pandas index so values are compared by position.
    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)
    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got {} and {}".format(
                true_arr.shape, pred_arr.shape))
    return np.sqrt(np.mean(np.square(true_arr - pred_arr)))

In [None]:
# 3-fold CV over the Ridge regularisation strength; runs only when `testing` is True.
if testing:
    results = []
    #alphas = [0.5, 1, 1.5]
    alphas = [1]
    cv = KFold(n_splits=3, shuffle=True, random_state=42)
    for a in alphas:
        a_results = []
        for train_ids, valid_ids in cv.split(X_train):
            # `normalize` is intentionally not passed: False was its default and
            # the parameter was removed from Ridge in scikit-learn 1.2, where
            # passing it raises TypeError.
            model = Ridge(
                solver='auto',
                fit_intercept=True,
                alpha=a,
                max_iter=100,
                tol=0.05)
            model.fit(X_train[train_ids], y_train[train_ids])
            y_pred_valid = model.predict(X_train[valid_ids])
            # Arguments follow get_rmsle's (y_true, y_pred) signature.
            rmsle = get_rmsle(y_train[valid_ids], y_pred_valid)
            a_results.append(rmsle)
        mean_rmsle = np.mean(a_results)
        results.append(mean_rmsle)
        print('alpha is: %f, rmsle: %f' % (a, mean_rmsle))
# Recorded result from an earlier run: best alpha is 1, best rmsle is 0.467923

In [None]:
# 3-fold CV over the LightGBM learning rate; runs only when `testing` is True.
if testing:
    results = []
    lrs = [.75,.6,.5]
    cv = KFold(n_splits=3, shuffle=True, random_state=42)
    for lr in lrs:
        a_results = []
        for train_ids, valid_ids in cv.split(X_train):
            d_train = lgb.Dataset(X_train[train_ids], label=y_train[train_ids])
            params = {
                'learning_rate': lr,
                'application': 'regression',
                'max_depth': 3,
                'num_leaves': 100,
                'verbosity': -1,
                'metric': 'RMSE',
            }
            # No `verbose_eval` here: no validation set is passed to train(), so
            # there is nothing to report, and the argument was removed from
            # lgb.train in LightGBM 4.
            model = lgb.train(params, train_set=d_train, num_boost_round=3200)
            y_pred_valid = model.predict(X_train[valid_ids])
            # Arguments follow get_rmsle's (y_true, y_pred) signature.
            rmsle = get_rmsle(y_train[valid_ids], y_pred_valid)
            a_results.append(rmsle)
        mean_rmsle = np.mean(a_results)
        results.append(mean_rmsle)
        print('lr is: %f, rmsle: %f' % (lr, mean_rmsle))

In [None]:
#submission
#ridge
# Fit Ridge on the full training matrix and predict log1p(price) for the test rows.
# `normalize` is intentionally not passed: False was its default and the
# parameter was removed from Ridge in scikit-learn 1.2, where passing it raises
# TypeError.
model = Ridge(
    solver='sag',
    fit_intercept=True,
    alpha=1,
    max_iter=300,
    tol=0.05)
model.fit(X_train, y_train)
predsR = model.predict(X_test)
print('[{}] Finished Ridge'.format(time.time() - start_time))

In [None]:
#FTRL
# Fit wordbatch's FTRL model on the training matrix and predict the test rows.
# D is set to the number of feature columns in X.
ftrl_settings = dict(
    alpha=0.01,
    beta=0.1,
    L1=0.00001,
    L2=1.0,
    D=X.shape[1],
    iters=47,
    inv_link="identity",
    threads=1,
)
model = FTRL(**ftrl_settings)
model.fit(X_train, y_train)
predsF = model.predict(X_test)
print('[{}] Finished FTRL'.format(time.time() - start_time))

In [None]:
#FM_FTRL
# Fit wordbatch's FM_FTRL model on the training matrix and predict the test rows.
# D is set to the number of feature columns in X; the *_fm arguments configure
# the model's FM part.
fm_ftrl_settings = dict(
    alpha=0.01,
    beta=0.1,
    L1=0.00001,
    L2=0.1,
    D=X.shape[1],
    alpha_fm=0.01,
    L2_fm=0.0,
    init_fm=0.01,
    D_fm=200,
    e_noise=0.0001,
    iters=18,
    inv_link="identity",
    threads=4,
)
model = FM_FTRL(**fm_ftrl_settings)
model.fit(X_train, y_train)
predsFM = model.predict(X_test)
print('[{}] Finished FM_FTRL'.format(time.time() - start_time))

In [None]:
#lgbm
# Keep only the columns that are non-zero in more than 100 rows for LightGBM.
# This rebinds X, X_train and X_test, so any cell run after this one sees the
# reduced feature set.
mask = X.getnnz(axis=0) > 100
X = X[:, mask]
X_train, X_test = X[:nrow_train], X[nrow_train:]
d_train = lgb.Dataset(X_train, label=y_train)
params = dict(
    learning_rate=0.57,
    application='regression',
    max_depth=5,
    num_leaves=31,
    verbosity=-1,
    metric='RMSE',
    data_random_seed=1,
    bagging_fraction=0.6,
    bagging_freq=5,
    feature_fraction=0.65,
    nthread=4,
    min_data_in_leaf=100,
    max_bin=31,
)
# NOTE(review): the only validation set is the training set itself, so the
# early-stopping criterion and the progress log both track training RMSE.
train_options = dict(
    train_set=d_train,
    num_boost_round=4500,
    early_stopping_rounds=1000,
    valid_sets=d_train,
    verbose_eval=1000,
)
model_lg = lgb.train(params, **train_options)
predsL = model_lg.predict(X_test)
print('[{}] Finished LGBM'.format(time.time() - start_time))

In [None]:
#weights
# Blend weights for the Ridge, LightGBM, FTRL and FM_FTRL predictions.
r_w, l_w, f_w, fm_w = .05, .22, .05, .68
# The weighted average is taken in log1p space, then mapped back to prices with expm1.
preds = np.expm1(r_w*predsR +l_w*predsL + f_w*predsF + fm_w*predsFM)

In [None]:
# Pair each test_id with its blended price and write the result to preds.csv.
submission_columns = {'test_id': df_test.test_id, 'price': preds}
out = pd.DataFrame(submission_columns)
out.to_csv('preds.csv', index=False)
print('[{}] Wrote predictions'.format(time.time() - start_time))