In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_log_error
from sklearn.grid_search import GridSearchCV
import xgboost as xgb
import lightgbm as lgb

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# Show which files are present in the ../input directory.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

sample_submission.csv
test.tsv
train.tsv





In [2]:
# Read the train and test tables (in that order) from the input directory.
df_train, df_test = (
    pd.read_table(tsv_path)
    for tsv_path in ('../input/train.tsv', '../input/test.tsv')
)

In [3]:
#params
# How many of the most frequent brand names keep their own label; all other
# brands are relabelled "Other" during preprocessing.
NUM_BRANDS = 2000
# How many of the most frequent category names keep their own label; all other
# categories are relabelled "Other" during preprocessing.
NUM_CAT = 1000
# Passed as max_features to the TfidfVectorizer fitted on item_description.
# NOTE(review): 3 keeps only three TF-IDF terms -- this looks like a debugging
# value rather than a tuned one; confirm before relying on the results.
MAX_FEAT = 3

In [4]:
# Stack train on top of test so the cleaning below is applied to both at once.
# `axis` is passed by keyword: pandas 2.0 made every concat argument except
# `objs` keyword-only, so the positional form `pd.concat(frames, 0)` raises
# TypeError there. The keyword form behaves the same on older versions.
df = pd.concat([df_train, df_test], axis=0)
# The first nrow_train rows of df are the training rows.
nrow_train = df_train.shape[0]
# The regression target is modelled in log space: log1p(price).
y_train = np.log1p(df_train["price"])

# Only df, nrow_train and y_train are needed from here on.
# Note: deleting df_train means this cell cannot be re-run without first
# re-running the cell that loads the data.
del df_train

# Placeholder labels for missing categorical values.
df["category_name"] = df["category_name"].fillna("Other")
df["brand_name"] = df["brand_name"].fillna("unknown")

# Keep the NUM_BRANDS most frequent real brands. The "unknown" placeholder is
# left out of the ranking, so missing brands are folded into "Other" together
# with the rare ones.
pop_brands = df["brand_name"].value_counts().loc[lambda x: x.index != 'unknown'].index[:NUM_BRANDS]
df.loc[~df["brand_name"].isin(pop_brands), "brand_name"] = "Other"

# Same treatment for categories: the NUM_CAT most frequent ones survive,
# everything else (including the "Other" placeholder itself) becomes "Other".
pop_cat = df['category_name'].value_counts().loc[lambda x: x.index != 'Other'].index[:NUM_CAT]
df.loc[~df['category_name'].isin(pop_cat), 'category_name'] = 'Other'

# Missing descriptions become the literal string "None" so the text
# vectorizer always receives a string.
df["item_description"] = df["item_description"].fillna("None")
# Mark the discrete columns as categoricals.
df["item_condition_id"] = df["item_condition_id"].astype("category")
df["brand_name"] = df["brand_name"].astype("category")
df["category_name"] = df["category_name"].astype("category")

In [5]:
#df.head()

In [6]:
#transformations
# Each raw column is turned into its own sparse block; the blocks are built
# here in the same left-to-right order in which they are stacked into X.

# item_condition_id and shipping, passed through get_dummies.
dummies = scipy.sparse.csr_matrix(
    pd.get_dummies(df[["item_condition_id", "shipping"]], sparse=True).values
)

# One indicator column per brand label.
brand_bin = LabelBinarizer(sparse_output=True)
brand = brand_bin.fit_transform(df["brand_name"])

# TF-IDF over 1- to 3-grams of the description, capped at MAX_FEAT features.
desc_tfidf = TfidfVectorizer(
    max_features=MAX_FEAT,
    ngram_range=(1, 3),
    token_pattern=r'\b\w+\b',
    stop_words="english",
)
desc = desc_tfidf.fit_transform(df["item_description"])

# Token counts of the category string.
count_cat = CountVectorizer()
cat = count_cat.fit_transform(df["category_name"])

# Token counts of the item name, with min_df=10.
count = CountVectorizer(min_df=10)
names = count.fit_transform(df["name"])

# Column order of the final design matrix: dummies | brand | desc | cat | names.
X = scipy.sparse.hstack([dummies, brand, desc, cat, names]).tocsr()

In [7]:
# Rows before nrow_train go to X_train, the remaining rows to X_test.
X_train, X_test = X[:nrow_train], X[nrow_train:]

In [8]:
def get_rmsle(y_true, y_pred):
    """Return the RMSLE of two sets of values given in log1p space.

    Both arguments hold log1p-transformed values (the space the models are
    trained in). The RMSLE of the back-transformed values is
    sqrt(mean((log1p(expm1(t)) - log1p(expm1(p))) ** 2)), and because
    log1p(expm1(v)) == v this is just the RMSE of the log-space values, which
    is what is computed here.

    Computing it directly avoids the expm1 -> mean_squared_log_error round
    trip, which raised ValueError as soon as any log-space value was negative
    (expm1 of it is below zero), and which lost precision for no benefit.

    Parameters
    ----------
    y_true, y_pred : array-like
        Log-space targets and predictions. pandas objects are compared by
        position; their index is ignored.

    Returns
    -------
    numpy.float64
        The root mean squared difference between the two inputs.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes do not match.
    """
    true_log = np.asarray(y_true, dtype=float)
    pred_log = np.asarray(y_pred, dtype=float)

    # Treat a flat vector as a single-output column so that (n,) and (n, 1)
    # inputs can be compared without accidental broadcasting to (n, n).
    if true_log.ndim == 1:
        true_log = true_log.reshape(-1, 1)
    if pred_log.ndim == 1:
        pred_log = pred_log.reshape(-1, 1)

    if true_log.shape != pred_log.shape:
        raise ValueError(
            "y_true and y_pred have inconsistent shapes: %s vs %s"
            % (true_log.shape, pred_log.shape)
        )
    if true_log.size == 0:
        raise ValueError("y_true and y_pred must not be empty")

    return np.sqrt(np.mean((true_log - pred_log) ** 2))

In [None]:
# #3 fold CV ridge
# results = []
# #alphas = [0.5, 1, 1.5]
# alphas = [1]
# cv = KFold(n_splits=3, shuffle=True, random_state=42)
# for a in alphas:
#     a_results = []
#     for train_ids, valid_ids in cv.split(X_train):
#         model = Ridge(
#             solver='auto',
#             fit_intercept=True,
#             alpha=a,
#             max_iter=100,
#             normalize=False,
#             tol=0.05)
#         model.fit(X_train[train_ids], y_train[train_ids])
#         y_pred_valid = model.predict(X_train[valid_ids])
#         rmsle = get_rmsle(y_pred_valid, y_train[valid_ids])
#         a_results.append(rmsle)
#     results.append(np.mean(a_results))
#     print('alpha is: %f, rmsle: %f' % (a, np.mean(a_results)))
# #Best alpha is 1, best rmsle is 0.467923

In [None]:
# #lgbm cv
# results = []
# lrs = [.75,.6,.5]
# cv = KFold(n_splits=3, shuffle=True, random_state=42)
# for lr in lrs:
#     a_results = []
#     for train_ids, valid_ids in cv.split(X_train):
#         d_train = lgb.Dataset(X_train[train_ids], label=y_train[train_ids])
#         params = {
#             'learning_rate': lr,
#             'application': 'regression',
#             'max_depth': 3,
#             'num_leaves': 100,
#             'verbosity': -1,
#             'metric': 'RMSE',
#         }
#         model = lgb.train(params, train_set=d_train, num_boost_round=3200, verbose_eval=100)
#         y_pred_valid = model.predict(X_train[valid_ids])
#         rmsle = get_rmsle(y_pred_valid, y_train[valid_ids])
#         a_results.append(rmsle)
#     results.append(np.mean(a_results))
#     print('lr is: %f, rmsle: %f' % (lr, np.mean(a_results)))

In [10]:
#submission
#ridge
# Fit Ridge on the full training matrix and predict the test rows; both the
# targets and the predictions are in log1p(price) space.
# alpha=1 is the value recorded as best in the commented-out CV cell.
# `normalize` is deliberately not passed: False was its default, and the
# argument was removed in scikit-learn 1.2, where passing it raises TypeError.
# random_state pins the stochastic solvers that solver='auto' may select, so
# repeated runs give the same predictions.
model_r = Ridge(
    solver='auto',
    fit_intercept=True,
    alpha=1,
    max_iter=100,
    tol=0.05,
    random_state=42)
model_r.fit(X_train, y_train)
preds_r = model_r.predict(X_test)

In [None]:
#lgbm
# Fit LightGBM on the full training matrix and predict the test rows; both the
# targets and the predictions are in log1p(price) space.
d_train = lgb.Dataset(X_train, label=y_train)
# NOTE(review): with max_depth=3 a tree can have at most 2**3 = 8 leaves, so
# num_leaves=100 looks unreachable -- confirm which of the two is intended.
params = {
    'learning_rate': .75,
    'application': 'regression',
    'max_depth': 3,
    'num_leaves': 100,
    'verbosity': -1,
    'metric': 'RMSE',
}
# `verbose_eval` is no longer passed: no valid_sets are supplied, so there were
# no evaluation results for it to print, and LightGBM 4.0 removed the argument
# from lgb.train (passing it raises TypeError there).
model_lg = lgb.train(params, train_set=d_train, num_boost_round=3200)
preds_lg = model_lg.predict(X_test)

In [None]:
#weights
ridge_weight, lgbm_weight = .4, .6
# Blend the two models' log-space predictions, then apply expm1 to undo the
# log1p transform applied to the training target.
preds = np.expm1(ridge_weight*preds_r + lgbm_weight*preds_lg)

In [None]:
# Pair every test_id with its predicted price and write the result to
# preds.csv without the index column.
out = pd.DataFrame({
    'test_id': df_test['test_id'],
    'price': preds,
})
out.to_csv('preds.csv', index=False)