In [None]:
from datetime import datetime
import numpy as np
import pandas as pd
from scipy import sparse

from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import xgboost as xgb
import math
from sklearn.metrics import mean_squared_error
from sklearn.grid_search import GridSearchCV

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(42)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

# from subprocess import check_output
# print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

In [None]:
%%time

# Locations of the tab-separated train and test tables.
train_path = '../input/train.tsv'
test_path = '../input/test.tsv'

trainData = pd.read_table(train_path)
testData = pd.read_table(test_path)

# Show (rows, columns) of each table.
train_shape, test_shape = trainData.shape, testData.shape
print(train_shape, test_shape)

In [None]:
# Remove training rows whose price is below 3.0, then show the remaining shape.
is_underpriced = trainData.price < 3.0
trainData = trainData.drop(trainData.loc[is_underpriced].index)
trainData.shape

In [None]:
%%time
# get name and description lengths
def wordCount(text):
    """Count the whitespace-separated words in ``text``.

    Parameters
    ----------
    text : object
        Normally a string. Any other value (for example the float NaN that
        pandas uses for an empty cell) is treated as containing no words.

    Returns
    -------
    int
        0 for non-string input or for the placeholder
        'No description yet'; otherwise the number of non-empty tokens
        obtained by splitting on whitespace.
    """
    # Check the type explicitly instead of wrapping everything in a bare
    # ``except``, which would also hide unrelated errors.
    if not isinstance(text, str):
        return 0
    if text == 'No description yet':
        return 0
    # split() without an argument collapses runs of whitespace, so an empty
    # string or repeated spaces do not produce phantom "words".
    return len(text.split())
# Add word-count columns for the description and the name to both tables.
trainData['descLen'] = trainData['item_description'].apply(wordCount)
testData['descLen'] = testData['item_description'].apply(wordCount)
trainData['nameLen'] = trainData['name'].apply(wordCount)
testData['nameLen'] = testData['name'].apply(wordCount)

# Preview the first rows with the new columns.
trainData.head()

In [None]:
%%time
# split category name into 3 parts
def split_cat(text):
    """Split a '/'-separated category path into exactly three levels.

    Parameters
    ----------
    text : object
        Category path such as ``"Men/Tops/T-shirts"``. Non-string values
        (e.g. NaN for a missing category) yield three "No Label" entries.

    Returns
    -------
    tuple of str
        Always three elements, so the result can be unpacked into three
        columns. Paths with more than three levels keep only the first
        three; shorter paths are padded with "No Label".
    """
    # Explicit type check replaces the bare ``except`` that used to catch the
    # AttributeError raised by calling .split on NaN.
    if not isinstance(text, str):
        return ("No Label", "No Label", "No Label")
    levels = text.split("/")[:3]
    # Pad short paths so every row has the same number of levels.
    levels += ["No Label"] * (3 - len(levels))
    return tuple(levels)
# Split every category path, then transpose the per-row results into the
# three subcategory columns of each table.
train_levels = trainData['category_name'].apply(split_cat)
trainData['subcat_1'], trainData['subcat_2'], trainData['subcat_3'] = zip(*train_levels)

test_levels = testData['category_name'].apply(split_cat)
testData['subcat_1'], testData['subcat_2'], testData['subcat_3'] = zip(*test_levels)

In [None]:
%%time
# Collect every brand_name value that occurs in either table; this set is the
# lookup used by brandfinder.
fullData = pd.concat([trainData, testData])
brands = set(fullData['brand_name'].values)

# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# the attribute accessor (frame.brand_name) is chained in-place mutation: it
# only works when the accessor happens to return a view, and it silently does
# nothing under pandas Copy-on-Write.
trainData['brand_name'] = trainData['brand_name'].fillna(value="missing")
testData['brand_name'] = testData['brand_name'].fillna(value="missing")

# Training rows without a brand before the name-based lookup runs.
missing = len(trainData.loc[trainData['brand_name'] == 'missing'])
def brandfinder(line, known_brands=None):
    """Try to recover a listing's brand from its name.

    Parameters
    ----------
    line : sequence
        ``(brand_name, name)`` for one listing, e.g. a row of
        ``frame[['brand_name', 'name']]`` passed by ``apply(..., axis=1)``.
    known_brands : collection of str, optional
        Brands to match against. Defaults to the module-level ``brands``
        set, so existing one-argument calls behave as before.

    Returns
    -------
    object
        * ``name`` when the whole name is itself a known brand;
        * otherwise, if the brand is 'missing', the first word of ``name``
          that is a known brand;
        * otherwise the original brand unchanged.
    """
    if known_brands is None:
        known_brands = brands
    # Unpack by position without integer-indexing a labelled Series.
    brand, name = tuple(line)[:2]
    # A missing name cannot be split; keep whatever brand is there.
    if not isinstance(name, str):
        return brand
    # Check the full name first so multi-word brands are not shadowed by one
    # of their own words.
    if name in known_brands:
        return name
    if brand == 'missing':
        for word in name.split(' '):
            if word in known_brands:
                # Return the matched brand, not the whole listing name.
                return word
    return brand
# Run the name-based brand lookup row by row on both tables.
train_brand_inputs = trainData[['brand_name', 'name']]
trainData['brand_name'] = train_brand_inputs.apply(brandfinder, axis=1)
test_brand_inputs = testData[['brand_name', 'name']]
testData['brand_name'] = test_brand_inputs.apply(brandfinder, axis=1)

# Report how many fewer training rows are labelled 'missing' than before.
still_missing = len(trainData.loc[trainData['brand_name'] == 'missing'])
found = missing - still_missing
print(found)

In [None]:
%%time
# The log1p-scaled target is not added here; it is attached to the train and
# dev tables after the feature matrix has been built.

# Split the training rows into train and dev, keeping 90% for training.
trainData, devData = train_test_split(trainData, train_size=0.9, random_state=42)

# Row counts of each split; used later to slice the stacked feature matrix.
n_trains, n_devs, n_tests = len(trainData), len(devData), len(testData)
print("Training on", n_trains, "examples")
print("Validating on", n_devs, "examples")
print("Testing on", n_tests, "examples")

# Stack train, dev and test (in that order) so features are built in one pass.
fullData = pd.concat([trainData, devData, testData])

In [None]:
%%time

# Filling missing values
def fill_missing_values(df):
    """Replace missing text fields of ``df`` with the literal 'missing'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``category_name``, ``brand_name`` and
        ``item_description``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place: NaN in the three columns
        becomes 'missing', and the placeholder 'No description yet' in
        ``item_description`` is also mapped to 'missing'.
    """
    # Assign each result back to the frame. The previous
    # ``df.col.fillna(..., inplace=True)`` form mutated whatever the attribute
    # accessor returned, which is not guaranteed to be the frame's own data
    # and does nothing under pandas Copy-on-Write.
    for column in ('category_name', 'brand_name', 'item_description'):
        df[column] = df[column].fillna(value="missing")
    df['item_description'] = df['item_description'].replace('No description yet', "missing")
    return df

print("Filling missing data ...")
fullData = fill_missing_values(fullData)

# Spot-check the category stored under index label 1. The concatenated index
# was not reset, so this lookup may match more than one row.
print(fullData['category_name'][1])

In [None]:
%%time

print("Processing categorical data...")
le = LabelEncoder()

# category_name keeps its text; its integer codes go into a new 'category' column.
fullData['category'] = le.fit_transform(fullData.category_name)

# The remaining categorical columns are overwritten with their integer codes.
# The encoder is refitted for each column, so codes are per-column.
fullData['brand_name'] = le.fit_transform(fullData.brand_name)
fullData['subcat_1'] = le.fit_transform(fullData.subcat_1)
fullData['subcat_2'] = le.fit_transform(fullData.subcat_2)
fullData['subcat_3'] = le.fit_transform(fullData.subcat_3)

del le

In [None]:
%%time

print("Handling missing values...")

# Columns that receive a default for any remaining gaps before the str cast.
fill_defaults = (
    ('category_name', 'missing'),
    ('brand_name', 'missing'),
    ('item_description', 'No description yet'),
)
for column, default in fill_defaults:
    fullData[column] = fullData[column].fillna(default).astype(str)

# Columns that are cast to str as they are.
for column in ('subcat_1', 'subcat_2', 'subcat_3', 'shipping',
               'item_condition_id', 'descLen', 'nameLen'):
    fullData[column] = fullData[column].astype(str)

In [None]:
%%time

print("Vectorizing data...")

# Text preprocessor taken from a CountVectorizer built with default settings.
default_preprocessor = CountVectorizer().build_preprocessor()


def build_preprocessor(field):
    """Return a preprocessor that extracts one column from a row of values.

    ``field`` is a column name of ``fullData``. Its position in
    ``fullData.columns`` is resolved once, when this function is called; the
    returned callable indexes each row at that position and passes the value
    through ``default_preprocessor``.
    """
    column_position = fullData.columns.tolist().index(field)

    def preprocess_row(row):
        return default_preprocessor(row[column_position])

    return preprocess_row

# One vectorizer per column, all fed the same row array; each one's
# preprocessor selects its own column. FeatureUnion concatenates the outputs.
vectorizer = FeatureUnion([
    # Item name: unigrams and bigrams.
    ('name', CountVectorizer(
        ngram_range=(1, 2),
        max_features=100000,
        stop_words='english',
        preprocessor=build_preprocessor('name'))),
    # token_pattern '.+' makes the whole field value a single token.
    ('category_name', CountVectorizer(
        token_pattern='.+',
        max_features=20000,
        stop_words='english',
        preprocessor=build_preprocessor('category_name'))),
    ('subcat_1', CountVectorizer(
        token_pattern='.+',
        stop_words='english',
        preprocessor=build_preprocessor('subcat_1'))),
    ('subcat_2', CountVectorizer(
        token_pattern='.+',
        stop_words='english',
        preprocessor=build_preprocessor('subcat_2'))),
    ('subcat_3', CountVectorizer(
        token_pattern='.+',
        stop_words='english',
        max_features=20000,
        preprocessor=build_preprocessor('subcat_3'))),
    ('brand_name', CountVectorizer(
        token_pattern='.+',
        stop_words='english',
        preprocessor=build_preprocessor('brand_name'))),
    # Raw strings: '\d' inside a normal string literal is an invalid escape
    # sequence (deprecated, a SyntaxWarning on recent Python). The regex value
    # itself is unchanged.
    ('shipping', CountVectorizer(
        token_pattern=r'\d+',
        preprocessor=build_preprocessor('shipping'))),
    ('item_condition_id', CountVectorizer(
        token_pattern=r'\d+',
        preprocessor=build_preprocessor('item_condition_id'))),
    # Description: TF-IDF weighted 1- to 3-grams.
    ('item_description', TfidfVectorizer(
        ngram_range=(1, 3),
        max_features=20000,
        stop_words='english',
        preprocessor=build_preprocessor('item_description'))),
])

# Fit on, and transform, every row of the stacked train/dev/test table.
X = vectorizer.fit_transform(fullData.values)

In [None]:
# Append the two length features as extra columns of the sparse matrix.
# DataFrame.as_matrix() is deprecated and removed in later pandas releases;
# .values returns the same ndarray.
length_features = fullData[['nameLen', 'descLen']].astype(float).values
X = sparse.hstack((X, length_features), format='csr')

# Regression target: log1p of the price.
trainData["target"] = np.log1p(trainData.price)
devData["target"] = np.log1p(devData.price)

# fullData was stacked as train, dev, test, so the rows of X are sliced back
# in that order using the stored row counts.
X_train = X[:n_trains]
Y_train = trainData.target.values.reshape(-1, 1)

X_dev = X[n_trains:n_trains+n_devs]
Y_dev = devData.target.values.reshape(-1, 1)

X_test = X[n_trains+n_devs:]

print(X.shape, X_train.shape, X_dev.shape, X_test.shape)

In [None]:
# del trainData
# del testData
# del fullData

In [None]:
%%time
%env JOBLIB_TEMP_FOLDER=/tmp

xgb_model = xgb.XGBRegressor()

# Every parameter lists a single candidate, so the grid holds one combination.
xgb_parameters = {
    'n_estimators': [100],
    'subsample': [0.5],
    'colsample_bytree': [0.1],
    'colsample_bylevel': [0.1],
    'reg_lambda': [0.7],
    'reg_alpha': [0.3],
    'seed': [42],
}

xgb_clf = GridSearchCV(
    xgb_model,
    xgb_parameters,
    n_jobs=-1,
    cv=3,
    scoring='neg_mean_squared_error',
)
xgb_clf.fit(X_train, Y_train)

# Mean squared error on the log1p-scaled target for train and dev.
xgb_train_mse = mean_squared_error(Y_train, xgb_clf.predict(X_train))
xgb_dev_mse = mean_squared_error(Y_dev, xgb_clf.predict(X_dev))
print('XGBoost training score: ', xgb_train_mse)
print('XGBoost validation score: ', xgb_dev_mse)

In [None]:
# xgb_pred_test = np.expm1(xgb_clf.predict(X_test))

# submissionData = pd.DataFrame({
#         "test_id": testData.test_id,
#         "price": xgb_pred_test.reshape(-1),
# })

# submissionData.to_csv("./xgb_submission_first.csv", index=False)

In [None]:
%%time
ridge_model = Ridge(
    alpha=[10.0],
    fit_intercept=True,
    normalize=False,
    solver='sag',
    tol=0.05,
    random_state=42,
)
ridge_model.fit(X_train, Y_train)

# Mean squared error on the log1p-scaled target for train and dev.
ridge_train_mse = mean_squared_error(Y_train, ridge_model.predict(X_train))
ridge_dev_mse = mean_squared_error(Y_dev, ridge_model.predict(X_dev))
print('Ridge training score: ', ridge_train_mse)
print('Ridge validation score: ', ridge_dev_mse)

In [None]:
# ridge_pred_test = np.expm1(ridge_model.predict(X_test))

# submissionData = pd.DataFrame({
#         "test_id": testData.test_id,
#         "price": ridge_pred_test.reshape(-1),
# })

# submissionData.to_csv("./ridge_submission_first.csv", index=False)

In [None]:
%%time

# Map the log1p-scale predictions back to prices with expm1.
# Both models were fitted on a column-vector target (reshape(-1, 1)). Ridge
# then predicts shape (n, 1) while the XGBoost wrapper predicts shape (n,), so
# the shape assertion in aggregate_predicts2 would fail. Flatten every
# prediction to 1-D so the two models can be blended.
xgb_pred_dev = np.expm1(xgb_clf.predict(X_dev)).reshape(-1)
ridge_pred_dev = np.expm1(ridge_model.predict(X_dev)).reshape(-1)

xgb_pred_test = np.expm1(xgb_clf.predict(X_test)).reshape(-1)
ridge_pred_test = np.expm1(ridge_model.predict(X_test)).reshape(-1)

def aggregate_predicts2(Y1, Y2, ratio):
    """Blend two prediction arrays as a weighted sum.

    ``Y1`` is weighted by ``ratio`` and ``Y2`` by ``1.0 - ratio``. Both
    arrays must have identical shapes (checked with an assertion).
    """
    assert Y1.shape == Y2.shape
    first_share = Y1 * ratio
    second_share = Y2 * (1.0 - ratio)
    return first_share + second_share

# Search for the XGBoost weight that gives the lowest dev-set error.
# Y_dev is on the log1p scale while the blended predictions are prices, so the
# blend is mapped back with log1p before scoring; comparing the two scales
# directly would make the score meaningless.
best = 0
# Start from +inf so the first candidate always becomes the current best,
# whatever the magnitude of the error.
lowest = float('inf')
# 0.00, 0.01, ..., 1.00 inclusive, so pure-XGBoost (ratio 1.0) is also tried.
for i in range(101):
    # Divide rather than multiply by 0.01 to avoid values like 0.29000000000000004.
    r = i / 100.0
    Y_dev_preds = aggregate_predicts2(xgb_pred_dev, ridge_pred_dev, r)
    fpred = mean_squared_error(Y_dev, np.log1p(Y_dev_preds))
    if fpred < lowest:
        best = r
        lowest = fpred
    print(str(r) + " - score for XGBoost + Ridge on dev set:", fpred)


In [None]:
# Blend the test-set predictions using the ratio selected on the dev set.
weighted_preds = aggregate_predicts2(xgb_pred_test, ridge_pred_test, best)

# Pair each test id with its blended price and write the submission file.
submission_columns = {
    "test_id": testData.test_id,
    "price": weighted_preds.reshape(-1),
}
submissionData = pd.DataFrame(submission_columns)
submissionData.to_csv("./ridge_xgb_weighted_submission.csv", index=False)