In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the tab-separated training and test sets from the input directory.
train, test = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('../input/train.tsv', '../input/test.tsv')
)

In [None]:
# Preview the first rows of the raw training frame.
train.head()

In [None]:
# Work on a copy so the cleaning steps below do not modify the loaded frame.
df = train.copy()

In [None]:
# Summary statistics for the brand_name column.
df["brand_name"].describe()

## let's check the state of the data

In [None]:
# Print, for every column, the fraction of rows whose value is missing.
for col_name in df.columns:
    print(col_name, int(df[col_name].isnull().sum()) / len(df))

## brand_name has a lot of missing values, but whether a brand is given at all may still be a useful signal

In [None]:
# Collapse brand_name into a binary flag: 1 when a (truthy) brand value is
# present, 0 when it is missing.
df["brand_name"] = df["brand_name"].fillna(0).map(bool).astype(int)
df.head()

## what proportion of the item_description column is just the "No description yet" placeholder?

In [None]:
# Share of rows whose description equals the "No description yet" placeholder.
int((df["item_description"] == "No description yet").sum()) / len(df)

## how many categories?

In [None]:
# Summary statistics (count, unique, top, freq) for the category column.
df["category_name"].describe()

## there are a lot of unique values. maybe we can merge this column with the description and name?

In [None]:
# Build a single free-text field out of name, category and description.
# The "No description yet" placeholder carries no information, so it is blanked.
# Any of the three columns may hold NaN; string concatenation with NaN yields
# NaN and would wipe out the whole text of that row, so missing parts are
# replaced by empty strings before joining.
description = df["item_description"].fillna("").replace("No description yet", "")
df["text"] = (
    df["name"].fillna("")
    + " " + df["category_name"].fillna("")
    + " " + description
)
# The source columns are now redundant.
df = df.drop(columns=["name", "item_description", "category_name"])

## one-hot encode the condition variable

In [None]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode item_condition_id into condition_<id> indicator columns.
# - The encoding is done by pandas; the OneHotEncoder instance was never used
#   and its `sparse=` keyword is not accepted by current scikit-learn, so it
#   is no longer constructed.
# - Column names are derived from the ids actually present instead of assuming
#   exactly the values 1..5.
# - The dummies keep df's index, so the join aligns row by row.
# - Integer dtype keeps the later `.values` feature matrix numeric.
print("unique values: {}".format(len(df["item_condition_id"].unique())))
dummy_df = pd.get_dummies(df["item_condition_id"], prefix="condition", dtype=int)
new_cols = list(dummy_df.columns)
df = df.drop("item_condition_id", axis=1).join(dummy_df)
# Show a preview rather than the whole frame.
df.head()

## let's make it all lowercase and re-index

In [None]:
# Use train_id as the row index and lowercase the combined text column.
df = df.set_index("train_id")
df["text"] = df["text"].str.lower()
df.head()

## let's make a very simple model using only condition, brand name and shipping

In [None]:
from sklearn.model_selection import KFold, train_test_split
from sklearn.svm import SVR
from sklearn import metrics

In [None]:
# Feature matrix: every column except the target and the free text.
# "name" is not listed: it was already dropped when the text column was
# built, so asking to drop it again raised a KeyError.
X = df.drop(columns=["price", "text"]).values
y = df["price"].values
# Fixed seed so the split, and every score measured on it, is reproducible.
Xtr, Xte, ytr, yte = train_test_split(X, y, random_state=42)

In [None]:
# Sanity check: feature and target arrays of the training split.
print(Xtr.shape, ytr.shape)

## we should scale the target data

In [None]:
# Standardise the target with statistics taken from the training split only.
# The scaler must be created here: no `scaler` exists yet at this point of the
# notebook, so on a fresh kernel the cell failed with a NameError.
scaler = StandardScaler()
ytr_scaled = scaler.fit_transform(ytr.reshape(-1, 1))
yte_scaled = scaler.transform(yte.reshape(-1, 1))

In [None]:
## now fit the model and see how it does

In [None]:
import xgboost as xgb
from xgboost import XGBRegressor as XGBR
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Wrap both splits in DMatrix objects; the labels are the standardised prices.
xgtrain = xgb.DMatrix(Xtr, label=ytr_scaled)
xgtest = xgb.DMatrix(Xte, label=yte_scaled)

# Booster settings. No "objective" key is set, so the library default applies.
params = dict(eta=0.1, max_depth=6, nthread=7)

# Named evaluation sets and the number of boosting rounds used for training.
watchlist = [(xgtrain, "train"), (xgtest, "test")]
num_round = 50

In [None]:
# Train the booster, reporting the metric on every watchlist entry.
bst = xgb.train(params, xgtrain, num_boost_round=num_round, evals=watchlist)

In [None]:
# Predictions (still on the standardised scale) and true prices as column vectors.
predictions = np.reshape(bst.predict(xgtest), (-1, 1))
yte = np.reshape(yte, (-1, 1))

In [None]:
# Spot-check every 10,000th test row: predicted price (mapped back to the
# original scale) next to the actual price.
# inverse_transform requires a 2-D array, so the whole prediction column is
# converted once up front instead of passing 1-D rows one at a time.
predicted_prices = scaler.inverse_transform(predictions.reshape(-1, 1))
for i in range(0, len(predictions), 10000):
    print("predicted: {}, actual: {}".format(predicted_prices[i], yte[i]))

In [None]:
# Baseline: k-nearest-neighbours regressor fitted on the unscaled prices,
# scored by mean squared error on the held-out split.
knn = KNeighborsRegressor()
knn.fit(Xtr, ytr)
knn_predictions = knn.predict(Xte)
print(metrics.mean_squared_error(yte, knn_predictions))

## now let's make a word-embedding model

In [None]:
# Reminder of what the prepared frame looks like before building the text model.
df.head()

In [None]:
# Force every entry of the text column to a Python str so the tokenizer
# only ever sees strings.
df["text"] = df["text"].map(str)

In [None]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences

In [None]:
from random import sample, seed
# 75/25 random split of row positions for the text model.
# The position lists get their own names: reusing `train` / `test` here
# overwrote the raw DataFrames loaded at the top of the notebook, so earlier
# cells could not be re-run afterwards.
seed(42)  # fixed seed so the split is reproducible
train_idx = sample(range(len(df)), int(0.75 * len(df)))
test_idx = sorted(set(range(len(df))).difference(train_idx))
# Slice first, then copy, so only the selected rows are duplicated.
dftr = df.iloc[train_idx].copy()
dfte = df.iloc[test_idx].copy()

In [None]:
# Inspect the training partition.
dftr

## first step is to fit the tokenizer on the training text

In [None]:
# Build the vocabulary from the training text only, so the test rows do not
# influence the word index.
tokenizer = Tokenizer() # removes symbols by default
tokenizer.fit_on_texts(dftr["text"])

## now we can convert the words to tokens

In [None]:
# Encode each training text as a list of integer word ids.
dftr["tokens"] = tokenizer.texts_to_sequences(dftr["text"])

In [None]:
# Encode the test texts with the tokenizer that was fitted on the training text.
dfte["tokens"] = tokenizer.texts_to_sequences(dfte["text"])

## since each row has text of differing lengths, we should standardise lengths

In [None]:
# Number of tokens in every training sequence, used to choose a padding length.
lengths = dftr["tokens"].apply(len)

In [None]:
# Mean and median sequence length, then the share of sequences above 100 tokens.
print(lengths.mean())
print(lengths.median())
print(int((lengths > 100).sum()) / len(lengths))
# author's observation: only 5% over 100 in length

In [None]:
# Bring every token sequence to a common length of `maxlen` so the rows can
# be stacked into a rectangular array.
maxlen = 100
Xtr_tokens = pad_sequences(dftr["tokens"].values, maxlen=maxlen)
Xte_tokens = pad_sequences(dfte["tokens"].values, maxlen=maxlen)

In [None]:
# List the columns currently present in the training partition.
dftr.columns

In [None]:
# Everything that is neither the target nor text-derived is kept as a plain
# feature array for each partition.
non_feature_cols = ["price", "text", "tokens"]
Xtr_other = dftr.drop(columns=non_feature_cols).values
Xte_other = dfte.drop(columns=non_feature_cols).values

In [None]:
# Final design matrices: the non-text feature columns first, followed by the
# padded token ids.
Xtr = np.hstack([Xtr_other, Xtr_tokens])
Xte = np.hstack([Xte_other, Xte_tokens])

In [None]:
# Prices as column vectors, standardised with statistics from the training
# rows only; the fitted scaler is kept to map predictions back later.
ytr, yte = (part["price"].values.reshape(-1, 1) for part in (dftr, dfte))
scaler = StandardScaler().fit(ytr)
ytr_s = scaler.transform(ytr)
yte_s = scaler.transform(yte)

In [None]:
# Shapes of the train/test inputs and scaled targets, in that order.
[arr.shape for arr in (Xtr, ytr_s, Xte, yte_s)]

## now we have training and test data, with all the data encoded, and price as the target variable

## we can build a FastText regressor with Keras

In [None]:
# Embedding settings: vocabulary size is the largest id found in the training
# matrix plus one, and each id is mapped to a 20-dimensional vector.
# NOTE(review): Xtr also contains the non-text feature columns, so they pass
# through the embedding as if they were token ids — confirm this is intended.
embedding_dims = 20
input_dim = Xtr.max() + 1
print("input dim:", input_dim)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, GlobalAveragePooling1D, Embedding
from keras.callbacks import EarlyStopping

In [None]:
# FastText-style regressor: embed every id, average the embeddings over the
# sequence, then map the averaged vector to a single output.
model = Sequential()
model.add(Embedding(input_dim=input_dim, output_dim=embedding_dims))
model.add(GlobalAveragePooling1D())
model.add(Dense(1))

# Optimise squared error on the scaled price; also track MSE and MAE.
model.compile(
    optimizer="adam",
    loss="mean_squared_error",
    metrics=["mse", "mae"],
)

In [None]:
# Fit on the first 100,000 training rows and validate on the first 10,000
# test rows; stop once val_loss has not improved for two epochs.
early_stopping = EarlyStopping(patience=2, monitor="val_loss")
train_data = model.fit(
    Xtr[:100000],
    ytr_s[:100000],
    batch_size=100,
    epochs=10,
    validation_data=(Xte[:10000], yte_s[:10000]),
    callbacks=[early_stopping],
    verbose=1,
)

In [None]:
import matplotlib.pyplot as plt
% matplotlib inline

In [None]:
plt.rcParams["figure.figsize"] = (12, 9)
# One curve per recorded metric. Each curve is labelled so the training and
# validation series can be told apart; previously they were all drawn unlabelled.
for metric_name, values in train_data.history.items():
    plt.plot(range(len(values)), values, label=metric_name)
plt.xlabel("epoch")
plt.ylabel("metric value")
plt.legend()
plt.show()

In [None]:
# Predict on the full test matrix and map the outputs back from the
# standardised scale to the original price scale.
preds = scaler.inverse_transform(model.predict(Xte))

In [None]:
# Log of the absolute relative error; the small constant in the denominator
# avoids division by zero when the true price is 0.
relative_error = (preds - yte) / (yte + 0.0001)
errors = np.log(np.abs(relative_error))

In [None]:
# Largest log relative error over the test rows.
errors.max()

In [None]:
# Bar chart of the log-error distribution over 25 equal-width bins; bars are
# centred on each bin and drawn at 70% of the bin width.
hist, bins = np.histogram(errors, bins=25)
center = (bins[1:] + bins[:-1]) / 2
width = (bins[1] - bins[0]) * 0.7
plt.bar(center, hist, width=width, align='center')
plt.show()

In [None]:
# Display the array of log relative errors.
errors