# 1) Import Libraries

In [None]:
import numpy as np
import pandas as pd

#visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import operator
from sklearn.model_selection import KFold

# 2) Load data

In [None]:
# Read the tab-separated training and test files into DataFrames
train, test = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('../input/train.tsv', '../input/test.tsv')
)

In [None]:
# Summary statistics for every column of the training frame (include="all")
train.describe(include="all")

# 3) Data Analysis

In [None]:
# List the column names of the training frame
print(train.columns)

In [None]:
# Show five randomly chosen rows to get a feel for the variables
train.sample(5)

Numerical Features: train_id, item_condition_id, price, shipping

Categorical Features: name, category_name, brand_name, item_description

In [None]:
# Count the missing values in each column of the training frame
print(train.isnull().sum())

Around 45% of brand_name is missing. 

# 4) Data Visualization

## Name

In [None]:
from wordcloud import WordCloud

In [None]:
# Build one long text from every item name and render it as a word cloud
all_names = ' '.join(train['name'])
wordcloud = WordCloud(width=1000, height=500).generate(all_names)

plt.figure(figsize=(15, 8))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

## item_condition_id feature

In [None]:
# Bar plot with item_condition_id on the x-axis and price on the y-axis
sns.barplot(x="item_condition_id", y="price", data=train)

## shipping feature

In [None]:
# Bar plot with shipping on the x-axis and price on the y-axis
sns.barplot(x="shipping", y="price", data=train)

# 4) Feature Engineering

## Name

In [None]:
# Derive two length features from the item name in both frames:
# the number of whitespace-separated words and the number of characters
for frame in (train, test):
    item_name = frame['name']
    frame['name_len_words'] = item_name.map(lambda title: len(title.split()))
    frame['name_len_chars'] = item_name.map(len)

In [None]:
# Remove the raw name column from both frames
train, test = (frame.drop(columns='name') for frame in (train, test))

## Category Name

In [None]:
# Map each category name to the number of training rows that carry it
category_counts = train['category_name'].value_counts()
category_dict = category_counts.to_dict()

In [None]:
# Order the (category, count) pairs by ascending count
sorted_category_dict = sorted(category_dict.items(), key=lambda pair: pair[1])

In [None]:
# Category names ordered from most to least frequent.
# sorted_category_dict is sorted by ascending count, and a bare
# `sorted_category_dict[::-1]` statement only builds a reversed copy that is
# thrown away, so the reversal is applied here where the list is built.
sorted_category_list = [category for category, _count in sorted_category_dict[::-1]]

In [None]:
# Fill missing category_name values with categories drawn at random from the
# first 100 entries of sorted_category_list.
# A seeded generator keeps the imputed values identical across reruns, and the
# filler Series is built on each frame's own index because fillna aligns a
# Series argument on index labels.
category_fill_rng = np.random.RandomState(42)
for frame in (train, test):
    filler = pd.Series(
        category_fill_rng.choice(sorted_category_list[0:100], size=len(frame.index)),
        index=frame.index,
    )
    frame['category_name'] = frame['category_name'].fillna(filler)

In [None]:
# Re-count the missing values per column of the training frame
print(train.isnull().sum())

In [None]:
# Store len() of each category_name value in a new column for both frames.
# NOTE(review): the column is called category_name_elements, but len() of a
# string value counts characters; confirm whether a count of sub-categories
# was intended instead.
for frame in (train, test):
    frame['category_name_elements'] = frame['category_name'].map(len)

In [None]:
# Inspect five random rows of the current training frame
train.sample(5)

In [None]:
# Remove the raw category_name column from both frames
train = train.drop(columns=['category_name'])
test = test.drop(columns=['category_name'])

## brand_name feature

In [None]:
# Drop the brand_name column from both frames because many of its values are
# missing (around 45% according to the null count earlier in this notebook)
train, test = (frame.drop(columns='brand_name') for frame in (train, test))

## item_description feature

In [None]:
# Replace missing item descriptions with a fixed placeholder text
missing_description = 'Description Not available'
for frame in (train, test):
    frame['item_description'] = frame['item_description'].fillna(missing_description)

In [None]:
# Derive two length features from the item description in both frames:
# the number of whitespace-separated words and the number of characters
for frame in (train, test):
    description = frame['item_description']
    frame['item_desc_words'] = description.map(lambda text: len(text.split()))
    frame['item_desc_chars'] = description.map(len)

In [None]:
# Inspect five random rows of the current training frame
train.sample(5)

In [None]:
# Remove the raw item_description column from both frames
train = train.drop(columns=['item_description'])
test = test.drop(columns=['item_description'])

In [None]:
# Inspect five random rows of the current training frame
train.sample(5)

# 5) Split data into training and validation sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training rows for validation with a fixed seed.
# train_id and price are removed from the feature matrix; price is the target.
feature_frame = train.drop(['train_id', 'price'], axis=1)
price_target = train['price']
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame, price_target, test_size=0.10, random_state=42
)

In [None]:
# (rows, columns) of the training split
X_train.shape

# 6) RMSE on log-transformed prices

In [None]:
def calculate_RMSE(y, y_pred):
    """Return the root mean squared error between log10(y_pred + 1) and log10(y + 1).

    Both inputs are converted to flat NumPy arrays and compared position by
    position, so a pandas Series is scored by row order rather than by index
    label.

    Parameters
    ----------
    y : array-like of float
        Observed values.
    y_pred : array-like of float
        Predicted values, in the same order as ``y``. If it has more entries
        than ``y``, only the first ``len(y)`` are scored and a warning is
        issued.

    Returns
    -------
    numpy.float64
        Square root of the mean squared difference of the log10-transformed
        values. Values at or below -1 make the logarithm undefined, so the
        result is then NaN or infinite.

    Raises
    ------
    ValueError
        If ``y`` is empty or ``y_pred`` has fewer entries than ``y``.
    """
    import warnings

    actual = np.asarray(y, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    k = actual.size
    if k == 0:
        raise ValueError("calculate_RMSE needs at least one observed value")
    if predicted.size < k:
        raise ValueError(
            "y_pred has %d entries but y has %d" % (predicted.size, k)
        )
    if predicted.size > k:
        # Scoring only the first len(y) predictions keeps existing callers
        # working, but a length mismatch usually means the two inputs do not
        # describe the same rows, so make it visible.
        warnings.warn(
            "y_pred has %d entries but y has %d; extra predictions are ignored"
            % (predicted.size, k)
        )
        predicted = predicted[:k]

    log_diff = np.log10(predicted + 1) - np.log10(actual + 1)
    return np.sqrt(np.mean(np.square(log_diff)))

# 7) Regression Algorithms

In [None]:
from sklearn import linear_model

In [None]:
# Bayesian ridge regression created with no arguments (library defaults)
clf = linear_model.BayesianRidge()

In [None]:
# Fit the model on the training split
clf.fit(X_train, y_train)

In [None]:
# Predict prices for the held-out validation split (X_test)
preds = clf.predict(X_test)

In [None]:
# Score the validation predictions; .tolist() turns y_test into a plain list
# ordered by position before it is passed to calculate_RMSE
print (calculate_RMSE(y_test.tolist(), preds))

In [None]:
# Keep the test ids for the submission and predict on the remaining test columns
target = test['test_id']
predictions = clf.predict(test.drop(['test_id'], axis=1))
# NOTE(review): the predictions are divided by a fixed factor of 5 and the
# notebook gives no rationale for it; confirm this scaling is intentional.
predictions = predictions/5.0

In [None]:
# Assemble the submission frame: one row per test id with its predicted price
submission = pd.DataFrame({'test_id': target, 'price': predictions})

In [None]:
# Write the submission to submission.csv without the DataFrame index column
submission.to_csv('submission.csv', index = False)

## XGB

In [None]:
import xgboost as xgb

In [None]:
# Booster settings passed to xgb.train in a later cell.
# NOTE(review): newer XGBoost releases name the 'reg:linear' objective
# 'reg:squarederror' and replace 'silent' with 'verbosity' — confirm against
# the installed version before changing these keys.
xgb_params= {  
            'eta': 0.7,
            'max_depth': 12,
            'objective':'reg:linear',
            'eval_metric':'rmse',
            'silent': 1
}

In [None]:
# Shuffled 5-fold splitter with a fixed seed.
# NOTE(review): kf is not referenced by any later cell visible in this notebook.
kf = KFold(n_splits = 5, random_state = 1, shuffle = True)

In [None]:
# Wrap the training split (features and price labels) in XGBoost's DMatrix
train_matrix = xgb.DMatrix(X_train, y_train)

In [None]:
# Wrap the held-out validation split in a DMatrix as well
validation_matrix = xgb.DMatrix(X_test, y_test)

In [None]:
# Evaluation sets passed to xgb.train: only the validation matrix, labelled 'validation'
evallist  = [(validation_matrix,'validation')]

In [None]:
# Train for 10 boosting rounds, evaluating on evallist; verbose_eval=2 controls
# how often the evaluation metric is printed
model = xgb.train(xgb_params, train_matrix, 10, evallist, verbose_eval=2)

In [None]:
# Predict prices for the competition test set (test_id is not a feature).
# NOTE(review): no early stopping is configured in the xgb.train call, and
# ntree_limit / best_ntree_limit depend on the XGBoost version — confirm both
# exist in the installed release.
preds = model.predict(xgb.DMatrix(test.drop(['test_id'], axis=1)), ntree_limit=model.best_ntree_limit);

In [None]:
# Print the predicted prices for a quick sanity check
print (preds)

In [None]:
# Assemble the submission frame from the test ids and the XGBoost predictions
submission = pd.DataFrame({'test_id': test['test_id'], 'price': preds})

In [None]:
# Write the submission without the index column.
# NOTE(review): this uses the same file name as the earlier Bayesian ridge
# cell, so that file is overwritten.
submission.to_csv('submission.csv', index = False)

## RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest regressor with tree depth capped at 2 and a fixed seed
regr = RandomForestRegressor(max_depth=2, random_state=0)

In [None]:
# Fit on every row of the training frame (not only the X_train split);
# train_id and price are excluded from the features and price is the target
regr.fit(train.drop(['train_id', 'price'], axis=1), train['price'])

In [None]:
# Predict prices for the competition test set (test_id is not a feature)
preds = regr.predict(test.drop(['test_id'], axis=1))

In [None]:
# Score the forest on the held-out validation rows. `preds` holds predictions
# for the competition test set, whose rows do not correspond to y_test, so the
# validation predictions are computed here from X_test instead.
# NOTE(review): regr was fit on the full training frame, which contains these
# validation rows, so this score is optimistic.
print (calculate_RMSE(y_test.tolist(), regr.predict(X_test)))

In [None]:
# Assemble the submission frame from the test ids and the random forest predictions
submission = pd.DataFrame({'test_id': test['test_id'], 'price': preds})

In [None]:
# Write the submission without the index column.
# NOTE(review): this uses the same file name as the earlier submission cells,
# so the file they wrote is overwritten.
submission.to_csv('submission.csv', index = False)