In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

%matplotlib inline
import time
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.model_selection import cross_val_score, GridSearchCV


In [None]:
# Read the tab-separated train and test files; the first column of each becomes the index.
train = pd.read_csv(
    '../input/train.tsv',
    sep='\t',
    index_col=0,
)
test = pd.read_csv(
    '../input/test.tsv',
    sep='\t',
    index_col=0,
)

In [None]:
# Stack the train features (target column removed) on top of the test rows,
# discarding the original ids in favour of a fresh 0..n-1 index.
all_data = pd.concat(
    [train.drop('price', axis='columns'), test],
    ignore_index=True,
)
all_data.shape

In [None]:
def cat_no(category):
    """Return the number of '/'-separated levels in a category string.

    Parameters
    ----------
    category : object
        Category path such as ``'Men/Tops/T-shirts'``. Anything that is not a
        string (for example a missing value) is treated as having no levels.

    Returns
    -------
    int
        Number of levels, or 0 for non-string input.
    """
    # Explicit type check instead of a bare ``except`` so unrelated errors
    # (including KeyboardInterrupt) are no longer silently swallowed.
    if not isinstance(category, str):
        return 0
    return len(category.split('/'))

# Number of category levels per row (0 where category_name is missing).
all_data['cat_no'] = all_data['category_name'].map(cat_no)

In [None]:
# Distribution of category depth across all rows.
all_data['cat_no'].value_counts()

In [None]:
def split_cat(category):
    """Split a '/'-separated category string into exactly five levels.

    Parameters
    ----------
    category : object
        Category path such as ``'Men/Tops/T-shirts'``. Non-string input
        (for example a missing value) is treated as fully unknown.

    Returns
    -------
    list of str
        Always five elements. Paths with fewer than five levels are padded on
        the right with ``'NA'``; for paths with more than five levels the
        surplus stays in the fifth element (no text is dropped).
    """
    n_levels = 5
    if not isinstance(category, str):
        return ['NA'] * n_levels
    # Previously any depth other than 3, 4 or 5 fell through and returned None,
    # which broke unpacking into the five cat columns. Limiting the split to
    # n_levels - 1 separators guarantees at most five parts.
    levels = category.split('/', n_levels - 1)
    return levels + ['NA'] * (n_levels - len(levels))


In [None]:
# Transpose the per-row five-element lists into five per-level columns.
(
    all_data['cat1'],
    all_data['cat2'],
    all_data['cat3'],
    all_data['cat4'],
    all_data['cat5'],
) = zip(*all_data['category_name'].apply(split_cat))

In [None]:
# Preview the first rows of all_data.
all_data.head()

In [None]:
# Remove the rows having null values in item_description (4 rows, per the original note)
# from all_data, train and test, then renumber each frame from 0 so that row labels
# and row positions agree in the later train/test split.
all_data = all_data.dropna(subset=['item_description']).reset_index(drop=True)
train = train.dropna(subset=['item_description']).reset_index(drop=True)
test = test.dropna(subset=['item_description']).reset_index(drop=True)

# time.clock() was removed in Python 3.8; time.perf_counter() is the supported timer.
start = time.perf_counter()
# TF-IDF over unigrams and bigrams of the item descriptions, English stop words removed.
tfidf_vec = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
item_vectors = tfidf_vec.fit_transform(all_data.item_description)
print("Time Elapsed : {0}".format(time.perf_counter()-start))


In [None]:
# Dimensions of the TF-IDF matrix built from item_description.
item_vectors.shape

In [None]:
# time.clock() was removed in Python 3.8; time.perf_counter() is the supported timer.
start = time.perf_counter()
# Reduce the description TF-IDF matrix to 50 dense components. The solver is
# randomized, so random_state is fixed to make the features reproducible across runs.
tsvd = TruncatedSVD(n_components=50, random_state=11)
red_item_desc = tsvd.fit_transform(item_vectors)
print("Time elapsed : {0}".format(time.perf_counter()-start))

In [None]:
# Cumulative share of variance captured by all fitted description components
# (the last entry of the running sum).
tsvd.explained_variance_ratio_.cumsum()[-1]

It explains only 13% of the total variance. Now let's apply the same TF-IDF vectorization and truncated SVD transformation to 'name'.

In [None]:
# time.clock() was removed in Python 3.8; time.perf_counter() is the supported timer.
start = time.perf_counter()
# TF-IDF over unigrams and bigrams of the item names, English stop words removed.
# Note: this rebinds tfidf_vec, replacing the vectorizer fitted on item_description.
tfidf_vec = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
name_vectors = tfidf_vec.fit_transform(all_data.name)
print("Time elapsed : {0}".format(time.perf_counter()-start))

In [None]:
# time.clock() was removed in Python 3.8; time.perf_counter() is the supported timer.
start = time.perf_counter()
# Reduce the name TF-IDF matrix to 100 dense components. The solver is randomized,
# so random_state is fixed to make the features reproducible across runs.
# Note: this rebinds tsvd, replacing the model fitted on the descriptions.
tsvd = TruncatedSVD(n_components=100, random_state=11)
red_name = tsvd.fit_transform(name_vectors)
print("Time elapsed : {0}".format(time.perf_counter()-start))

In [None]:
# Cumulative share of variance captured by all fitted name components
# (the last entry of the running sum).
tsvd.explained_variance_ratio_.cumsum()[-1]

The 100 components explain only 10% of the variance in names. Let's integrate them into all_data as features.

In [None]:
# Show the shapes of both reduced matrices next to all_data; they are joined
# column-wise in the following cells, so their row counts are expected to match.
print (red_item_desc.shape, all_data.shape, red_name.shape)
# Shapes of the individual train and test frames, displayed as the cell output.
train.shape, test.shape

In [None]:
# Column labels for the SVD components, numbered from 1: 50 for the description
# matrix and 100 for the name matrix.
desc_svd_cols = ['item_description_{}'.format(i) for i in range(1, 51)]
name_svd_cols = ['name_{}'.format(i) for i in range(1, 101)]

# Wrap the reduced arrays in DataFrames so they can be joined onto all_data.
desc_svd_df = pd.DataFrame(data=red_item_desc, columns=desc_svd_cols)
name_svd_df = pd.DataFrame(data=red_name, columns=name_svd_cols)

In [None]:
# Append the description components, then the name components, as extra columns
# of all_data (rows are matched on the index).
all_data_svd = pd.concat(
    [all_data, desc_svd_df, name_svd_df],
    axis=1,
)

In [None]:
# Fill missing brands with the 'NA' placeholder. The result is assigned back to the
# column: calling fillna(..., inplace=True) on a column selected from the frame is a
# chained in-place operation, which does not update the frame under pandas Copy-on-Write.
all_data_svd['brand_name'] = all_data_svd['brand_name'].fillna('NA')

# Replace each text column with its integer category code.
for cat_col in ['brand_name', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5']:
    all_data_svd[cat_col] = all_data_svd[cat_col].astype('category').cat.codes

In [None]:
# Split the combined feature frame back into its train and test parts by position.
# The previous .loc[:n] slice was label-based and therefore inclusive of row n, which
# put the first test row into train_svd as well; .iloc slices exclude the stop position.
n_train = train.shape[0]
# .copy() gives train_svd its own data so a column can be added to it later without
# a chained-assignment warning.
train_svd = all_data_svd.iloc[:n_train].copy()
test_svd = all_data_svd.iloc[n_train:].reset_index(drop=True)

In [None]:
# Attach the target column to the training features; values are matched on the row index.
train_svd = train_svd.assign(price=train['price'])

In [None]:
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error between targets and predictions.

    Both inputs are shifted by 1 before taking the log; the squared differences
    are summed and divided by the length of ``y_true``.
    """
    log_gap = np.log(y_pred + 1) - np.log(y_true + 1)
    n_obs = y_true.shape[0]
    mean_sq = np.square(log_gap).sum() / n_obs
    return np.sqrt(mean_sq)

In [None]:
# Raw text columns and the target are not used as model inputs.
exclude = ['name','category_name','item_description', 'price']
# Keep the frame's own column order. The earlier set difference produced an arbitrary
# order that can change between interpreter runs (string hashing is randomized), which
# makes seeded column subsampling in the model non-reproducible.
predictors = [col for col in train_svd.columns if col not in exclude]

In [None]:
# Feature matrix and target taken from the training frame.
X = train_svd.loc[:, predictors]
y = train_svd.loc[:, 'price']

# Shuffled 75/25 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=11,
)
(X_train.shape, X_test.shape)

Now let's start fitting the XGBoost model.

In [None]:
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit ``alg`` on ``dtrain`` and return its predictions on that same data.

    When ``useTrainCV`` is true, ``xgb.cv`` is run first with the estimator's
    booster parameters, and ``n_estimators`` is then set to the number of rows
    in the cross-validation result before the final fit.

    Parameters
    ----------
    alg : estimator
        XGBoost scikit-learn style estimator. ``get_xgb_params``, ``get_params``
        and ``set_params`` are only needed when ``useTrainCV`` is true.
    dtrain : pandas.DataFrame
        Training frame containing the ``predictors`` columns and a ``'price'`` column.
    predictors : list
        Names of the feature columns.
    useTrainCV : bool
        Whether to tune ``n_estimators`` with cross-validation before fitting.
    cv_folds : int
        Number of folds passed to ``xgb.cv``.
    early_stopping_rounds : int
        Early-stopping patience passed to ``xgb.cv``.

    Returns
    -------
    tuple
        ``(dtrain_predictions, cvresult)``. ``cvresult`` is ``None`` when
        ``useTrainCV`` is false.
    """
    # Defined up front so the return below also works without cross-validation
    # (previously this path raised UnboundLocalError).
    cvresult = None

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain['price'].values)
        cvresult = xgb.cv(
            xgb_param,
            xgtrain,
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            early_stopping_rounds=early_stopping_rounds,
            # xgb.cv has no `show_progress` argument; `verbose_eval` prints per-round progress.
            verbose_eval=True,
        )
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['price'])

    # Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])

    return dtrain_predictions, cvresult
    

In [None]:
# Baseline regressor. n_estimators acts as an upper bound here: modelfit overwrites it
# with the round count taken from its cross-validation result.
xgb1 = XGBRegressor(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    # NOTE(review): newer xgboost releases name this objective 'reg:squarederror';
    # confirm against the installed version before changing it.
    objective='reg:linear',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
dtrain_pred, cvresult = modelfit(alg=xgb1, dtrain=train_svd, predictors=predictors)