In [1]:
import pandas as pa
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import hstack
from sklearn.linear_model import Ridge
import xgboost as xgb


In [2]:
from nltk.corpus import stopwords
# English stop-word list from NLTK, held as a set for fast membership tests.
stop_words = set(stopwords.words("english"))

In [3]:
train=pa.read_table("../input/train.tsv")

In [4]:
test=pa.read_table("../input/test.tsv")

In [5]:
numeric_features = train.select_dtypes(include=[np.number])
numeric_features.dtypes

In [6]:
train['item_description']=train['item_description'].str.lower()

In [7]:
test['item_description']=test['item_description'].str.lower()

In [8]:
train['item_description']=train['item_description'].replace('[^a-zA-Z]', ' ', regex = True)

In [9]:
test['item_description']=test['item_description'].replace('[^a-zA-Z]', ' ', regex = True)

In [10]:
train.isnull().sum()

In [11]:
train["category_name"].fillna(value='missing/missing/missing', inplace=True)
train["brand_name"].fillna(value="missing", inplace=True)
train["item_description"].fillna(value="No description yet", inplace =True)

In [12]:
test["category_name"].fillna(value='missing/missing/missing', inplace=True)
test["brand_name"].fillna(value="missing", inplace=True)
test["item_description"].fillna(value="No description yet", inplace =True)

In [13]:
train['category_main']=train.category_name.str.split("/").str.get(0)
train['category_sub1']=train.category_name.str.split("/").str.get(1)
train['category_sub2']=train.category_name.str.split("/").str.get(2)

In [14]:
test['category_main']=test.category_name.str.split("/").str.get(0)
test['category_sub1']=test.category_name.str.split("/").str.get(1)
test['category_sub2']=test.category_name.str.split("/").str.get(2)

In [15]:
train.isnull().sum()

In [16]:
test.isnull().sum()

In [17]:
def stop(txt):
    """Tokenise ``txt`` on single spaces and drop unwanted tokens.

    A token is kept only when it is longer than two characters and is not
    a member of the module-level ``stop_words`` collection.

    Returns the surviving tokens as a list, in their original order.
    """
    kept = []
    for token in txt.split(" "):
        if len(token) > 2 and token not in stop_words:
            kept.append(token)
    return kept

In [18]:
train['tokens']=train['item_description'].map(lambda x:stop(x))

In [19]:
test['tokens']=test['item_description'].map(lambda x:stop(x))

In [20]:
train['desc_len']=train['tokens'].map(lambda x: len(x))

In [21]:
test['desc_len']=test['tokens'].map(lambda x: len(x))

In [22]:
train['name_len']=train['name'].map(lambda x: len(x))

In [23]:
test['name_len']=test['name'].map(lambda x: len(x))

In [24]:
train.head()

In [25]:
from nltk.stem.porter import PorterStemmer
# Module-level Porter stemmer instance; stemm() calls its .stem() on each token.
stemmer = PorterStemmer()

In [26]:
def stemm(text):
    """Apply the module-level ``stemmer.stem`` to every element of ``text``.

    ``text`` is any iterable of tokens; the stemmed tokens are returned as
    a list in the same order.
    """
    return list(map(stemmer.stem, text))

In [27]:
train['stemmed']=train['tokens'].map(lambda x: stemm(x))

In [28]:
test['stemmed']=test['tokens'].map(lambda x: stemm(x))

In [29]:
train.head()

In [30]:
def join(txt):
    """Concatenate the strings in ``txt`` into one string, separated by single spaces."""
    return " ".join(txt)

In [31]:
train['final_desc']=train['stemmed'].map(lambda x: join(x))

In [32]:
test['final_desc']=test['stemmed'].map(lambda x: join(x))

In [33]:
train['final_desc'].head()

In [34]:
test['final_desc'].head()

In [35]:
vectorizer = TfidfVectorizer(min_df=10)
X_tfidf = vectorizer.fit_transform(train['final_desc']) 

In [36]:
X_tfidf.shape

In [37]:
train['name'].shape

In [38]:
#Avectorizer = TfidfVectorizer(min_df=10)
Y_tfidf = vectorizer.transform(test['final_desc']) 

In [39]:
test['name'].shape

In [40]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
categorical_cols=['name',"brand_name","category_main","category_sub1","category_sub2"]
for col in categorical_cols:
    # taking a column from dataframe, encoding it and replacing same column in the dataframe.
    train[col] = le.fit_transform(train[col])

In [41]:
test.head(2)

In [42]:
categorical_cols=['name',"brand_name","category_main","category_sub1","category_sub2"]
for col in categorical_cols:
    # taking a column from dataframe, encoding it and replacing same column in the dataframe.
    test[col] = le.fit_transform(test[col])

In [43]:
train.head()

In [44]:
y = train['price']

In [45]:
train.columns

In [46]:
test.head(2)

In [47]:
train1=train.drop(train.columns[[0,3,5,7,11,13,14,15]],axis=1)

In [48]:
train1.head(1)

In [49]:
test1=test.drop(test.columns[[0,3,6,8,10,13,14]],axis=1)

In [50]:
test1.head(2)

In [51]:
X = hstack([X_tfidf,train1])

In [None]:
Y = hstack([Y_tfidf,test1])

In [None]:
import time


def rmsle(y_pred, y_true):
    """Return the root mean squared logarithmic error of ``y_pred`` against ``y_true``.

    Both arguments are array-likes of equal length. Predictions are clipped
    at zero before the log because ``log1p`` is undefined below -1.
    """
    log_pred = np.log1p(np.clip(np.asarray(y_pred, dtype=float), 0, None))
    log_true = np.log1p(np.asarray(y_true, dtype=float))
    return float(np.sqrt(np.mean((log_pred - log_true) ** 2)))


# Start the wall-clock timer. The original cell read an undefined `start` and
# called time.clock(), which was removed in Python 3.8.
start = time.perf_counter()

# NOTE(review): `seed` and `silent` are older XGBoost keyword names — confirm
# they are still accepted by the installed version.
regr = xgb.XGBRegressor(
                 colsample_bytree=0.2,
                 gamma=0.0,
                 learning_rate=0.05,
                 max_depth=6,
                 min_child_weight=1.5,
                 n_estimators=7200,
                 reg_alpha=0.9,
                 reg_lambda=0.6,
                 subsample=0.2,
                 seed=42,
                 silent=1)

regr.fit(X,y)
rslt = regr.predict(Y)
# `rslt` holds predictions for the test rows, so it cannot be scored against
# the training target `y` (different rows and lengths). Report the in-sample
# error instead; this is an optimistic estimate, not a validation score.
print(rmsle(regr.predict(X), y))
print(time.perf_counter()-start)


In [None]:
rslt.shape

In [None]:
test.shape

In [None]:
rslt1=pa.DataFrame(pre)

In [None]:
rslt1.columns=["price"]

In [None]:
rslt1["test_id"]=rslt1.index

In [None]:
rslt1.to_csv("submit_submission.csv", index=False, encoding='utf-8')