In [None]:
import pandas as pa
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import hstack
from sklearn.linear_model import Ridge

In [None]:
# English stop-word list from NLTK, held in a set for fast membership tests.
# NOTE(review): assumes the NLTK "stopwords" corpus is already available
# locally — confirm for the target environment.
from nltk.corpus import stopwords
stop_words = set (stopwords.words('english'))

In [None]:
# Read the training file into a DataFrame (read_table parses tab-separated text).
train=pa.read_table("../input/train.tsv")

In [None]:
# Read the test file into a DataFrame the same way.
test=pa.read_table("../input/test.tsv")

In [None]:
train['item_description']=train['item_description'].str.lower()

In [None]:
test['item_description']=test['item_description'].str.lower()

In [None]:
train['item_description']=train['item_description'].replace('[^a-zA-Z]', ' ', regex = True)

In [None]:
test['item_description']=test['item_description'].replace('[^a-zA-Z]', ' ', regex = True)

In [None]:
train.isnull().sum()

In [None]:
train["category_name"].fillna(value='missing/missing/missing', inplace=True)
train["brand_name"].fillna(value="missing", inplace=True)
train["item_description"].fillna(value="No description yet", inplace =True)

In [None]:
test["category_name"].fillna(value='missing/missing/missing', inplace=True)
test["brand_name"].fillna(value="missing", inplace=True)
test["item_description"].fillna(value="No description yet", inplace =True)

In [None]:
train['category_main']=train.category_name.str.split("/").str.get(0)
train['category_sub1']=train.category_name.str.split("/").str.get(1)
train['category_sub2']=train.category_name.str.split("/").str.get(2)

In [None]:
test['category_main']=test.category_name.str.split("/").str.get(0)
test['category_sub1']=test.category_name.str.split("/").str.get(1)
test['category_sub2']=test.category_name.str.split("/").str.get(2)

In [None]:
# Per-column count of missing values in train, displayed as the cell output.
train.isnull().sum()

In [None]:
# Per-column count of missing values in test, displayed as the cell output.
test.isnull().sum()

In [None]:
def stop(txt):
    """Tokenise ``txt`` on single spaces and filter the tokens.

    A token is kept when it is longer than two characters and is not a
    member of the module-level ``stop_words`` set. The kept tokens are
    returned as a list in their original order.
    """
    kept = []
    for token in txt.split(" "):
        if len(token) > 2 and token not in stop_words:
            kept.append(token)
    return kept

In [None]:
train['tokens']=train['item_description'].map(lambda x:stop(x))

In [None]:
test['tokens']=test['item_description'].map(lambda x:stop(x))

In [None]:
train['desc_len']=train['tokens'].map(lambda x: len(x))

In [None]:
test['desc_len']=test['tokens'].map(lambda x: len(x))

In [None]:
train['name_len']=train['name'].map(lambda x: len(x))

In [None]:
test['name_len']=test['name'].map(lambda x: len(x))

In [None]:
train.head()

In [None]:
# Single shared Porter stemmer instance, used by stemm() on each token.
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

In [None]:
def stemm(text):
    """Return a list with ``stemmer.stem`` applied to each item of ``text``.

    ``text`` is an iterable of tokens; the result keeps their order.
    """
    return list(map(stemmer.stem, text))

In [None]:
train['stemmed']=train['tokens'].map(lambda x: stemm(x))

In [None]:
test['stemmed']=test['tokens'].map(lambda x: stemm(x))

In [None]:
train.head()

In [None]:
def join(txt):
    """Concatenate the strings in ``txt`` into one string, separated by single spaces."""
    return ' '.join(txt)

In [None]:
train['final_desc']=train['stemmed'].map(lambda x: join(x))

In [None]:
test['final_desc']=test['stemmed'].map(lambda x: join(x))

In [None]:
train['final_desc'].head()

In [None]:
test['final_desc'].head()

In [None]:
# Fit a TF-IDF vectorizer on the cleaned training descriptions and transform
# them in one step. min_df=10 is passed to drop terms that appear in fewer
# than 10 documents.
vectorizer = TfidfVectorizer(min_df=10)
X_tfidf = vectorizer.fit_transform(train['final_desc']) 

In [None]:
# Shape of the training TF-IDF matrix.
X_tfidf.shape

In [None]:
# Shape of the training name column, shown for comparison with the row count above.
train['name'].shape

In [None]:
# Transform the test descriptions with the vectorizer already fitted on train
# (no re-fit), so both matrices come from the same fitted vectorizer.
Y_tfidf = vectorizer.transform(test['final_desc']) 

In [None]:
# Shape of the test name column.
test['name'].shape

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
categorical_cols=['name',"brand_name","category_main","category_sub1","category_sub2"]
for col in categorical_cols:
    # taking a column from dataframe, encoding it and replacing same column in the dataframe.
    train[col] = le.fit_transform(train[col])

In [None]:
test.head(2)

In [None]:
categorical_cols=['name',"brand_name","category_main","category_sub1","category_sub2"]
for col in categorical_cols:
    # taking a column from dataframe, encoding it and replacing same column in the dataframe.
    test[col] = le.fit_transform(test[col])

In [None]:
# Preview the training frame.
train.head()

In [None]:
# Regression target: the price column of the training frame.
y = train['price']

In [None]:
# List the training columns; the positional indices used when dropping
# columns refer to this order.
train.columns

In [None]:
# Preview the first two test rows.
test.head(2)

In [None]:
train1=train.drop(train.columns[[0,3,5,7,11,13,14,15]],axis=1)

In [None]:
train1.head(1)

In [None]:
test1=test.drop(test.columns[[0,3,6,8,10,13,14]],axis=1)

In [None]:
test1.head(2)

In [None]:
# Training feature matrix: TF-IDF description columns followed by the columns
# of train1, stacked horizontally.
X = hstack([X_tfidf,train1])

In [None]:
# Test feature matrix built the same way. Despite the name, Y holds test
# features (it is what gets passed to predict), not target values.
Y = hstack([Y_tfidf,test1])

In [None]:
# Ridge regression model with alpha=20.0.
clf = Ridge(alpha=20.0)

In [None]:
import time
start=time.clock()
clf.fit(X, y)
print(time.clock()-start)

In [None]:
import time
start=time.clock()
rslt=clf.predict(Y)
print(time.clock()-start)

In [None]:
# Shape of the prediction array.
rslt.shape

In [None]:
# Shape of the test frame, shown for comparison with the predictions above.
test.shape

In [None]:
# Wrap the predictions in a DataFrame.
rslt1=pa.DataFrame(rslt)

In [None]:
# Name the prediction column.
rslt1.columns=["price"]

In [None]:
# NOTE(review): test_id is taken from the positional index of the predictions,
# not from a column of `test`; this presumably relies on the test ids being
# 0..n-1 in file order — confirm against the input file.
rslt1["test_id"]=rslt1.index

In [None]:
# Write the submission file without the DataFrame index.
rslt1.to_csv("sample_submission.csv", encoding='utf-8', index=False)