In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
from stop_words import get_stop_words
import re
from tqdm.notebook import tqdm
import itertools
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
import math

In [None]:
# Load the Mercari train and test splits; both files are tab-separated.
train, test = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('/kaggle/input/mercari/train.tsv', '/kaggle/input/mercari/test.tsv')
)

In [None]:
def split_categories(category):
    """Split a slash-delimited category path into three levels.

    Parameters
    ----------
    category : str or missing value
        Category path such as ``"Men/Tops/T-shirts"``.

    Returns
    -------
    tuple of (str, str, str)
        ``(main category, first sub-category, second sub-category)``. Paths
        with more than three levels keep everything after the second slash in
        the last element. Non-string input (e.g. NaN) and paths with fewer
        than three levels yield ``'No Category'`` for all three levels.
    """
    try:
        # maxsplit=2 so deeper paths still unpack into exactly three parts
        # instead of being discarded as 'No Category'.
        main_cat1, sub_cat1, sub_cat2 = category.split("/", 2)
    except (AttributeError, ValueError):
        # AttributeError: not a string (missing value);
        # ValueError: fewer than three levels to unpack.
        return 'No Category', 'No Category', 'No Category'
    return main_cat1, sub_cat1, sub_cat2
    
def remove_punct(token):
    """Return, in order, the tokens made up solely of alphanumeric characters."""
    kept = []
    for tok in token:
        if tok.isalnum():
            kept.append(tok)
    return kept

def remove_stopwords(token, stop_words=None):
    """Return the tokens that are not stop words, preserving order.

    Parameters
    ----------
    token : iterable of str
        Tokens to filter.
    stop_words : iterable of str, optional
        Words to drop. When omitted, the module-level ``stopwords`` list is
        used, so existing single-argument calls behave as before.

    Returns
    -------
    list of str
        Tokens not present in the stop-word collection. Matching is exact
        (case-sensitive).
    """
    if stop_words is None:
        stop_words = stopwords
    # Set lookup instead of scanning the stop-word list once per token.
    excluded = set(stop_words)
    return [tok for tok in token if tok not in excluded]

def stemmer(token):
    """Apply Porter stemming to every token and return the stems as a list."""
    stem = PorterStemmer().stem
    return list(map(stem, token))

def lemmatizer(token):
    """Lemmatize every token with a WordNetLemmatizer and return the results as a list."""
    wordnet = WordNetLemmatizer()
    lemmas = []
    for word in token:
        lemmas.append(wordnet.lemmatize(word))
    return lemmas

def join(token):
    """Concatenate the tokens into one string separated by single spaces."""
    return " ".join(token)

def encode(train,test):
    """Bag-of-words encode the text columns of ``test`` with vocabularies learned from ``train``.

    For each column the CountVectorizer is re-fitted on the ``train`` values
    and then used to transform the ``test`` values.

    Returns
    -------
    tuple
        Seven transformed matrices, in this order: brand, category,
        main category, sub-category 1, sub-category 2, name, description.
    """
    columns = (
        'brand_name',
        'category_name',
        'main_cat',
        'sub_cat1',
        'sub_cat2',
        'tokenized_name',
        'tokenized_description',
    )
    vectorizer = CountVectorizer()
    encoded = []
    for column in columns:
        # Re-fitting replaces the vocabulary learned for the previous column.
        vectorizer = vectorizer.fit(train[column].values)
        encoded.append(vectorizer.transform(test[column].values))
    return tuple(encoded)

def encode1(train,test,column,feature,ranges):
    """TF-IDF encode ``test[column]`` with a vectorizer fitted on ``train[column]``.

    ``feature`` is passed as ``max_features`` and ``ranges`` as ``ngram_range``.
    Returns the transformed ``test`` column.
    """
    tfidf = TfidfVectorizer(ngram_range=ranges, max_features=feature)
    tfidf.fit(train[column])
    return tfidf.transform(test[column])

def encode2(train,test,column,feature,ranges):
    """Count-encode ``test[column]`` with a vectorizer fitted on ``train[column]``.

    ``feature`` is passed as ``max_features`` and ``ranges`` as ``ngram_range``.
    Returns the transformed ``test`` column.
    """
    counter = CountVectorizer(ngram_range=ranges, max_features=feature)
    counter.fit(train[column])
    return counter.transform(test[column])
    
def decontracted(phrase):
    """Expand common English contractions in ``phrase`` and return the new string.

    Matching is case-sensitive and uses the plain apostrophe character.
    """
    # Applied in order: the two irregular forms come first so the generic
    # "n't" rule does not rewrite them.
    rules = (
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in rules:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase
    
def dummies(df):
    """One-hot encode ``item_condition_id`` and ``shipping`` into a CSR matrix.

    The two columns are converted to categoricals on a copy, so the caller's
    frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``item_condition_id`` and ``shipping`` columns.

    Returns
    -------
    scipy.sparse.csr_matrix
        One indicator column per distinct value, ``item_condition_id`` columns
        first, then ``shipping`` columns.

    Notes
    -----
    NOTE(review): the columns are derived from the values present in ``df``,
    so matrices built from two frames line up only when both frames contain
    the same set of values in these columns.
    """
    categorical = df[['item_condition_id', 'shipping']].astype("category")
    # Explicit integer dtype keeps the output numeric on every pandas version
    # (newer releases default to bool indicators).
    indicators = pd.get_dummies(categorical, dtype=np.uint8)
    item_id_shipping = csr_matrix(indicators.values)
    return item_id_shipping

def original_price(value):
    """Invert a ``log1p`` transform: return ``exp(value) - 1``.

    Uses ``math.expm1``, which stays accurate for values close to zero where
    ``math.exp(value) - 1`` rounds to 0.

    Parameters
    ----------
    value : float
        Value on the log(1 + price) scale.

    Returns
    -------
    float
        The corresponding value on the original price scale.
    """
    return math.expm1(value)

In [None]:
# Split the slash-delimited category path into three level columns on both frames.
for frame in (train, test):
    levels = zip(*frame['category_name'].apply(split_categories))
    frame['main_cat'], frame['sub_cat1'], frame['sub_cat2'] = levels

# The model is trained on log(1 + price).
train['log_price'] = np.log1p(train['price'])

In [None]:
# Count how often each complete description string occurs and keep the 500 most frequent.
word_counter = Counter(train['item_description'])
most_common_words = word_counter.most_common(500)

# English stop words plus 'rm'.
# The dataset description explains that the text is pre-formatted: amounts given in
# dollar terms are not shown and 'rm' appears in their place. Its occurrences therefore
# carry no information about the price, so it is added as an additional stop word.
#
# Link to the original Mercari dataset page where this is explained:
# https://www.kaggle.com/c/mercari-price-suggestion-challenge/data
stopwords = get_stop_words('en')
stopwords.append('rm')

wordcloud = WordCloud(stopwords=stopwords,background_color='white').generate(str(most_common_words))

# Render the cloud on a square figure without axes.
fig, ax = plt.subplots(figsize=(10,10))
ax.imshow(wordcloud,interpolation='bilinear')
ax.set_title('Word cloud of item description\n',fontsize=15)
ax.axis('off')
plt.show()

In [None]:
# Whitespace-delimited word counts for the description and the name of every listing.
# Values go through str() first, so a non-string entry is counted by its string form.
for frame in (train, test):
    for source_col, count_col in (('item_description', 'description_wc'), ('name', 'name_wc')):
        frame[count_col] = [len(str(text).split()) for text in frame[source_col]]

In [None]:
# Replace missing text fields with explicit placeholders.
# Train and test must use the same placeholder for a given column: the vectorizers
# are fitted on train only, so a different placeholder in test would produce tokens
# the fitted vocabulary has never seen. 'No Category' also matches the fallback
# returned by split_categories.
for frame in (train, test):
    frame['brand_name'] = frame['brand_name'].fillna('Not Known')
    frame['item_description'] = frame['item_description'].fillna('No Description Yet')
    frame['category_name'] = frame['category_name'].fillna('No Category')

In [None]:
#train['item_description'] = train['item_description'].apply(decontracted)
#test['item_description'] = test['item_description'].astype(str).apply(decontracted)
#train['name'] = train['name'].apply(decontracted)
#test['name']= test['name'].apply(decontracted)

# The contraction expansion above is left disabled: results were better without it.

In [None]:
# Tokenize the listing names. The tokenizer is applied to the 'name' column directly
# rather than via DataFrame.apply(axis=1), which builds a Series for every row only
# to read a single field from it; the resulting token lists are the same.
train['tokenized_name'] = train['name'].apply(lambda text: word_tokenize(str(text)))
test['tokenized_name'] = test['name'].apply(lambda text: word_tokenize(str(text)))

In [None]:
# Keep only the purely alphanumeric name tokens.
for frame in (train, test):
    frame['tokenized_name'] = frame['tokenized_name'].apply(remove_punct)

In [None]:
# Drop stop words from the name tokens of both frames.
train['tokenized_name'], test['tokenized_name'] = (
    frame['tokenized_name'].apply(remove_stopwords) for frame in (train, test)
)

In [None]:
# Collapse each token list back into one space-separated string for the vectorizers.
for frame in (train, test):
    frame['tokenized_name'] = frame['tokenized_name'].apply(join)

In [None]:
# Tokenize the item descriptions. The tokenizer is applied to the column directly
# rather than via DataFrame.apply(axis=1), which builds a Series for every row only
# to read a single field from it; the resulting token lists are the same.
train['tokenized_description'] = train['item_description'].apply(lambda text: word_tokenize(str(text)))
test['tokenized_description'] = test['item_description'].apply(lambda text: word_tokenize(str(text)))

In [None]:
# Keep only the purely alphanumeric description tokens.
# (The test assignment previously had an unterminated string literal in its column
# key, which made the cell a SyntaxError.)
train['tokenized_description'] = train['tokenized_description'].apply(remove_punct)
test['tokenized_description'] = test['tokenized_description'].apply(remove_punct)

In [None]:
# Drop stop words from the description tokens of both frames.
train['tokenized_description'], test['tokenized_description'] = (
    frame['tokenized_description'].apply(remove_stopwords) for frame in (train, test)
)

In [None]:
# Collapse each token list back into one space-separated string for the vectorizers.
for frame in (train, test):
    frame['tokenized_description'] = frame['tokenized_description'].apply(join)

In [None]:
# Bag-of-words features. The first argument is the frame the vectorizers are fitted
# on (always train); the second is the frame being transformed.
train_encoded = encode(train, train)
test_encoded = encode(train, test)

(brand_train, cat_train, maincat_train, subcat1_train,
 subcat2_train, name_train, description_train) = train_encoded
(brand_test, cat_test, maincat_test, subcat1_test,
 subcat2_test, name_test, description_test) = test_encoded

In [None]:
# One-hot encode item condition and shipping flag for both frames.
item_id_shipping_train, item_id_shipping_test = dummies(train), dummies(test)

In [None]:
# Stack the brand/category bag-of-words blocks and the one-hot block column-wise,
# then convert to CSR. Block order is identical for train and test.
train_blocks = [brand_train, cat_train, maincat_train, subcat1_train, subcat2_train, item_id_shipping_train]
test_blocks = [brand_test, cat_test, maincat_test, subcat1_test, subcat2_test, item_id_shipping_test]

x_train_set = hstack(train_blocks).tocsr()
x_test_set = hstack(test_blocks).tocsr()

In [None]:
# TF-IDF over name unigrams and bigrams, capped at 100000 features, fitted on train.
name_tfidf_args = ('tokenized_name', 100000, (1, 2))
X_train_tfidf = encode1(train, train, *name_tfidf_args)
X_test_tfidf = encode1(train, test, *name_tfidf_args)

In [None]:
# TF-IDF over description unigrams and bigrams, capped at 100000 features, fitted on train.
description_tfidf_args = ('tokenized_description', 100000, (1, 2))
X_train_tfidf1 = encode1(train, train, *description_tfidf_args)
X_test_tfidf1 = encode1(train, test, *description_tfidf_args)

In [None]:
# Final design matrices: name TF-IDF, description TF-IDF, the categorical block,
# and the two word-count columns (each reshaped to a single column).
name_wc_train = train['name_wc'].values.reshape(-1,1)
description_wc_train = train['description_wc'].values.reshape(-1,1)
name_wc_test = test['name_wc'].values.reshape(-1,1)
description_wc_test = test['description_wc'].values.reshape(-1,1)

x_train = hstack((X_train_tfidf, X_train_tfidf1, x_train_set, name_wc_train, description_wc_train)).tocsr()
x_test = hstack((X_test_tfidf, X_test_tfidf1, x_test_set, name_wc_test, description_wc_test)).tocsr()

In [None]:
# Regression target: the log1p-transformed price column.
y_train = train['log_price']

In [None]:
# Candidate regularisation strengths for the ridge model.
params = {'alpha':[0.0001,0.001,0.01,0.1,1.0,2.0,4.0,5.0,6.0]}
# `normalize` is no longer passed: False was its default value, and the argument was
# removed in scikit-learn 1.2, where passing it raises a TypeError.
model_ridge = Ridge(
    solver='auto', fit_intercept=True,
    max_iter=100, tol=0.05, random_state = 1,
)
# 2-fold cross-validated search over alpha, scored by negative mean squared error.
r_model = RandomizedSearchCV(model_ridge,params,scoring='neg_mean_squared_error',cv=2,verbose=2,n_jobs=-1,return_train_score=True)

In [None]:
# Run the cross-validated alpha search on the training matrix and log-price target.
r_model.fit(x_train,y_train)

In [None]:
# Display the best hyper-parameters found by the search.
r_model.best_params_   # read off the alpha value to use in the next step

In [None]:
# Final model: ridge regression with a single fixed alpha, fitted on the training data.
# NOTE(review): alpha 4.0 is presumably the value read from the search result; confirm
# it still matches r_model.best_params_ after any change to the features.
# `normalize` is no longer passed: False was its default value, and the argument was
# removed in scikit-learn 1.2, where passing it raises a TypeError.
ridge_modelCV = RidgeCV(
    fit_intercept=True, alphas=[4.0],
    cv = 2, scoring='neg_mean_squared_error',
)
ridge_modelCV.fit(x_train, y_train)

In [None]:
# Predict for the test matrix; the model was fitted on log1p(price), so these are on that scale.
pred=ridge_modelCV.predict(x_test)

In [None]:
# Put the predictions in a one-column frame (column label 0) and map them back
# from the log scale to prices.
df = pd.DataFrame(pred)
log_predictions = df[0]
df[0] = log_predictions.map(original_price)

In [None]:
# Build the submission frame: test ids as the index, predictions in a 'price' column.
test_id = test['id']
result = (
    pd.concat([test_id, df], axis=1)
    .set_index('id')
    .rename(columns={0: 'price'})
)

In [None]:
# Write the submission file; `id` is the frame's index, so it is written alongside `price`.
result.to_csv('/kaggle/working/submission_kaggle.csv') # now download submission_kaggle.csv and submit it