In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [None]:
# Walk the Kaggle input directory and print the full path of every file found.
import os

for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

In [None]:
!apt-get install p7zip
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/train.tsv.7z
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/test.tsv.7z
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/sample_submission.csv.7z

In [None]:
!unzip /kaggle/input/mercari-price-suggestion-challenge/sample_submission_stg2.csv.zip
!unzip /kaggle/input/mercari-price-suggestion-challenge/test_stg2.tsv.zip

In [None]:
# Load the tab-separated training file and preview its first five rows.
train_data = pd.read_csv('train.tsv', delimiter='\t')
train_data.head()

In [None]:
# Count missing values per column in the freshly loaded training data.
train_data.isnull().sum()

In [None]:
# Report the frame's dimensions followed by its column index.
print(train_data.shape, train_data.columns, sep='\n')

In [None]:
# Keep only listings with a positive price, then hold out 20% of them for
# validation (random_state fixed at 42 so the split is repeatable).
has_price = train_data['price'] > 0
train_data = train_data[has_price].reset_index(drop=True)
train_data, validation_data = train_test_split(train_data, test_size=0.2, random_state=42)
for split in (train_data, validation_data):
    print(split.shape)

In [None]:
# Missing-value counts per column for the training split.
train_data.isnull().sum()

In [None]:
# Missing-value counts per column for the validation split.
validation_data.isnull().sum()

In [None]:
# Keep independent copies of both splits as they are at this point.
train, valid = train_data.copy(), validation_data.copy()

In [None]:
def split_categories(category):
    """Split a ``/``-delimited category path into exactly three levels.

    Parameters
    ----------
    category : str or missing value
        Raw ``category_name`` entry, e.g. ``"Men/Tops/T-shirts"``.

    Returns
    -------
    tuple of str
        ``(sub_category1, sub_category2, sub_category3)``. Any levels beyond
        the third stay inside the third element, so deeper paths keep their
        top-level information. Non-string input (such as NaN) and paths with
        fewer than three levels yield
        ``("No label", "No label", "No label")``.
    """
    no_label = ("No label", "No label", "No label")
    # Missing categories are not strings; handle them explicitly instead of
    # relying on a bare ``except`` that would also hide unrelated errors.
    if not isinstance(category, str):
        return no_label
    # maxsplit=2 caps the result at three parts.
    parts = category.split("/", 2)
    if len(parts) != 3:
        return no_label
    return tuple(parts)

def create_split_categories(data):
    """Add ``sub_category1``..``sub_category3`` columns derived from ``category_name``.

    Each ``category_name`` value is passed through ``split_categories`` and the
    three returned levels are written to ``data`` in place. Returns ``None``.
    """
    per_row_levels = data['category_name'].apply(split_categories)
    # Transpose the per-row triples into three per-column sequences.
    level1, level2, level3 = zip(*per_row_levels)
    data['sub_category1'] = level1
    data['sub_category2'] = level2
    data['sub_category3'] = level3

In [None]:
# Add the three sub-category columns to both splits (the frames are modified in place).
create_split_categories(train_data)
create_split_categories(validation_data)

In [None]:
def fill_missing_values(data):
    """Replace missing text fields with placeholder strings.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``category_name``, ``brand_name`` and
        ``item_description`` columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, for convenience.
    """
    placeholders = {
        'category_name': 'unknown_cat',
        'brand_name': 'unknown_brand',
        'item_description': 'unknown_description',
    }
    for column, placeholder in placeholders.items():
        # Assign the result back instead of ``data[column].fillna(..., inplace=True)``:
        # that form mutates a temporary Series, which does not update ``data``
        # when pandas Copy-on-Write is active.
        data[column] = data[column].fillna(placeholder)
    return data

In [None]:
# Fill missing category, brand and description values in both splits.
# The second call is the cell's last expression, so its returned frame is displayed.
fill_missing_values(train_data)
fill_missing_values(validation_data)

In [None]:
# Load the tab-separated stage-2 test file and keep an independent copy of it.
test_data = pd.read_csv('test_stg2.tsv', delimiter='\t')
test = test_data.copy()

In [None]:
# Preview the first five test rows.
test_data.head(5)

In [None]:
# (rows, columns) of the test frame.
test_data.shape

In [None]:
# Missing-value counts per column for the test data.
test_data.isnull().sum()

In [None]:
# Apply the same category split and missing-value fill to the test data.
# Splitting runs first, while missing categories are still NaN.
create_split_categories(test_data)
fill_missing_values(test_data)

In [None]:
from collections import Counter

# How often each item_condition_id occurs in the training and validation splits.
train_cond_id = Counter(list(train_data['item_condition_id']))
val_cond_id = Counter(list(validation_data['item_condition_id']))

fig, (ax1, ax3) = plt.subplots(1, 2, figsize=(15, 8))

ax1.bar(train_cond_id.keys(), train_cond_id.values(), width=0.2, align='edge', label='Train')
ax1.set_xticks([1, 2, 3, 4, 5])
ax1.set_xlabel('item_condition_id')
ax1.set_ylabel('count')
ax1.legend()

ax3.bar(val_cond_id.keys(), val_cond_id.values(), width=-0.2, align='edge', label='Val')
ax3.set_xticks([1, 2, 3, 4, 5])
ax3.set_xlabel('item_condition_id')
ax3.set_ylabel('count')
ax3.legend()

# plt.show() instead of fig.show(): Figure.show() warns on non-GUI backends
# such as the inline notebook backend.
plt.show()

In [None]:
# Regression target for the training split: natural log of (1 + price).
train_data['log_prices'] = np.log(1 + train_data['price'])

In [None]:
# Regression target for the validation split: natural log of (1 + price).
validation_data['log_prices'] = np.log(1 + validation_data['price'])

In [None]:
# Kernel density estimate of the raw training prices.
price_ax = sns.kdeplot(data=train_data['price'])
price_ax.set_title('Distribution of price')
price_ax.grid(True)

In [None]:
# Kernel density estimate of the log-transformed training prices.
log_price_ax = sns.kdeplot(data=train_data['log_prices'])
log_price_ax.set_title('Distribution of log_prices')
log_price_ax.grid(True)

In [None]:
# Preview the first three training rows, now including the log_prices column.
train_data.head(3)

In [None]:
# Occurrence counts of every full category_name string, one Counter per dataset.
train_category_name = Counter(train_data['category_name'].tolist())
val_category_name = Counter(validation_data['category_name'].tolist())
test_category_name = Counter(test_data['category_name'].tolist())

In [None]:
# Ten most frequent category_name values in the training split, as (value, count) pairs.
print("Top 10 categories in train data: ")
train_category_name.most_common(10)

In [None]:
# Ten most frequent category_name values in the validation split, as (value, count) pairs.
print("Top 10 categories in validation data: ")
val_category_name.most_common(10)

In [None]:
# Ten most frequent category_name values in the test data, as (value, count) pairs.
print("Top 10 categories in test data: ")
test_category_name.most_common(10)

In [None]:
# Occurrence counts of every brand_name string, one Counter per dataset.
train_brand_name = Counter(train_data['brand_name'].tolist())
val_brand_name = Counter(validation_data['brand_name'].tolist())
test_brand_name = Counter(test_data['brand_name'].tolist())

In [None]:
# most_common(15) returns fifteen (brand, count) pairs, so the heading says 15.
print("Top 15 brands in train data: ")
train_brand_name.most_common(15)

In [None]:
# most_common(15) returns fifteen (brand, count) pairs, so the heading says 15.
print("Top 15 brands in validation data: ")
val_brand_name.most_common(15)

In [None]:
# most_common(15) returns fifteen (brand, count) pairs, so the heading says 15.
print("Top 15 brands in test data: ")
test_brand_name.most_common(15)

# **Text processing**

In [None]:
import nltk
# Download the NLTK 'stopwords' corpus.
nltk.download('stopwords')

In [None]:
#remove stop words
from nltk.corpus import stopwords

stop = stopwords.words('english')
# Frozen set copy of ``stop`` for constant-time membership tests; the list form
# is scanned linearly for every token of every description.
# Later changes to ``stop`` are not reflected here.
_stop_set = frozenset(stop)

def remove_stop_words(x):
    """Lower-case ``x`` and drop every English stop word.

    Parameters
    ----------
    x : str
        Text to filter.

    Returns
    -------
    str
        The lower-cased text, split on single spaces, with tokens found in the
        stop-word list removed and the rest re-joined with single spaces.
    """
    return ' '.join(token for token in x.lower().split(' ') if token not in _stop_set)

In [None]:
# Strip stop words from the item descriptions of all three datasets.
for frame in (train_data, validation_data, test_data):
    frame['item_description'] = frame['item_description'].apply(remove_stop_words)

In [None]:
from tqdm import tqdm
import re

# (pattern, replacement) pairs, applied in this order: the two irregular forms
# must run before the generic "n't" rule.
_CONTRACTION_RULES = (
    (r"won't", "will not"),
    (r"can\'t", "can not"),
    (r"n\'t", " not"),
    (r"\'re", " are"),
    (r"\'s", " is"),
    (r"\'d", " would"),
    (r"\'ll", " will"),
    (r"\'t", " not"),
    (r"\'ve", " have"),
    (r"\'m", " am"),
)


def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    Matching is case-insensitive, so capitalized input such as ``"Can't"``
    expands to ``"can not"`` rather than being split into ``"Ca not"`` by the
    generic ``n't`` rule.

    Parameters
    ----------
    phrase : str
        Text possibly containing contractions written with ``'``.

    Returns
    -------
    str
        The text with each contraction replaced by its expanded form.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        phrase = re.sub(pattern, replacement, phrase, flags=re.IGNORECASE)
    return phrase

def text_preprocessing(text_col):
    """Normalise every string in ``text_col`` and return them as a list.

    For each entry: expand contractions with ``decontracted``, replace the
    literal two-character sequences backslash-r, backslash-quote and
    backslash-n with a space, collapse each run of characters outside
    ``A-Za-z0-9`` into one space, then lower-case and strip the result.
    Progress is reported through ``tqdm``.
    """
    cleaned = []
    for raw in tqdm(text_col.values):
        text = decontracted(raw)
        for escaped in ('\\r', '\\"', '\\n'):
            text = text.replace(escaped, ' ')
        text = re.sub('[^A-Za-z0-9]+', ' ', text)
        cleaned.append(text.lower().strip())
    return cleaned

In [None]:
# Normalise the free-text columns of all three datasets: descriptions first, then names.
for text_column in ('item_description', 'name'):
    for frame in (train_data, validation_data, test_data):
        frame[text_column] = text_preprocessing(frame[text_column])

In [None]:
# Compare one description (row position 33) after and before cleaning, each
# followed by its space-separated token count. `train` is the untouched copy.
print(train_data['item_description'].iloc[33],len(train_data['item_description'].iloc[33].split(' ')))
print(train['item_description'].iloc[33],len(train['item_description'].iloc[33].split(' ')))

In [None]:
def clean_category(cate_col):
    """Reduce every label in ``cate_col`` to a single alphanumeric token.

    Each run of characters outside ``A-Za-z0-9`` becomes a space, and all
    spaces are then removed, so a multi-word label collapses into one token.
    Returns the cleaned labels as a list; progress is reported through ``tqdm``.
    """
    def _squash(label):
        label = re.sub('[^A-Za-z0-9]+', ' ', label)
        # The '&' replacement can no longer match here: the regex above has
        # already turned every non-alphanumeric character into a space.
        label = label.replace(' ', '').replace('&', '_')
        return label.strip()

    return [_squash(raw_label) for raw_label in tqdm(cate_col.values)]

In [None]:
# Collapse each sub-category label to one token, level by level, for all three datasets.
for level in ('sub_category1', 'sub_category2', 'sub_category3'):
    for frame in (train_data, validation_data, test_data):
        frame[level] = clean_category(frame[level])

In [None]:
# Brand name processing: collapse each brand to a single alphanumeric token.
for frame in (train_data, validation_data, test_data):
    frame['brand_name'] = clean_category(frame['brand_name'])

# **Vectorization**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# One bag-of-words encoding per category level. The vocabulary is learned from
# the training split only and reused for validation and test.
# fit_transform learns the vocabulary and encodes the training column in one
# pass instead of tokenizing it twice with fit() followed by transform().
countvectorizer = CountVectorizer()
bow_cat1_train = countvectorizer.fit_transform(train_data['sub_category1'])
bow_cat1_val = countvectorizer.transform(validation_data['sub_category1'])
bow_cat1_test = countvectorizer.transform(test_data['sub_category1'])

countvectorizer = CountVectorizer()
bow_cat2_train = countvectorizer.fit_transform(train_data['sub_category2'])
bow_cat2_val = countvectorizer.transform(validation_data['sub_category2'])
bow_cat2_test = countvectorizer.transform(test_data['sub_category2'])

# After this cell `countvectorizer` refers to the sub_category3 vectorizer.
countvectorizer = CountVectorizer()
bow_cat3_train = countvectorizer.fit_transform(train_data['sub_category3'])
bow_cat3_val = countvectorizer.transform(validation_data['sub_category3'])
bow_cat3_test = countvectorizer.transform(test_data['sub_category3'])

In [None]:
# Print the (rows, vocabulary size) of every category bag-of-words matrix.
shape_report = (
    ('cat1 train shape:', bow_cat1_train),
    ('cat1 validation shape:', bow_cat1_val),
    ('cat1 test shape:', bow_cat1_test),
    ('cat2 train shape:', bow_cat2_train),
    ('cat2 validation shape:', bow_cat2_val),
    ('cat2 test shape:', bow_cat2_test),
    ('cat3 train shape:', bow_cat3_train),
    ('cat3 validation shape:', bow_cat3_val),
    ('cat3 test shape:', bow_cat3_test),
)
for label, matrix in shape_report:
    print(label, matrix.shape)

In [None]:
# Preview the first three training rows after text and category cleaning.
train_data.head(3)

In [None]:
# Bag-of-words encoding of the cleaned brand names; vocabulary from the training split only.
# fit_transform avoids tokenizing the training column twice.
countvectorizer = CountVectorizer()
bow_brand_train = countvectorizer.fit_transform(train_data['brand_name'])
bow_brand_val = countvectorizer.transform(validation_data['brand_name'])
bow_brand_test = countvectorizer.transform(test_data['brand_name'])

In [None]:
# Preview only the first rows: converting the whole sparse matrix to a dense
# array allocates rows x vocabulary cells and can exhaust memory.
bow_brand_train[:5].toarray()

In [None]:
# Bag-of-words encoding of the listing names. min_df=10 drops terms that occur
# in fewer than 10 training names. The vocabulary comes from the training split only.
# fit_transform avoids tokenizing the training column twice.
countvectorizer = CountVectorizer(min_df=10)
bow_name_train = countvectorizer.fit_transform(train_data['name'])
bow_name_val = countvectorizer.transform(validation_data['name'])
bow_name_test = countvectorizer.transform(test_data['name'])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams of the cleaned descriptions. Terms in fewer
# than 10 training descriptions are dropped and the vocabulary is capped at
# 5000 features. The vocabulary and IDF weights come from the training split only.
# fit_transform avoids analysing the training descriptions twice.
tfidfvectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=10, max_features=5000)
tfidf_description_train = tfidfvectorizer.fit_transform(train_data['item_description'])
tfidf_description_val = tfidfvectorizer.transform(validation_data['item_description'])
tfidf_description_test = tfidfvectorizer.transform(test_data['item_description'])

In [None]:
from scipy.sparse import csr_matrix

# get_dummies only encodes object/string/category columns unless `columns=` is
# given, so numeric columns pass through unencoded. These two columns are
# presumably read as integers (TODO confirm dtypes), so name them explicitly to
# get real one-hot indicators.
one_hot_columns = ['item_condition_id', 'shipping']


def _one_hot(frame, reference_columns=None):
    """One-hot encode ``one_hot_columns`` of ``frame``.

    When ``reference_columns`` is given, the result is reindexed to exactly
    those columns (missing indicators filled with 0, unseen ones dropped) so
    every split ends up with the same feature layout.
    """
    dummies = pd.get_dummies(frame[one_hot_columns], columns=one_hot_columns)
    if reference_columns is not None:
        dummies = dummies.reindex(columns=reference_columns, fill_value=0)
    return dummies


train_dummies = _one_hot(train_data)
features_train = csr_matrix(train_dummies.values.astype(np.int8))
features_val = csr_matrix(_one_hot(validation_data, train_dummies.columns).values.astype(np.int8))
features_test = csr_matrix(_one_hot(test_data, train_dummies.columns).values.astype(np.int8))
print(features_train.shape)
print(features_val.shape)
print(features_test.shape)

In [None]:
from scipy.sparse import hstack

# Concatenate every feature block column-wise, in the same order for each split,
# and convert the result to CSR format.
train_blocks = (bow_cat1_train, bow_cat2_train, bow_cat3_train, bow_brand_train,
                bow_name_train, tfidf_description_train, features_train)
val_blocks = (bow_cat1_val, bow_cat2_val, bow_cat3_val, bow_brand_val,
              bow_name_val, tfidf_description_val, features_val)
test_blocks = (bow_cat1_test, bow_cat2_test, bow_cat3_test, bow_brand_test,
               bow_name_test, tfidf_description_test, features_test)

X_train = hstack(train_blocks).tocsr()
X_val = hstack(val_blocks).tocsr()
X_test = hstack(test_blocks).tocsr()

print("Shape of train data: ", X_train.shape)
print("Shape of cv data: ", X_val.shape)
print("Shape of test data: ", X_test.shape)

# **Model 1: Linear regression**

In [None]:
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_squared_error
# from sklearn.metrics import mean_squared_log_error

# linearregression=LinearRegression(normalize=True)
# linearregression.fit(X_train,train_data['log_prices'])  
# ytrain_predict=linearregression.predict(X_train)
# yval_predict=linearregression.predict(X_val)
# train_error=np.sqrt(mean_squared_error(train_data['log_prices'],ytrain_predict))
# val_error=np.sqrt(mean_squared_error(validation_data['log_prices'],yval_predict))
# print("RMSLE on train is {} RMSLE on cv is {}".format(train_error,val_error))



In [None]:
# yval_linear=linearregression.predict(X_val)
# ytest_linear=linearregression.predict(X_test)

In [None]:
# %matplotlib inline

# fig, ax = plt.subplots(1, 2,figsize=(16, 8))



# sns.regplot(ax=ax[0],x=train_data['log_prices'][40:80],y=ytrain_predict[40:80],marker="+")
# sns.regplot(ax=ax[1],x=validation_data['log_prices'][40:80],y=yval_predict[40:80],color="g",marker="+")



In [None]:
# from sklearn.linear_model import RidgeCV

# ridge_cv = RidgeCV(alphas=(0.01, 0.1, 1.0, 10.0), cv=3)
# ridge_cv.fit(X_train, train_data['log_prices'])
# yval_predict = ridge_cv.predict(X_val)
# print("Optimal alpha:",ridge_cv.alpha_)
# ridge_RMSLE = np.sqrt(mean_squared_error(validation_data['log_prices'],yval_predict))
# print("RMSLE:",ridge_RMSLE)

In [None]:

# ytrain_predict = ridge_cv.predict(X_train)
# print("Optimal alpha:",ridge_cv.alpha_)
# ridge_RMSLE = np.sqrt(mean_squared_error(train_data['log_prices'],ytrain_predict))
# print("RMSLE:",ridge_RMSLE)

In [None]:
# yval_ridge=ridge_cv.predict(X_val)
# ytest_ridge=ridge_cv.predict(X_test)

In [None]:
# submission_data = pd.read_csv('sample_submission_stg2.csv')
# submission_data.head(5)

In [None]:
# submission_data.shape

In [None]:
# submission_data.loc[:, 'price'] = np.expm1(ytest_ridge)

In [None]:
# submission_data.head(5)

In [None]:
# submission_data.to_csv('submission.csv', index=False)

In [None]:
# train_data = train[['name', 'price', 'item_condition_id', 'brand_name', 'shipping', 'item_description', 'cat_1', 'cat_2', 'cat_3']]
# test = test[['name', 'item_condition_id', 'brand_name', 'shipping', 'item_description', 'cat_1', 'cat_2', 'cat_3']]