## import statements , Authentication and installing packages.

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
from joblib import Parallel, delayed
import string
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
import time
from sklearn.preprocessing import OneHotEncoder

In [0]:
  !pip install nltk



In [0]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
#for connecting to google drive
!pip install -U -q PyDrive              

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

## Reading Dataset

In [0]:
link_train = 'https://drive.google.com/open?id=1V3SSQd5YcoLfJ8_0CTT4k_4-LKGn91iT'   #Shareable link for train.tsv in Google Drive 

In [0]:
fluff, train_id = link_train.split('=')
print (train_id) 

1V3SSQd5YcoLfJ8_0CTT4k_4-LKGn91iT


In [0]:
downloaded_train = drive.CreateFile({'id':train_id}) 
downloaded_train.GetContentFile('train.tsv') 

In [0]:
data_train = pd.read_table('train.tsv', engine='c')
print('Train size:',len(data_train))


Train size: 1482535


## EDA on Dataset 

In [0]:
data_train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [0]:
data_train.columns

Index(['train_id', 'name', 'item_condition_id', 'category_name', 'brand_name',
       'price', 'shipping', 'item_description'],
      dtype='object')

In [0]:
data_train.shape

(1482535, 8)

In [0]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1482535 entries, 0 to 1482534
Data columns (total 8 columns):
train_id             1482535 non-null int64
name                 1482535 non-null object
item_condition_id    1482535 non-null int64
category_name        1476208 non-null object
brand_name           849853 non-null object
price                1482535 non-null float64
shipping             1482535 non-null int64
item_description     1482531 non-null object
dtypes: float64(1), int64(3), object(4)
memory usage: 90.5+ MB


In [0]:
data_train.describe()

Unnamed: 0,train_id,item_condition_id,price,shipping
count,1482535.0,1482535.0,1482535.0,1482535.0
mean,741267.0,1.90738,26.73752,0.4472744
std,427971.1,0.9031586,38.58607,0.4972124
min,0.0,1.0,0.0,0.0
25%,370633.5,1.0,10.0,0.0
50%,741267.0,2.0,17.0,0.0
75%,1111900.0,3.0,29.0,1.0
max,1482534.0,5.0,2009.0,1.0


In [0]:
data_train.isnull().sum()

train_id                  0
name                      0
item_condition_id         0
category_name          6327
brand_name           632682
price                     0
shipping                  0
item_description          4
dtype: int64

In [0]:
len(data_train[data_train['item_description'] == 'No description yet'])

82489

## Price Column

In [0]:
# log transformation of price variable, to have a normal distribution.
log_transform_price = np.log(data_train['price'] + 1)
log_transform_price.describe()

count    1.482535e+06
mean     2.979059e+00
std      7.492094e-01
min      0.000000e+00
25%      2.397895e+00
50%      2.890372e+00
75%      3.401197e+00
max      7.605890e+00
Name: price, dtype: float64

In [0]:
data_train['log_transform_price'] = log_transform_price

## Description and Price_Category Statistics

In [0]:
import re
import math
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [0]:
print('Generating features with statistics for item description textual content')

acronyms_regex = re.compile('([A-Z\-0-9]{2,})')
hashtag_regex = re.compile(r'(#[a-z]{2,})')

#Extracts statistics for each description, words lengths, like percentage of upper-case words, hashtags, etc
def extract_counts(text):
    text_size_words_counts = len(text.split(' '))
    text_size_words_log_counts = math.log1p(text_size_words_counts)
    full_uppercase_perc = len(acronyms_regex.findall(text)) / float(text_size_words_counts)
    exclamation_log_count = math.log1p(text.count('!'))
    star_log_count = math.log1p(text.count('*'))
    percentage_log_count = math.log1p(text.count('%'))
    price_removed_marker_log_count = math.log1p(text.count('[rm]'))
    hashtag_log_count = math.log1p(len(hashtag_regex.findall(text)))    
    return [text_size_words_log_counts,
            full_uppercase_perc,
            exclamation_log_count,
            star_log_count,            
            percentage_log_count,
            price_removed_marker_log_count,
            hashtag_log_count]



Generating features with statistics for item description textual content


In [0]:
item_descr_counts = np.vstack(data_train['item_description'].astype(str).apply(extract_counts).values)

item_descr_counts_scaler = StandardScaler(copy=True)
desc_stats = item_descr_counts_scaler.fit_transform(item_descr_counts)

In [0]:
del(item_descr_counts)

#### Price Statistics

In [0]:
train_idxs = np.arange(0,len(data_train))

In [0]:
print('Generating features from category statistics for price ...')

CAT_STATS_MIN_COUNT = 5
STD_SIGMAS = 2

# df_train['price_log'] = price_log
cats_stats_df = data_train.iloc[train_idxs].groupby(['category_name', 'brand_name', 'shipping']).agg({'category_name': len,
                                                     'log_transform_price': [np.median, np.mean, np.std]})
cats_stats_df.columns = ['count','price_log_median', 'price_log_mean', 'price_log_std']
#Removing categories without a minimum threshold of samples, to avoid price data leak 
cats_stats_df.drop(cats_stats_df[cats_stats_df['count'] < CAT_STATS_MIN_COUNT].index, inplace=True)
cats_stats_df['price_log_std'] = cats_stats_df['price_log_std'].fillna(0)
cats_stats_df['price_log_conf_variance'] = cats_stats_df['price_log_std'] / cats_stats_df['price_log_mean']
cats_stats_df['count_log'] = np.log1p(cats_stats_df['count'])
cats_stats_df['min_expected_log_price'] = (cats_stats_df['price_log_mean'] - cats_stats_df['price_log_std']*STD_SIGMAS).clip(lower=1.0)
cats_stats_df['max_expected_log_price'] = (cats_stats_df['price_log_mean'] + cats_stats_df['price_log_std']*STD_SIGMAS)




Generating features from category statistics for price ...


In [0]:
def merge_with_cat_stats(df):
    return df.merge(cats_stats_df.reset_index(), how='left', 
            on=['category_name', 'brand_name', 'shipping'])[['price_log_median', 'price_log_mean', 'price_log_std', 
                                               'price_log_conf_variance', 'count_log', 'min_expected_log_price', 'max_expected_log_price']].fillna(0).values

cats_stats_features_scaler = StandardScaler(copy=True)
price_cat_stats = cats_stats_features_scaler.fit_transform(merge_with_cat_stats(data_train))

In [0]:
#Joining the dense features
price_cat_desc_stats = np.hstack([desc_stats, price_cat_stats])

In [0]:
del(desc_stats)
del(price_cat_stats)

In [0]:
price_cat_desc_stats.shape

(1482535, 14)

In [0]:
price_cat_desc_stats_df = pd.DataFrame(price_cat_desc_stats, columns=['words_log_counts','uppercase_perc','exclamation_log_count','star_log_count','percentage_log_count',
            'rm_log_count','hashtag_log_count','price_log_median', 'price_log_mean', 'price_log_std', 'price_log_conf_variance',
            'count_log', 'min_expected_log_price', 'max_expected_log_price'])

In [0]:
data_train = pd.concat([data_train,price_cat_desc_stats_df], axis=1)

In [0]:
data_train.shape

(1482535, 23)

## Item Category or Category Name

In [0]:
data_train['category_name'].value_counts()[:5]

Women/Athletic Apparel/Pants, Tights, Leggings    60177
Women/Tops & Blouses/T-Shirts                     46380
Beauty/Makeup/Face                                34335
Beauty/Makeup/Lips                                29910
Electronics/Video Games & Consoles/Games          26557
Name: category_name, dtype: int64

In [0]:
print("There are %d unique main categories." % data_train['category_name'].nunique())

There are 1287 unique main categories.


In [0]:
data_train[data_train['category_name'] == 'Category Unknown']

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,log_transform_price,words_log_counts,uppercase_perc,exclamation_log_count,star_log_count,percentage_log_count,rm_log_count,hashtag_log_count,price_log_median,price_log_mean,price_log_std,price_log_conf_variance,count_log,min_expected_log_price,max_expected_log_price


In [0]:
data_train['category_name'].isnull().sum()

6327

In [0]:
def split_category(text):
    try: return text.split("/")
    except: return ("Category Unknown", "Category Unknown", "Category Unknown")
    
data_train['main_category'], data_train['subcategory_1'], data_train['subcategory_2'] = zip(*data_train['category_name'].apply(lambda x: split_category(x)))
data_train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,log_transform_price,words_log_counts,uppercase_perc,exclamation_log_count,star_log_count,percentage_log_count,rm_log_count,hashtag_log_count,price_log_median,price_log_mean,price_log_std,price_log_conf_variance,count_log,min_expected_log_price,max_expected_log_price,main_category,subcategory_1,subcategory_2
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet,2.397895,-1.435819,-0.463781,-0.514153,-0.201971,-0.216047,-0.287195,-0.061017,-1.019041,-1.022763,-0.969067,-0.981992,-0.954136,-1.003299,-1.021501,Men,Tops,T-shirts
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,3.970292,0.827068,-0.281394,-0.514153,-0.201971,-0.216047,-0.287195,-0.061017,1.382559,1.348661,0.670262,0.367732,-0.067626,1.623006,1.189453,Electronics,Computers & Tablets,Components & Parts
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,2.397895,0.613741,-0.010959,1.63831,-0.201971,-0.216047,-0.287195,-0.061017,0.47669,0.467825,0.339902,0.732595,-0.159609,0.508109,0.440447,Women,Tops & Blouses,Blouse
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...,3.583519,0.71069,-0.463781,-0.514153,-0.201971,-0.216047,2.19836,-0.061017,-1.019041,-1.022763,-0.969067,-0.981992,-0.954136,-1.003299,-1.021501,Home,Home Décor,Home Décor Accents
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity,3.806662,-1.02338,-0.463781,-0.514153,-0.201971,-0.216047,-0.287195,-0.061017,-1.019041,-1.022763,-0.969067,-0.981992,-0.954136,-1.003299,-1.021501,Women,Jewelry,Necklaces


##  Brand

In [0]:
data_train['brand_name'].isnull().sum()

632682

**Filling Null Brands**

In [0]:
data_train.brand_name.fillna('unk_brand', inplace=True) #Replaceing all the null values with unk_brand

In [0]:
# Creating a dict of brand names as keys and category list as values.

# Returning Unique List.
def concat_categories(x):
    return set(x.values)

#Getting unique categories for each brand as a dict 
brand_names_categories = dict(data_train[data_train['brand_name'] != 'unk_brand'][['brand_name','category_name']].astype('str').groupby('brand_name').agg(concat_categories).reset_index().values.tolist())
# print(brand_names_categories)


In [0]:
#Validating unique categories for each brand as a dict 
data_train[data_train['brand_name'] == '% Pure']['category_name'].unique()

array(['Beauty/Makeup/Lips', 'Beauty/Makeup/Face',
       'Beauty/Skin Care/Body', 'Beauty/Makeup/Makeup Palettes',
       'Beauty/Skin Care/Face', 'Beauty/Fragrance/Candles & Home Scents',
       'Beauty/Hair Care/Shampoo & Conditioner Sets',
       'Beauty/Makeup/Eyes', 'Beauty/Bath & Body/Bath',
       'Beauty/Bath & Body/Scrubs & Body Treatments',
       'Beauty/Skin Care/Maternity', 'Beauty/Skin Care/Feet',
       'Beauty/Fragrance/Men', 'Beauty/Makeup/Makeup Sets',
       'Beauty/Hair Care/Hair & Scalp Treatments',
       'Beauty/Hair Care/Hair Loss Products', 'Beauty/Fragrance/Kids',
       'Beauty/Fragrance/Women', 'Beauty/Skin Care/Eyes',
       'Beauty/Skin Care/Sets & Kits'], dtype=object)

In [0]:
#Brands sorted by length (decreasinly), so that longer brand names have precedence in the null brand search
brands_sorted_by_size = list(sorted(filter(lambda y: len(y) >= 3, list(brand_names_categories.keys())), key = lambda x: -len(x)))

In [0]:
#Count of unknow brand in the dataset.
brand_name_null_count = len(data_train.loc[data_train['brand_name'] == 'unk_brand'])

In [0]:
brand_name_null_count

632682

In [0]:
#Try to guess the Brand based on Name and Category. Returning brand name if brand is 'name' and category in 'brand_names_categories'.
def brandfinder(name, category):    
    for brand in brands_sorted_by_size:
        if brand in name and category in brand_names_categories[brand]:
          
            return brand
        
    return 'unk_brand'


In [0]:
train_names_unknown_brands = data_train[data_train['brand_name'] == 'unk_brand'][['name','category_name']].astype('str').values

In [0]:
len(train_names_unknown_brands)

632682

In [0]:
train_estimated_brands = []
for name, category in train_names_unknown_brands:
  train_estimated_brands.append(brandfinder(name,category))

In [0]:

# train_estimated_brands = Parallel(n_jobs=1)(delayed(brandfinder)(name, category) for name, category in train_names_unknown_brands) #Returns generator object.
#print(train_estimated_brands)
data_train.loc[data_train['brand_name'] == 'unk_brand', 'brand_name'] = train_estimated_brands

found = brand_name_null_count-len(data_train.loc[data_train['brand_name'] == 'unk_brand'])
print("Null brands found: %d from %d" % (found, brand_name_null_count))

Null brands found: 87216 from 632682


In [0]:
data_train['brand_name'].isnull().sum()

0

In [0]:
data_train[data_train['brand_name'] == 'unk_brand'].shape

(545466, 26)

In [0]:
data_train.drop(columns={'price','train_id','category_name'},inplace=True)

In [0]:
data_train.columns

Index(['name', 'item_condition_id', 'brand_name', 'shipping',
       'item_description', 'log_transform_price', 'words_log_counts',
       'uppercase_perc', 'exclamation_log_count', 'star_log_count',
       'percentage_log_count', 'rm_log_count', 'hashtag_log_count',
       'price_log_median', 'price_log_mean', 'price_log_std',
       'price_log_conf_variance', 'count_log', 'min_expected_log_price',
       'max_expected_log_price', 'main_category', 'subcategory_1',
       'subcategory_2'],
      dtype='object')

In [0]:
del(brand_name_null_count)
del(found)
del(brand_names_categories)


In [0]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()

In [0]:
data_train['main_category'] = labelencoder_X.fit_transform(data_train['main_category'])

In [0]:
data_train['subcategory_1'] = labelencoder_X.fit_transform(data_train['subcategory_1'])

In [0]:
data_train['subcategory_2'] = labelencoder_X.fit_transform(data_train['subcategory_2'])

In [0]:
data_train['brand_name'] = labelencoder_X.fit_transform(data_train['brand_name'])

In [0]:
data_train.head()

Unnamed: 0,name,item_condition_id,brand_name,shipping,item_description,log_transform_price,words_log_counts,uppercase_perc,exclamation_log_count,star_log_count,percentage_log_count,rm_log_count,hashtag_log_count,price_log_median,price_log_mean,price_log_std,price_log_conf_variance,count_log,min_expected_log_price,max_expected_log_price,main_category,subcategory_1,subcategory_2
0,MLB Cincinnati Reds T Shirt Size XL,3,2670,1,No description yet,2.397895,-1.435819,-0.463781,-0.514153,-0.201971,-0.216047,-0.287195,-0.061017,-1.019041,-1.022763,-0.969067,-0.981992,-0.954136,-1.003299,-1.021501,6,103,763
1,Razer BlackWidow Chroma Keyboard,3,3557,0,This keyboard is in great condition and works ...,3.970292,0.827068,-0.281394,-0.514153,-0.201971,-0.216047,-0.287195,-0.061017,1.382559,1.348661,0.670262,0.367732,-0.067626,1.623006,1.189453,2,31,213
2,AVA-VIV Blouse,1,4180,1,Adorable top with a hint of lace and a key hol...,2.397895,0.613741,-0.010959,1.63831,-0.201971,-0.216047,-0.287195,-0.061017,0.47669,0.467825,0.339902,0.732595,-0.159609,0.508109,0.440447,10,104,94
3,Leather Horse Statues,1,4807,1,New with tags. Leather horses. Retail for [rm]...,3.583519,0.71069,-0.463781,-0.514153,-0.201971,-0.216047,2.19836,-0.061017,-1.019041,-1.022763,-0.969067,-0.981992,-0.954136,-1.003299,-1.021501,4,56,405
4,24K GOLD plated rose,1,4807,0,Complete with certificate of authenticity,3.806662,-1.02338,-0.463781,-0.514153,-0.201971,-0.216047,-0.287195,-0.061017,-1.019041,-1.022763,-0.969067,-0.981992,-0.954136,-1.003299,-1.021501,10,59,537


In [0]:
data_train.columns

Index(['name', 'item_condition_id', 'brand_name', 'shipping',
       'item_description', 'log_transform_price', 'words_log_counts',
       'uppercase_perc', 'exclamation_log_count', 'star_log_count',
       'percentage_log_count', 'rm_log_count', 'hashtag_log_count',
       'price_log_median', 'price_log_mean', 'price_log_std',
       'price_log_conf_variance', 'count_log', 'min_expected_log_price',
       'max_expected_log_price', 'main_category', 'subcategory_1',
       'subcategory_2'],
      dtype='object')

## Word2vec on item desc 

In [0]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
import re
# https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
def decontracted(phrase):
   # specific
   phrase = re.sub(r"won't", "will not", phrase)
   phrase = re.sub(r"can\'t", "can not", phrase)

   # general
   phrase = re.sub(r"n\'t", " not", phrase)
   phrase = re.sub(r"\'re", " are", phrase)
   phrase = re.sub(r"\'s", " is", phrase)
   phrase = re.sub(r"\'d", " would", phrase)
   phrase = re.sub(r"\'ll", " will", phrase)
   phrase = re.sub(r"\'t", " not", phrase)
   phrase = re.sub(r"\'ve", " have", phrase)
   phrase = re.sub(r"\'m", " am", phrase)
   return phrase

In [0]:
!pip install --upgrade gensim

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/d1/dd/112bd4258cee11e0baaaba064060eb156475a42362e59e3ff28e7ca2d29d/gensim-3.8.1-cp36-cp36m-manylinux1_x86_64.whl (24.2MB)
[K     |████████████████████████████████| 24.2MB 368kB/s 
Installing collected packages: gensim
  Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-3.8.1


In [0]:
 from nltk.corpus import stopwords
 from gensim.models import Word2Vec

In [0]:
stop = set(stopwords.words('english')) 

In [0]:
def tokenize(text):
    """
    sent_tokenize(): segment text into sentences
    word_tokenize(): break sentences into words
    """
    try: 
        regex = re.compile('[' +re.escape(string.punctuation) + '\\"\\r\\t\\n]') #Removing Quotations,Carraige return,newline char, tab.
        text = regex.sub(" ", text) # remove punctuation
        
        tokens_ = [word_tokenize(s) for s in sent_tokenize(text)]
        tokens = []
        for token_by_sent in tokens_:
            tokens += token_by_sent
        tokens = list(filter(lambda t: t.lower() not in stop, tokens)) # Remove stop words
        filtered_tokens = [decontracted(w) for w in tokens if re.search('[a-zA-Z0-9]', w)]
        filtered_tokens = [w.lower().strip() for w in filtered_tokens if len(w)>=3] # Changing to lower case and strpping the word , Removing words with lenght less than 3.
        
        return filtered_tokens
            
    except TypeError as e: print(text,e)


In [0]:
data_train['item_description'].isnull().sum() #Checking Null Values

4

In [0]:
data_train['item_description'].fillna('No description yet',inplace = True) #Replacing Null Values with "No description yet"

In [0]:
data_train['item_description'].isnull().sum() #Crosschecking Null values

0

In [0]:
from tqdm import tqdm

data_train['clean_description'] = tqdm(data_train['item_description'].map(tokenize))

100%|██████████| 1482535/1482535 [00:00<00:00, 2768940.99it/s]


In [0]:
# Create a list of strings, where each string is an article title
description_list = [desc for desc in data_train['item_description']]

In [0]:
def document_vector(word2vec_model, doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.wv.vocab]
    return np.mean(word2vec_model[doc], axis=0)

# Here, we need each document to remain a document 
def preprocess(text):
    text = text.lower()
    doc = word_tokenize(text)
    doc = [word for word in doc if word not in stopwords]
    doc = [word for word in doc if word.isalpha()] 
    return doc

# Function that will help us drop documents that have no word vectors in word2vec
def has_vector_representation(word2vec_model, doc):
    """check if at least one word of the document is in the
    word2vec dictionary"""
    return not all(word not in word2vec_model.wv.vocab for word in doc)

# Filter out documents
def filter_docs(corpus, texts, condition_on_doc):
    """
    Filter corpus and texts given the function condition_on_doc which takes
    a doc. The document doc is kept if condition_on_doc(doc) is true.
    """
    number_of_docs = len(corpus)

    if texts is not None:
        texts = [text for (text, doc) in zip(texts, corpus)
                 if condition_on_doc(doc)]

    corpus = [doc for doc in corpus if condition_on_doc(doc)]

    print("{} docs removed".format(number_of_docs - len(corpus)))

    return (corpus, texts)

In [0]:
description_list[:5]

['No description yet',
 'This keyboard is in great condition and works like it came out of the box. All of the ports are tested and work perfectly. The lights are customizable via the Razer Synapse app on your PC.',
 'Adorable top with a hint of lace and a key hole in the back! The pale pink is a 1X, and I also have a 3X available in white!',
 'New with tags. Leather horses. Retail for [rm] each. Stand about a foot high. They are being sold as a pair. Any questions please ask. Free shipping. Just got out of storage',
 'Complete with certificate of authenticity']

In [0]:
corpus = list(data_train['clean_description'])

In [0]:
len(corpus)

1482535

In [0]:
# train model
model = Word2Vec(data_train['clean_description'], min_count=10, size= 350)

In [0]:
len(model.wv.vocab)

34588

In [0]:
# from google.colab import drive
# drive.mount('/content/gdrive',force_remount=True)

In [0]:
# cd '/content/gdrive/My Drive/Data'

In [0]:
# import os
# # Set input directory, change working directory
# inDir = '/content/gdrive/My Drive/Data' + os.sep  # Set input directory to the current working directory
# os.chdir(inDir)               # Change to working directory

In [0]:
# #serializing our model to a file called model.pkl
# import pickle
# pickle.dump(model, open(inDir+"/Word2Vec_desc.pkl","wb"))


In [0]:
# summarize vocabulary
words = list(model.wv.vocab)

In [0]:
len(words)

34588

In [0]:
# Remove docs that don't include any words in W2V's vocab
corpus, description_list = filter_docs(corpus, description_list, lambda doc: has_vector_representation(model, doc))

3712 docs removed


In [0]:
# # Filter out any empty docs
corpus, description_list = filter_docs(corpus, description_list, lambda doc: (len(doc) != 0))

0 docs removed


In [0]:
# Initialize an array for the size of the corpus
x_desc = []
for doc in corpus: # append the vector for each document
    x_desc.append(document_vector(model, doc))
    


In [0]:
X_desc = np.array(x_desc) # list to array

In [0]:
type(X_desc)

numpy.ndarray

In [0]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

sc_desc = StandardScaler()
X_desc = sc_desc.fit_transform(X_desc)

In [0]:
from sklearn.decomposition import PCA

pca_desc = PCA(n_components=120, random_state=10)

# X is the array with our 350-dimensional vectors
reduced_vecs_desc = pca_desc.fit_transform(X_desc)

In [0]:
np.sum(pca_desc.explained_variance_ratio_)

0.8870151973897038

In [0]:
df_w_vectors_desc = pd.DataFrame(reduced_vecs_desc)

In [0]:
for i in range(len(df_w_vectors_desc.columns)):
  df_w_vectors_desc.rename(columns={i:'desc_' + str(i)},inplace=True)

In [0]:
main_w_vectors = pd.concat((data_train,df_w_vectors_desc), axis=1)

In [0]:
# Get rid of vectors that couldn't be matched with the main_df
main_w_vectors.dropna(axis=0, inplace=True)

main_w_vectors.head(4)

Unnamed: 0,name,item_condition_id,brand_name,shipping,item_description,log_transform_price,words_log_counts,uppercase_perc,exclamation_log_count,star_log_count,percentage_log_count,rm_log_count,hashtag_log_count,price_log_median,price_log_mean,price_log_std,price_log_conf_variance,count_log,min_expected_log_price,max_expected_log_price,main_category,subcategory_1,subcategory_2,clean_description,desc_0,desc_1,desc_2,desc_3,desc_4,desc_5,desc_6,desc_7,desc_8,desc_9,desc_10,desc_11,desc_12,desc_13,desc_14,desc_15,...,desc_80,desc_81,desc_82,desc_83,desc_84,desc_85,desc_86,desc_87,desc_88,desc_89,desc_90,desc_91,desc_92,desc_93,desc_94,desc_95,desc_96,desc_97,desc_98,desc_99,desc_100,desc_101,desc_102,desc_103,desc_104,desc_105,desc_106,desc_107,desc_108,desc_109,desc_110,desc_111,desc_112,desc_113,desc_114,desc_115,desc_116,desc_117,desc_118,desc_119
0,MLB Cincinnati Reds T Shirt Size XL,3,2670,1,No description yet,2.397895,-1.435819,-0.463781,-0.514153,-0.201971,-0.216047,-0.287195,-0.061017,-1.019041,-1.022763,-0.969067,-0.981992,-0.954136,-1.003299,-1.021501,6,103,763,"[description, yet]",9.84202,-11.486503,-3.520439,-2.220166,-10.527919,2.234902,-1.910435,1.706773,-0.362397,0.520333,0.146969,0.129355,-0.567337,0.306127,-0.277588,0.922844,...,-0.028503,0.004085,-0.021844,-0.067789,0.042742,0.034206,-0.02543,-0.024604,-0.008309,0.030425,0.005044,0.009052,0.055379,0.022141,-0.014498,0.014219,0.032253,-0.023225,0.000518,-0.046104,0.011702,0.012552,0.013088,0.020372,0.011158,-0.038248,-0.030518,-0.000418,-0.022221,-0.020727,-0.026389,0.011695,-0.025739,-0.018654,0.007785,-0.022853,0.029405,0.019609,-0.007999,0.018744
1,Razer BlackWidow Chroma Keyboard,3,3557,0,This keyboard is in great condition and works ...,3.970292,0.827068,-0.281394,-0.514153,-0.201971,-0.216047,-0.287195,-0.061017,1.382559,1.348661,0.670262,0.367732,-0.067626,1.623006,1.189453,2,31,213,"[keyboard, great, condition, works, like, came...",5.07797,0.609434,8.452345,-3.698251,1.349674,1.624934,1.034534,-3.599739,0.964364,0.385444,3.802707,-0.003164,-1.360943,1.394307,3.018989,0.154816,...,-0.223059,-0.664977,-0.399142,-0.200006,-0.589547,0.365861,0.431848,0.310675,-1.26374,0.170311,0.176919,-0.094442,0.718282,0.098416,-0.298271,-0.521123,-0.298386,-0.709927,0.739374,-1.039741,-0.634561,-0.732742,-0.42745,-0.684142,0.31811,0.020955,0.04947,-0.129237,0.261078,0.145693,-0.64784,-0.52086,-0.681478,-0.706819,0.067754,-0.422033,0.725991,-0.105437,-0.48456,0.097155
2,AVA-VIV Blouse,1,4180,1,Adorable top with a hint of lace and a key hol...,2.397895,0.613741,-0.010959,1.63831,-0.201971,-0.216047,-0.287195,-0.061017,0.47669,0.467825,0.339902,0.732595,-0.159609,0.508109,0.440447,10,104,94,"[adorable, top, hint, lace, key, hole, back, p...",-5.251092,-4.758384,-4.926702,-1.778259,5.85339,4.831553,1.337444,3.458719,-2.616437,0.047622,-1.078015,1.633128,-2.27631,1.950964,-0.508073,-1.413029,...,0.62425,0.469235,-0.32063,-0.128002,0.448073,0.797282,1.442032,0.207243,0.641649,-1.496923,0.246524,-0.823957,0.263986,-0.295695,0.073548,-0.222687,-0.382156,1.15098,-0.675416,-0.536692,-0.23365,-0.622953,-0.239287,-0.635542,-0.059475,-0.396394,-0.163471,-0.306831,0.165443,1.055764,-0.413534,0.362749,-0.299999,1.168596,-0.616425,0.304615,-1.019968,-0.91803,-0.29368,-0.367551
3,Leather Horse Statues,1,4807,1,New with tags. Leather horses. Retail for [rm]...,3.583519,0.71069,-0.463781,-0.514153,-0.201971,-0.216047,2.19836,-0.061017,-1.019041,-1.022763,-0.969067,-0.981992,-0.954136,-1.003299,-1.021501,4,56,405,"[new, tags, leather, horses, retail, stand, fo...",3.188745,1.649122,1.87519,4.062757,-0.625134,2.105159,0.3086,-1.347356,-1.50942,0.568095,-0.493427,0.055428,1.753572,-0.111314,-0.00043,0.659424,...,-0.63664,1.019804,0.06868,-0.099139,-0.64096,-0.016385,-0.300839,-0.524527,0.212483,-0.619008,0.308007,-0.077578,0.244838,-1.035328,-0.215727,-0.66725,-0.437636,0.374459,0.170085,0.501118,0.213649,0.394042,-0.205296,-1.594879,0.60465,0.151598,0.252996,0.281229,-0.144893,-0.691194,0.116838,-0.388671,0.257211,1.023798,0.688899,0.84247,-0.310458,-0.106074,-0.370539,0.091064


In [0]:
main_w_vectors.drop(columns=['item_description','clean_description'],inplace= True)

In [0]:
type(main_w_vectors)

pandas.core.frame.DataFrame

## word2vec on name

In [0]:
main_w_vectors['clean_name'] = tqdm(main_w_vectors['name'].map(tokenize))

100%|██████████| 1478823/1478823 [00:00<00:00, 3489485.20it/s]


In [0]:
# Create a list of strings, where each string is an article title
name_list = [name for name in main_w_vectors['name']]

In [0]:
# Preprocess the corpus
# corpus_name = [preprocess(name) for name in name_list]

In [0]:
corpus_name = list(main_w_vectors['clean_name'])

In [0]:
len(corpus_name)

1478823

In [0]:
# train model
model_name = Word2Vec(main_w_vectors['clean_name'], min_count=5, size = 250)

In [0]:
# Check dimension of word vectors
model_name.vector_size

250

In [0]:
# model_name.similarity('women','men')

In [0]:
# summarize vocabulary
words_name = list(model_name.wv.vocab)

In [0]:
len(words_name)

25078

In [0]:
# Remove docs that don't include any words in W2V's vocab
corpus_name, name_list = filter_docs(corpus_name, name_list, lambda doc: has_vector_representation(model_name, doc))

5701 docs removed


In [0]:
# Filter out any empty docs
corpus_name, name_list = filter_docs(corpus_name, name_list, lambda doc: (len(doc) != 0))

0 docs removed


In [0]:
len(corpus_name)

1473122

In [0]:
# Initialize an array for the size of the corpus
y_name = []
for doc in corpus_name: # append the vector for each document

    y_name.append(document_vector(model_name, doc))
    
Y_name = np.array(y_name) # list to array

In [0]:
sc_name = StandardScaler()
Y_name = sc_name.fit_transform(Y_name)

In [0]:
from sklearn.decomposition import PCA

pca_name = PCA(n_components=80, random_state=10)

# x is the array with our 300-dimensional vectors
reduced_name = pca_name.fit_transform(Y_name)

In [0]:
np.sum(pca_name.explained_variance_ratio_)

0.9100646341224334

In [0]:
df_name = pd.DataFrame(reduced_name)

In [0]:
for i in range(len(df_name.columns)):
  df_name.rename(columns={i:'name_' + str(i)},inplace=True)

In [0]:
df_name.columns

Index(['name_0', 'name_1', 'name_2', 'name_3', 'name_4', 'name_5', 'name_6',
       'name_7', 'name_8', 'name_9', 'name_10', 'name_11', 'name_12',
       'name_13', 'name_14', 'name_15', 'name_16', 'name_17', 'name_18',
       'name_19', 'name_20', 'name_21', 'name_22', 'name_23', 'name_24',
       'name_25', 'name_26', 'name_27', 'name_28', 'name_29', 'name_30',
       'name_31', 'name_32', 'name_33', 'name_34', 'name_35', 'name_36',
       'name_37', 'name_38', 'name_39', 'name_40', 'name_41', 'name_42',
       'name_43', 'name_44', 'name_45', 'name_46', 'name_47', 'name_48',
       'name_49', 'name_50', 'name_51', 'name_52', 'name_53', 'name_54',
       'name_55', 'name_56', 'name_57', 'name_58', 'name_59', 'name_60',
       'name_61', 'name_62', 'name_63', 'name_64', 'name_65', 'name_66',
       'name_67', 'name_68', 'name_69', 'name_70', 'name_71', 'name_72',
       'name_73', 'name_74', 'name_75', 'name_76', 'name_77', 'name_78',
       'name_79'],
      dtype='object')

In [0]:
data1 = pd.concat((main_w_vectors,df_name), axis=1)

In [0]:
data1.drop(columns=['name','clean_name'],inplace=True)

In [0]:
data1.head(3)

Unnamed: 0,item_condition_id,brand_name,shipping,log_transform_price,words_log_counts,uppercase_perc,exclamation_log_count,star_log_count,percentage_log_count,rm_log_count,hashtag_log_count,price_log_median,price_log_mean,price_log_std,price_log_conf_variance,count_log,min_expected_log_price,max_expected_log_price,main_category,subcategory_1,subcategory_2,desc_0,desc_1,desc_2,desc_3,desc_4,desc_5,desc_6,desc_7,desc_8,desc_9,desc_10,desc_11,desc_12,desc_13,desc_14,desc_15,desc_16,desc_17,desc_18,...,name_40,name_41,name_42,name_43,name_44,name_45,name_46,name_47,name_48,name_49,name_50,name_51,name_52,name_53,name_54,name_55,name_56,name_57,name_58,name_59,name_60,name_61,name_62,name_63,name_64,name_65,name_66,name_67,name_68,name_69,name_70,name_71,name_72,name_73,name_74,name_75,name_76,name_77,name_78,name_79
0,3,2670,1,2.397895,-1.435819,-0.463781,-0.514153,-0.201971,-0.216047,-0.287195,-0.061017,-1.019041,-1.022763,-0.969067,-0.981992,-0.954136,-1.003299,-1.021501,6,103,763,9.84202,-11.486503,-3.520439,-2.220166,-10.527919,2.234902,-1.910435,1.706773,-0.362397,0.520333,0.146969,0.129355,-0.567337,0.306127,-0.277588,0.922844,-0.287309,0.374081,-0.068961,...,0.385514,-0.252826,-0.319131,-0.067855,0.749231,-0.160498,0.004782,0.708699,0.618897,-0.672254,0.893254,0.35519,-0.20308,0.15101,0.026982,0.154432,-0.55814,0.068902,0.138718,-0.202356,0.085206,0.169362,0.018004,-0.413109,-0.057702,0.001333,0.200521,-0.163082,-0.034779,0.455513,0.036017,-0.207082,0.07509,-0.238771,-0.289314,0.798458,0.260261,-0.078188,-0.826296,-0.079283
1,3,3557,0,3.970292,0.827068,-0.281394,-0.514153,-0.201971,-0.216047,-0.287195,-0.061017,1.382559,1.348661,0.670262,0.367732,-0.067626,1.623006,1.189453,2,31,213,5.07797,0.609434,8.452345,-3.698251,1.349674,1.624934,1.034534,-3.599739,0.964364,0.385444,3.802707,-0.003164,-1.360943,1.394307,3.018989,0.154816,-0.854771,1.602899,-0.219291,...,-0.795553,-0.130088,-0.406521,0.811343,-0.033605,-0.699736,0.910489,0.40106,0.247002,-0.185844,0.849177,0.322614,-0.645346,-0.586519,0.259081,0.272965,-0.268578,0.436513,0.60214,-0.078495,-0.146016,-0.800016,-0.455247,-0.161626,0.379922,0.519259,-0.541651,-0.80069,0.303956,-0.575657,-0.147761,-0.233178,-0.02247,0.516617,-0.124734,0.229934,-0.008219,-0.408252,-0.56776,-0.07694
2,1,4180,1,2.397895,0.613741,-0.010959,1.63831,-0.201971,-0.216047,-0.287195,-0.061017,0.47669,0.467825,0.339902,0.732595,-0.159609,0.508109,0.440447,10,104,94,-5.251092,-4.758384,-4.926702,-1.778259,5.85339,4.831553,1.337444,3.458719,-2.616437,0.047622,-1.078015,1.633128,-2.27631,1.950964,-0.508073,-1.413029,-2.627453,0.508493,0.992363,...,0.888561,0.724288,-0.293922,-1.447286,-0.714286,-0.7663,0.439559,0.190552,-0.165196,1.258393,-0.582235,0.471049,-0.329858,0.402286,-0.976609,-0.333018,0.123391,-0.260517,0.781612,0.451114,0.353105,0.10111,0.58831,0.666095,-0.12972,0.118294,0.401009,0.229195,1.040812,-0.438278,-0.740536,-0.840557,0.474108,0.551017,0.468068,-0.80627,-0.846082,0.055663,-0.208924,1.202005


## loading to pickle

In [0]:
# from google.colab import drive
# drive.mount('/content/gdrive',force_remount=True)

In [0]:
# import os

In [0]:
# # Set input directory, and change working directory
# inDir = '/content/gdrive/My Drive/Colab Notebooks/Mercari(DataMiningProject)'          # IMPORTANT: Update to reflect directory on your OS
# os.path.isdir(inDir)                                                               # Change to working directory


In [0]:
# cd '/content/gdrive/My Drive/Colab Notebooks/Mercari(DataMiningProject)'

In [0]:
# data1.to_pickle(inDir+"/labelEncoded_withStats.pkl")