In [2]:
# Mount Google Drive inside the Colab VM so the project files stored on Drive
# are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
# Work from the project folder on Drive: the cells below read and write files
# through relative paths ('train.tsv', 'tfidf.pickle', 'price_log.pickle').
os.chdir("/content/drive/My Drive/mercari_capstone2")
# List the folder as a quick check that the mount and the path are correct.
!ls -l

total 344117
-rw------- 1 root root   2441360 Jun 23 10:05  1-EDA.ipynb
-rw------- 1 root root     40878 Jun 25 11:05 '2-featureization and pickleing.ipynb'
-rw------- 1 root root     51452 Jun 25 11:10  3_Hyperparameter_tuning.ipynb
-rw------- 1 root root     49476 Jun 25 11:07 '4-train deploy.ipynb'
-rw------- 1 root root    122880 Jun 24 19:02  Mercari_to1.db
drwx------ 2 root root      4096 Jun 23 11:42  pickle
-rw------- 1 root root  11853920 Jun 23 12:33  price_log.pickle
-rw------- 1 root root         0 Jun 25 11:05  tfidf.pickle
-rw------- 1 root root 337809843 Nov 11  2017  train.tsv


In [4]:
# Replace the preinstalled scikit-learn with the pinned 1.0.2 release.
# This cell has to run before the cell that imports sklearn, otherwise the
# previously installed version is the one that gets loaded.
# NOTE(review): presumably pinned for compatibility with artifacts produced by
# the other notebooks of this project - confirm the reason and record it here.
!pip uninstall scikit-learn -y
!pip install scikit-learn==1.0.2

Found existing installation: scikit-learn 1.2.2
Uninstalling scikit-learn-1.2.2:
  Successfully uninstalled scikit-learn-1.2.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-learn==1.0.2
  Downloading scikit_learn-1.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.5/26.5 MB[0m [31m57.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn
Successfully installed scikit-learn-1.0.2


In [5]:
#importing modules/libraries
import pandas as pd
import numpy as np
import scipy
import gc

from tqdm.notebook import tqdm
import re
import random
import pickle
import inspect
import time

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelBinarizer,OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from sklearn.linear_model import Ridge


import string

import nltk
nltk.download("stopwords")

nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer





import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [6]:
#function to load data
def Data(clock,n_rows):
  """Read ``train.tsv`` from the working directory and prepare frame and target.

  Parameters
  ----------
  clock : float
      ``time.time()`` reference; the elapsed time since it is printed on exit.
  n_rows : int
      Number of rows to read; ``-1`` reads the whole file.

  Returns
  -------
  tuple
      ``(df, y, tr_len, n_total)`` where ``df`` is the filtered frame (with the
      pre-filter row label kept in an ``index`` column), ``y`` is
      ``log1p(price)``, ``tr_len`` is 80% of the row count (rounded) and
      ``n_total`` is the row count.
  """
  # nrows=None is the pandas default and means "read everything".
  row_limit = None if int(n_rows) == -1 else n_rows
  raw = pd.read_csv('train.tsv', sep='\t', nrows=row_limit)

  # Keep only listings priced above 1.0 (rows with a missing price are kept,
  # exactly as a label-based drop of the `price <= 1.0` rows would do).
  keep = ~(raw['price'] <= 1.0)
  # reset_index() without drop=True stores the old row label in an 'index'
  # column, which later stages expect to find.
  df = raw.drop(columns=['train_id'])[keep].reset_index()
  df['item_condition_id'] = df['item_condition_id'].astype('category')

  y = np.log1p(df['price'])
  n_total = df.shape[0]
  gc.collect()
  print(f'[{round((time.time() - clock),2)}] {inspect.stack()[0][3]} completed')
  return df, y, round(n_total*0.8), n_total

In [7]:
def Impute(df, tr_len,clock):
    """Fill missing text fields and collapse unseen brands/categories to 'abs'.

    * ``name`` -> ``'abs'`` where missing.
    * ``item_description`` -> ``'No description yet'`` where missing. This is
      the exact literal the feature-extraction step compares against when it
      builds ``HasDescription`` (the previous ``'No Description Yet'`` spelling
      never matched that check).
    * ``brand_name`` / ``category_name``: missing values become ``'abs'``; in
      the rows after ``tr_len`` every value that does not occur in the first
      ``tr_len`` rows is also replaced by ``'abs'``, so the test split never
      carries a label the train split has not seen.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``name``, ``item_description``, ``brand_name`` and
        ``category_name`` columns. It is NOT modified; a new frame is returned.
    tr_len : int
        Number of leading rows that make up the training split.
    clock : float
        ``time.time()`` reference used for the elapsed-time message.

    Returns
    -------
    pandas.DataFrame
        Imputed copy of ``df`` with a fresh 0..n-1 index.
    """
    # Explicit copy + column assignment instead of in-place fillna on columns of
    # iloc slices: the in-place form only works through view semantics and is a
    # silent no-op under pandas Copy-on-Write.
    out = df.copy()
    out['name'] = out['name'].fillna('abs')
    out['item_description'] = out['item_description'].fillna('No description yet')

    # Positional train/test mask (the index may not be 0..n-1 on entry).
    is_train = np.arange(len(out)) < tr_len

    for col in ('brand_name', 'category_name'):
        filled = out[col].fillna('abs')
        # Vocabulary comes from the train rows only, to avoid leaking test labels.
        known = filled[is_train].unique()
        unseen_in_test = ~is_train & ~filled.isin(known).to_numpy()
        out[col] = filled.mask(unseen_in_test, 'abs')

    # print function completion time along with the function name
    print(f'[{round((time.time() - clock),2)}] {inspect.stack()[0][3]} completed')
    gc.collect()
    return out.reset_index(drop=True)

In [8]:
# category_name is either NaN or a '/'-separated path with three or more levels (up to 5).
# Rows with more or fewer than 3 levels are less than 0.1 percent of the data,
# so only the first three levels are kept, as three new columns.
def sub_cat(row):
      """Split a '/'-separated category path into exactly three levels.

      Levels beyond the third are discarded; missing levels are padded with
      'abs'. Anything that is not a string (e.g. NaN) yields three 'abs'.
      """
      if not isinstance(row, str):
          return 'abs', 'abs', 'abs'
      levels = row.split('/')[:3]
      # pad short paths on the right so the result always has three entries
      levels.extend(['abs'] * (3 - len(levels)))
      return tuple(levels)

# extracting extra features from data
def Extract_features(df,tr_len,clock):
    """Add hand-crafted categorical, count/ratio and mean-price features.

    Parameters
    ----------
    df : pandas.DataFrame
        Imputed listings. Must contain ``name``, ``item_description``,
        ``brand_name``, ``category_name``, ``item_condition_id``, ``shipping``
        and ``price``. An ``index`` column, if present, is dropped. The early
        steps add columns to this frame in place; use the returned frame.
    tr_len : int
        Number of leading rows that form the training split. Mean-price
        statistics are computed from these rows only.
    clock : float
        ``time.time()`` reference for the progress stamps.

    Returns
    -------
    pandas.DataFrame
        Same rows and index as ``df``, without ``category_name``, ``price`` and
        ``index``, plus:

        * ``sc1``/``sc2``/``sc3`` - first three category levels (category dtype)
        * ``HasDescription``/``HasPrice``/``HasBrand`` - 0/1 flags (category dtype)
        * token, lower/upper-case, punctuation, digit and non-alphanumeric
          counts and ratios for name and description, min-max scaled to [0, 1]
          over all rows
        * ``MeanPrice_<col>`` for sc1, sc2, sc3 and brand_name - the train-split
          mean price of the row's group, scaled by the largest train group mean
          and multiplied by N(1, 0.1) noise (float32).

    Notes
    -----
    The noise is drawn from the global NumPy RNG, so results differ between
    runs unless the caller seeds ``np.random`` first.
    """
    def safe_ratio(numerator, denominator):
        # A zero denominator (text with no tokens) would give inf/NaN, and a
        # single inf destroys the min-max scaling of the whole column.
        return (numerator / denominator.where(denominator > 0)).fillna(0).astype('float32')

    # regex used later in this section to count matches in the text columns.
    # Raw string: same pattern text, but the backslashes no longer depend on
    # Python passing unknown escape sequences through unchanged.
    RE_PUNCTUATION = '|'.join([re.escape(x) for x in string.punctuation])
    non_alphanumpunct = re.compile(r"""[^A-Za-z0-9\.?!,; \(\)\[\]'"\$]+""")

    # extracting sub categories
    print(f'[{round(time.time()-clock)}]Extracting Subcat')
    df['sc1'], df['sc2'],df['sc3'] =  zip(*df['category_name'].apply(sub_cat))
    df.drop(columns='category_name',inplace = True)
    for level in ('sc1', 'sc2', 'sc3'):
        df[level] = df[level].astype('category')

    name = df['name']
    desc = df['item_description']

    # has description or not / missing value added as a feature.
    # Compared case-insensitively so that every spelling of the placeholder
    # ('No description yet', 'No Description Yet') counts as "no description".
    print(f'[{round(time.time()-clock)}]Extracting HasDescription ')
    no_description = desc.str.lower() == 'no description yet'
    df['HasDescription'] = (~no_description).astype(int).astype('category')

    # has price or not: a literal '[rm]' in a text column marks a price that
    # was removed from the listing text
    print(f'[{round(time.time()-clock)}]Extracting HasPrice ')
    price_removed = (desc.str.contains('[rm]', regex=False, na=False)
                     | name.str.contains('[rm]', regex=False, na=False))
    df['HasPrice'] = price_removed.astype(int).astype('category')

    gc.collect()
    # counting number of whitespace-separated tokens in the text columns
    print(f'[{round(time.time()-clock)}]Extracting Token Count ')
    df['NameTokenCount'] = name.str.split().str.len().astype('uint32')
    df['DescTokenCount'] = desc.str.split().str.len().astype('uint32')

    # ratio of token counts in the name and description columns
    print(f'[{round(time.time()-clock)}]Extracting Name to Desc token Ratio ')
    df['NameDescTokenRatio'] = safe_ratio(df['NameTokenCount'], df['DescTokenCount'])

    # adding missing value as a feature for brand
    print(f'[{round(time.time()-clock)}]Extracting HasBrand ')
    df['HasBrand'] = (df['brand_name'] != 'abs').astype(int).astype('category')

    # lower / upper case character counts
    print(f'[{round(time.time()-clock)}]Extracting Lower count ')
    df['NameLowerCount'] = name.str.count('[a-z]').astype('uint32')
    df['DescriptionLowerCount'] = desc.str.count('[a-z]').astype('uint32')

    print(f'[{round(time.time()-clock)}]Extracting Upper count ')
    df['NameUpperCount'] = name.str.count('[A-Z]').astype('uint32')
    df['DescriptionUpperCount'] = desc.str.count('[A-Z]').astype('uint32')

    # punctuation count
    print(f'[{round(time.time()-clock)}]Extracting Punctuation Count ')
    df['NamePunctCount'] = name.str.count(RE_PUNCTUATION).astype('uint32')
    df['DescriptionPunctCount'] = desc.str.count(RE_PUNCTUATION).astype('uint32')

    # punct count ratio
    print(f'[{round(time.time()-clock)}]Extracting Punctuation Ratio ')
    df['NamePunctCountRatio'] = safe_ratio(df['NamePunctCount'], df['NameTokenCount'])
    df['DescriptionPunctCountRatio'] = safe_ratio(df['DescriptionPunctCount'], df['DescTokenCount'])

    # digit count (a hint for bundled items)
    print(f'[{round(time.time()-clock)}]Extracting Digit count ')
    df['NameDigitCount'] = name.str.count('[0-9]').astype('uint32')
    df['DescriptionDigitCount'] = desc.str.count('[0-9]').astype('uint32')

    # emoji and/or other non-alphanumeric count
    print(f'[{round(time.time()-clock)}]Extracting NonAlphaNum count ')
    df['NonAlphaDescCount'] = desc.str.count(non_alphanumpunct).astype('uint32')
    df['NonAlphaNameCount'] = name.str.count(non_alphanumpunct).astype('uint32')

    non_num_col = {'name', 'item_condition_id', 'brand_name',
                  'shipping', 'item_description', 'sc1',
                  'sc2', 'sc3','HasDescription','HasPrice','HasBrand',
                   'price','index'
                  }
    # Lists built from df.columns (not from set arithmetic) keep the column
    # order identical from run to run.
    passthrough_cols = [c for c in df.columns if c in non_num_col]
    cols_to_normalize = [c for c in df.columns if c not in non_num_col]

    # min-max scaling of all the counts and ratios
    print(f'[{round(time.time()-clock)}]Normalizing')
    numeric = df[cols_to_normalize].astype('float64')
    lowest = numeric.min()
    span = (numeric.max() - lowest).replace(0, 1)   # constant column -> 0, not NaN
    numeric = (numeric - lowest) / span

    df = pd.concat([df[passthrough_cols], numeric], axis=1)
    df = df.drop(columns='index', errors='ignore')

    del numeric, name, desc
    gc.collect()

    # Mean price per category / brand with multiplicative random noise, which
    # makes the encoding less prone to memorising the target.
    print(f'[{round(time.time()-clock)}]Extracting Mean price Categories')

    train_rows = df.iloc[:tr_len]
    for feat in ['sc1','sc2','sc3', 'brand_name']:
        # Both the group means and the scaling maximum come from the train rows
        # only, so no test-split price influences the feature.
        group_mean = train_rows.groupby(feat, observed=True)['price'].mean()
        mean_dc = (group_mean / group_mean.max()).to_dict()
        # Groups absent from the train rows fall back to the average group value.
        fallback = np.nanmean(list(mean_dc.values())) if mean_dc else 0.0
        encoded = df[feat].astype(object).map(mean_dc).astype('float64').fillna(fallback)
        noise = np.random.normal(1, 0.1, size=len(df))
        df['MeanPrice_'+feat] = (encoded * noise).astype(np.float32)

    df = df.drop(columns='price')

    print(f'[{round((time.time() - clock),2)}] {inspect.stack()[0][3]} completed')

    gc.collect()
    return df

In [9]:
def Make_text_column(df,clock):
    """Build a cleaned ``text`` column from name, brand and description.

    EDA showed that many missing brands can be recovered from the name or the
    description, so instead of imputing brands the three fields are merged
    into one text column and left to the text vectorizer.

    Cleaning applied to every merged string, in order:

    1. expand ``won't`` / ``can't`` / ``n't`` / ``'t`` contractions
       (case-sensitive);
    2. put spaces around digit runs, so counts such as ``2pairs`` become
       separate tokens;
    3. replace every run of characters other than A-Z, a-z, 0-9 and space
       with a space;
    4. collapse repeated whitespace.

    Lower-casing and lemmatisation are deliberately not done here.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``name``, ``brand_name`` and ``item_description``.
        It is not modified.
    clock : float
        ``time.time()`` reference for the elapsed-time message.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``item_description`` and with ``text`` as last column.
    """
    merged = (df['name'].astype(str) + ' '
              + df['brand_name'].str.strip().astype(str) + ' '
              + df['item_description'].str.strip().astype(str))
    df = df.drop(columns=['item_description'])

    # Compiled once instead of once per row; raw strings avoid invalid-escape
    # warnings for patterns such as \d.
    contractions = [
        (re.compile(r"won't"), "will not"),
        (re.compile(r"can't"), "can not"),
        (re.compile(r"n't"), " not"),
        (re.compile(r"'t"), " not"),
    ]
    digit_runs = re.compile(r'(\d+)')
    non_alphanums = re.compile(r'[^A-Za-z0-9 ]+')

    def clean_sentence(sentence):
        for pattern, replacement in contractions:
            sentence = pattern.sub(replacement, sentence)
        # separating digits for a sense of count if bundled items are sold
        sentence = ' '.join(digit_runs.split(sentence))
        sentence = non_alphanums.sub(' ', sentence)
        return ' '.join(sentence.split())

    print('Cleaning text')
    # Iterating the Series directly yields its values; Series.iteritems() was
    # removed in pandas 2.0.
    df['text'] = [clean_sentence(sentence) for sentence in tqdm(merged, total=len(merged))]
    print(f'Done')

    print(f'[{round((time.time() - clock),2)}] {inspect.stack()[0][3]} completed')

    del merged
    gc.collect()
    return df

In [10]:

def high_categorical(df,ts,col='brand_name'):
    """Frequency-encode a high-cardinality column, fitted on ``df`` only.

    Every value of ``col`` is replaced by its count in ``df`` divided by the
    count of the most frequent value, multiplied by one N(1, 0.01) noise draw
    per distinct value (all rows sharing a value get the same number). The
    mapping learned from ``df`` is applied unchanged to ``ts``, so nothing
    from ``ts`` influences the encoding.

    Values that are missing, or that occur in ``ts`` but never in ``df``,
    are encoded as 0.0 (frequency zero) rather than NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows; the encoding is learned from this frame.
    ts : pandas.DataFrame
        Rows to encode with the learned mapping.
    col : str, default 'brand_name'
        Column to encode.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_encoded, test_encoded)``, float64 arrays of shape (n, 1).
        (The former ``name: pd.Series.astype('float16') = ...`` annotations
        were never evaluated, so the values have always been float64.)
    """
    counts = df[col].value_counts()
    top_count = counts.max()          # computed once, not once per key
    noise = np.random.normal(1, 0.01, size=len(counts))
    encoding = (counts / top_count * noise).to_dict()

    def encode(frame):
        mapped = frame[col].astype(object).map(encoding).astype('float64')
        return mapped.fillna(0.0).to_numpy().reshape(-1, 1)

    col_cat = encode(df)
    col_cat_ts = encode(ts)
    del counts, encoding
    gc.collect()
    return col_cat, col_cat_ts

'''Convert every column that is not yet numeric into numeric (sparse) form.'''
def Convert_to_predictor(df, tr_len,clock,stopwords=stopwords,high_categorical=high_categorical):
    """Turn the feature frame into one sparse design matrix.

    Column groups of the result, left to right:

    1. brand frequency encoding (1 column, fitted on the first ``tr_len`` rows)
    2. one-hot blocks for item_condition_id, shipping, HasDescription,
       HasPrice, HasBrand, sc1, sc2, sc3 (encoders fitted on all rows)
    3. the 19 numeric count / ratio / mean-price columns
    4. word 1-3-gram counts of ``name`` (vocabulary fitted on train rows)
    5. word 1-3-gram tf-idf of ``text`` (vocabulary fitted on train rows)

    Parameters
    ----------
    df : pandas.DataFrame
        Output of the text-column step. Columns are removed from this frame in
        place as they are consumed (this keeps peak memory down), so it must
        not be reused afterwards.
    tr_len : int
        Number of leading rows that form the training split.
    clock : float
        ``time.time()`` reference for the progress stamps.
    stopwords : object, optional
        Unused; kept so existing calls keep working. Both vectorizers use
        scikit-learn's built-in ``"english"`` stop-word list.
    high_categorical : callable, optional
        ``f(train_frame, test_frame) -> (train_array, test_array)`` used for
        the brand encoding.

    Returns
    -------
    scipy sparse matrix
        Horizontal stack of the five groups above, one row per row of ``df``.
    """
    def elapsed():
        return round((time.time() - clock),2)

    # errors='ignore' replaces a bare try/except that hid every possible error.
    df.drop(columns='index', inplace=True, errors='ignore')

    print(f'[{elapsed()}]Transform data start.')

    # --- one-hot encoded columns -------------------------------------------
    int_coded = ['item_condition_id', 'shipping', 'HasDescription', 'HasPrice', 'HasBrand']
    str_coded = ['sc1', 'sc2', 'sc3']
    one_hot_blocks = []
    for col, cast in [(c, int) for c in int_coded] + [(c, str) for c in str_coded]:
        column = df[col].astype(cast).values.reshape(-1,1)
        one_hot_blocks.append(OneHotEncoder().fit_transform(column))
        print(f'[{elapsed()}]Transform {col}  data completed.')

    df_dummies = scipy.sparse.hstack(one_hot_blocks)
    print(df_dummies.shape)
    del one_hot_blocks

    df.drop(columns=int_coded + str_coded, inplace=True)

    print(f'[{elapsed()}]Transform categories data completed.')

    # --- numeric columns ----------------------------------------------------
    cols = ['NamePunctCount', 'NameDigitCount', 'DescriptionDigitCount',
            'NameUpperCount', 'DescriptionPunctCount', 'DescriptionPunctCountRatio',
            'DescTokenCount', 'DescriptionUpperCount', 'NonAlphaDescCount',
            'NonAlphaNameCount', 'NameTokenCount', 'NameLowerCount',
            'NameDescTokenRatio', 'DescriptionLowerCount', 'NamePunctCountRatio',
            'MeanPrice_sc1','MeanPrice_sc2','MeanPrice_sc3','MeanPrice_brand_name']

    df_num = scipy.sparse.csc_matrix(df[cols].values)
    df.drop(columns=cols, inplace=True)
    print(df_num.shape)

    print(f'[{elapsed()}]Transform numeric  data completed.')
    gc.collect()

    # --- name: n-gram counts ------------------------------------------------
    # pop() removes the column from df and hands back the Series, so nothing
    # is dropped in place on a slice of the frame.
    names = df.pop('name')
    vect = CountVectorizer(ngram_range=(1,3),min_df=5, max_df=0.85,
                         lowercase=True, max_features=50000,
                        analyzer='word', strip_accents = 'ascii',
                        stop_words= "english")
    # vocabulary is learned from the train rows only
    tr_name = vect.fit_transform(names.iloc[:tr_len])
    ts_name = vect.transform(names.iloc[tr_len:])
    df_name = scipy.sparse.vstack((tr_name,ts_name),format='csc')
    del vect, tr_name, ts_name, names
    print(df_name.shape)

    print(f'[{elapsed()}]Transform name data completed.')

    # --- text: n-gram tf-idf ------------------------------------------------
    texts = df.pop('text')
    vect = TfidfVectorizer(ngram_range=(1,3),min_df=5, max_df=0.85,
                         lowercase=True, max_features=100000,
                        analyzer='word', strip_accents = 'ascii', smooth_idf=True,stop_words= "english")
    tr_text = vect.fit_transform(texts.iloc[:tr_len])
    ts_text = vect.transform(texts.iloc[tr_len:])
    df_text = scipy.sparse.vstack((tr_text,ts_text),format='csc')
    del vect, tr_text, ts_text, texts
    print(df_text.shape)

    print(f'[{elapsed()}]Transform text data completed.')

    # --- brand: frequency encoding -------------------------------------------
    tr_brand,ts_brand = high_categorical(df.iloc[:tr_len], df.iloc[tr_len:])
    tr_brand,ts_brand = scipy.sparse.csr_matrix(tr_brand),scipy.sparse.csr_matrix(ts_brand)
    df_brand = scipy.sparse.vstack((tr_brand,ts_brand),format='csc')
    print(df_brand.shape)
    del tr_brand, ts_brand, df

    print(f'[{elapsed()}]Transform brand data completed.')

    df_merge = scipy.sparse.hstack((df_brand,df_dummies, df_num,df_name, df_text ))
    print('Merge all data completed.')

    del df_brand,df_num,df_text,df_name,df_dummies
    print(f'[{elapsed()}] {inspect.stack()[0][3]} complete')

    gc.collect()
    return df_merge

In [11]:
# Shared reference time: every stage prints its stamp as seconds elapsed since this point.
clock =time.time()
# n_rows=-1 reads the complete train.tsv. tr_len is the 80% train boundary that
# the later stages use to keep fitted statistics train-only; whole_tr is the
# total row count (not used again in the cells shown here).
df,y, tr_len,whole_tr= Data(clock,n_rows = -1)
gc.collect()
# Fill missing name/description/brand/category values.
df = Impute(df,tr_len,clock)
gc.collect()
# Add sub-category, flag, count/ratio and mean-price features.
df = Extract_features(df,tr_len,clock)
gc.collect()
# Merge name + brand + description into one cleaned 'text' column.
df = Make_text_column(df,clock)
gc.collect()
# From here on `df` is a sparse matrix, no longer a DataFrame.
df = Convert_to_predictor(df,tr_len,clock)
# Last expression of the cell, so its return value is what the cell displays.
gc.collect()

[9.99] Data completed
[11.3] Impute completed
[12]Extracting Subcat
[18]Extracting HasDescription 
[18]Extracting HasPrice 
[20]Extracting Token Count 
[38]Extracting Name to Desc token Ratio 
[38]Extracting HasBrand 
[38]Extracting Lower count 
[71]Extracting Upper count 
[79]Extracting Punctuation Count 
[85]Extracting Punctuation Ratio 
[85]Extracting Digit count 
[90]Extracting NonAlphaNum count 
[98]Normalizing
[99]Extracting Mean price Categories
[104.97] Extract_features completed
Cleaning text


  0%|          | 0/1481661 [00:00<?, ?it/s]

Done
[143.9] Make_text_column completed
[144.22]Transform data start.
[144.45]Transform item_condition_id  data completed.
[144.56]Transform shipping  data completed.
[144.67]Transform HasDescription  data completed.
[144.77]Transform HasPrice  data completed.
[144.89]Transform HasBrand  data completed.
[145.76]Transform sc1  data completed.
[146.72]Transform sc2  data completed.
[147.89]Transform sc3  data completed.
(1481661, 1002)
[148.65]Transform categories data completed.
(1481661, 19)
[150.32]Transform numeric  data completed.
(1481661, 50000)
[185.7]Transform name data completed.
(1481661, 100000)
[406.39]Transform text data completed.
(1481661, 1)
[407.12]Transform brand data completed.
Merge all data completed.
[409.16] Convert_to_predictor complete


0

In [12]:
# Sanity check of the merged matrix: rows x (1 brand + 1002 one-hot + 19 numeric
# + 50000 name n-gram + 100000 text tf-idf) columns.
df.shape

(1481661, 151022)

In [13]:
# Persist the merged sparse feature matrix. Despite the file name it holds every
# feature group built by Convert_to_predictor, not only the tf-idf block.
with open('tfidf.pickle','wb') as f:
  pickle.dump(df,f)

In [14]:
# Persist the target returned by Data(): log1p(price), one value per row of the
# feature matrix saved above.
with open('price_log.pickle','wb') as f:
  pickle.dump(y,f)