In [1]:
#Initially forked from Bojan's kernel here: https://www.kaggle.com/tunguz/bow-meta-text-and-dense-features-lb-0-2242/code
#improvement using kernel from Nick Brook's kernel here: https://www.kaggle.com/nicapotato/bow-meta-text-and-dense-features-lgbm
#Used oof method from Faron's kernel here: https://www.kaggle.com/mmueller/stacking-starter?scriptVersionId=390867
#Used some text cleaning method from Muhammad Alfiansyah's kernel here: https://www.kaggle.com/muhammadalfiansyah/push-the-lgbm-v19
#Forked From - https://www.kaggle.com/him4318/avito-lightgbm-with-ridge-feature-v-2-0

import time
notebookstart= time.time()

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
import random
random.seed(2018)
print("Data:\n",os.listdir("input"))

# Models Packages
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Gradient Boosting
import lightgbm as lgb
from sklearn.linear_model import Ridge
from sklearn.cross_validation import KFold

# Tf-Idf
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from scipy.sparse import hstack, csr_matrix
from nltk.corpus import stopwords 

# Viz
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string


Data:
 ['image_pred', 'image_quality', 'periods_test.csv', 'periods_train.csv', 'ridge.csv', 'sample_submission.csv', 'svd_all.csv', 'test.csv', 'testimagefeature.csv', 'test_feature.csv', 'test_feature_mean_sd.csv', 'train.csv', 'trainimagefeature.csv', 'train_feature.csv', 'train_feature_mean_sd.csv']




In [2]:
NFOLDS = 5
SEED = 2018
VALID = False
class SklearnWrapper(object):
    def __init__(self, clf, seed=0, params=None, seed_bool = True):
        if(seed_bool == True):
            params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)

In [3]:
def get_oof(clf, x_train, y, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        print('\nFold {}'.format(i))
        x_tr = x_train[train_index]
        y_tr = y[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
    
def cleanName(text):
    try:
        textProc = text.lower()
        # textProc = " ".join(map(str.strip, re.split('(\d+)',textProc)))
        #regex = re.compile(u'[^[:alpha:]]')
        #textProc = regex.sub(" ", textProc)
        textProc = re.sub('[!@#$_“”¨«»®´·º½¾¿¡§£₤‘’]', '', textProc)
        textProc = " ".join(textProc.split())
        return textProc
    except: 
        return "name error"
    
    
def rmse(y, y0):
    assert len(y) == len(y0)
    return np.sqrt(np.mean(np.power((y - y0), 2)))

print("\nData Load Stage")
training = pd.read_csv('input/train.csv',index_col = "item_id", parse_dates = ["activation_date"])
traindex = training.index
training['item_id'] = traindex
testing = pd.read_csv('input/test.csv',index_col = "item_id" , parse_dates = ["activation_date"])
testdex = testing.index
testing['item_id'] = testdex

ntrain = training.shape[0]
ntest = testing.shape[0]

kf = KFold(ntrain, n_folds=NFOLDS, shuffle=True, random_state=SEED)

y = training.deal_probability.copy()
training.drop("deal_probability",axis=1, inplace=True)
print('Train shape: {} Rows, {} Columns'.format(*training.shape))
print('Test shape: {} Rows, {} Columns'.format(*testing.shape))

print("Combine Train and Test")
df = pd.concat([training,testing],axis=0)
del training, testing
gc.collect()
print('\nAll Data shape: {} Rows, {} Columns'.format(*df.shape))


print("Feature Engineering")
df["price"] = np.log(df["price"]+0.001)
df["price"].fillna(df.price.mean(),inplace=True)
df["image_top_1"].fillna(-999,inplace=True)

print("\nCreate Time Variables")
df["Weekday"] = df['activation_date'].dt.weekday
#df["Weekd of Year"] = df['activation_date'].dt.week
#df["Day of Month"] = df['activation_date'].dt.day

# Create Validation Index and Remove Dead Variables
training_index = df.loc[df.activation_date<=pd.to_datetime('2017-04-07')].index
validation_index = df.loc[df.activation_date>=pd.to_datetime('2017-04-08')].index
df.drop(["activation_date"],axis=1,inplace=True)

print("\nEncode Variables")
categorical = ["user_id","region","city","parent_category_name","category_name","user_type","image_top_1","param_1","param_2","param_3"]
print("Encoding :",categorical)

# Encoder:
lbl = preprocessing.LabelEncoder()
for col in categorical:
    df[col].fillna('Unknown')
    df[col] = lbl.fit_transform(df[col].astype(str))
    
print("\nText Features")

# Feature Engineering 

# Meta Text Features
textfeats = ["description", "title"]
df['desc_punc'] = df['description'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))

df['title'] = df['title'].apply(lambda x: cleanName(x))
df["description"]   = df["description"].apply(lambda x: cleanName(x))

for cols in textfeats:
    df[cols] = df[cols].astype(str) 
    df[cols] = df[cols].astype(str).fillna('missing') # FILL NA
    df[cols] = df[cols].str.lower() # Lowercase all text, so that capitalized words dont get treated differently
    df[cols + '_num_words'] = df[cols].apply(lambda comment: len(comment.split())) # Count number of Words
    df[cols + '_num_unique_words'] = df[cols].apply(lambda comment: len(set(w for w in comment.split())))
    df[cols + '_words_vs_unique'] = df[cols+'_num_unique_words'] / df[cols+'_num_words'] * 100 # Count Unique Words
    df[cols + '_num_letters'] = df[cols].apply(lambda comment: len(comment)) # Count number of Letters
    df[cols + '_num_alphabets'] = df[cols].apply(lambda comment: (comment.count(r'[a-zA-Z]'))) # Count number of Alphabets
    df[cols + '_num_alphanumeric'] = df[cols].apply(lambda comment: (comment.count(r'[A-Za-z0-9]'))) # Count number of AlphaNumeric
    df[cols + '_num_digits'] = df[cols].apply(lambda comment: (comment.count('[0-9]'))) # Count number of Digits
    
# Extra Feature Engineering
df['title_desc_len_ratio'] = df['title_num_letters']/df['description_num_letters']
df.image = df.image+".jpg"



Data Load Stage
Train shape: 1503424 Rows, 17 Columns
Test shape: 508438 Rows, 17 Columns
Combine Train and Test

All Data shape: 2011862 Rows, 17 Columns
Feature Engineering

Create Time Variables

Encode Variables
Encoding : ['user_id', 'region', 'city', 'parent_category_name', 'category_name', 'user_type', 'image_top_1', 'param_1', 'param_2', 'param_3']

Text Features


In [4]:
df.head()

Unnamed: 0_level_0,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,description,...,description_num_alphanumeric,description_num_digits,title_num_words,title_num_unique_words,title_words_vs_unique,title_num_letters,title_num_alphabets,title_num_alphanumeric,title_num_digits,title_desc_len_ratio
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
b912c3c6a6ad,884270,19,462,4,42,249,112,1217,кокоби(кокон для сна),"кокон для сна малыша,пользовались меньше месяц...",...,0,0,3,3,100.0,21,0,0,0,0.362069
2dac0150717d,227908,17,1314,2,22,122,112,1217,стойка для одежды,"стойка для одежды, под вешалки. с бутика.",...,0,0,3,3,100.0,17,0,0,0,0.414634
ba83aefab5dc,576261,16,1290,0,2,84,112,1217,philips bluray,"в хорошем состоянии, домашний кинотеатр с blu ...",...,0,0,2,2,100.0,14,0,0,0,0.141414
02996f1dd2ea,755087,21,950,4,42,38,112,1217,автокресло,продам кресло от0-25кг,...,0,0,1,1,100.0,10,0,0,0,0.454545
7c90be56d2ab,944363,4,318,6,0,278,124,46,"ваз 2110, 2003",все вопросы по телефону.,...,0,0,3,3,100.0,14,0,0,0,0.583333


# Input Image Quality Feature

In [5]:
image_quality_1 = pd.read_csv("input/image_quality/features/_.csv")
image_quality_2 = pd.read_csv("input/image_quality/features/test.csv")
image_quality_3 = pd.read_csv("input/image_quality/features/train-0.csv")
image_quality_4 = pd.read_csv("input/image_quality/features/train-1.csv")
image_quality_5 = pd.read_csv("input/image_quality/features/train-2.csv")
image_quality_6 = pd.read_csv("input/image_quality/features/train-3.csv")
image_quality_7 = pd.read_csv("input/image_quality/features/train-4.csv")
image_quality_all = pd.concat([image_quality_1,image_quality_2,image_quality_3,image_quality_4,image_quality_5,
                              image_quality_6,image_quality_7])
print('\nAll Data shape: {} Rows, {} Columns'.format(*image_quality_all.shape))
image_quality_all = image_quality_all.drop_duplicates(subset=['image'])
image_quality_all.head()


All Data shape: 1856724 Rows, 14 Columns


Unnamed: 0,image,dullness,whiteness,average_pixel_width,dominant_red,dominant_green,dominant_blue,average_red,average_green,average_blue,image_size,width,height,blurrness
0,14fc31e727ee3bb3f5caa8ab08a5a0228eb2165bf37191...,0.0,0.0,1.236615,0.192157,0.462745,0.803922,0.272582,0.485153,0.752708,17367,358,480,142.531671
1,14fdd39ca31b1bf973e9399c33e298447ad660c362066c...,1.41,5.135,1.484954,0.882353,0.890196,0.901961,0.62205,0.614842,0.610944,17558,360,480,202.873965
2,14f0a45554d235941804f9b3a296b239c956bcc6a604e9...,20.945,0.0,3.634838,0.098039,0.109804,0.098039,0.253908,0.305005,0.306083,42891,480,360,650.484486
3,14f74402cb0e2cb3f7bd1b09904c94e3e7bb97fa86112b...,0.0,45.4,1.889468,0.113725,0.109804,0.133333,0.374054,0.390081,0.426311,41321,360,480,284.560724
4,14fc0f4b3ef7a8b2989112956fe40a33bc7eb8f04ee27d...,0.0,49.915,1.972147,0.992157,0.992157,0.992157,0.511805,0.522933,0.518476,10292,371,360,279.432978


In [6]:
print('\nAll Data shape: {} Rows, {} Columns'.format(*image_quality_all.shape))


All Data shape: 1856661 Rows, 14 Columns


In [7]:
len(pd.unique(image_quality_all.image))

1856661

In [8]:
new_df = pd.merge(df, image_quality_all, how='left', on=['image']).set_index(df.index)

In [9]:
new_df.head()

Unnamed: 0_level_0,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,description,...,dominant_red,dominant_green,dominant_blue,average_red,average_green,average_blue,image_size,width,height,blurrness
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
b912c3c6a6ad,884270,19,462,4,42,249,112,1217,кокоби(кокон для сна),"кокон для сна малыша,пользовались меньше месяц...",...,0.086275,0.039216,0.47451,0.36097,0.320288,0.61285,27039.0,358.0,480.0,398.109961
2dac0150717d,227908,17,1314,2,22,122,112,1217,стойка для одежды,"стойка для одежды, под вешалки. с бутика.",...,0.117647,0.152941,0.188235,0.297393,0.366584,0.422223,30385.0,360.0,480.0,1014.604608
ba83aefab5dc,576261,16,1290,0,2,84,112,1217,philips bluray,"в хорошем состоянии, домашний кинотеатр с blu ...",...,0.992157,0.992157,0.992157,0.703339,0.703571,0.703575,18681.0,392.0,360.0,493.921065
02996f1dd2ea,755087,21,950,4,42,38,112,1217,автокресло,продам кресло от0-25кг,...,0.992157,0.992157,0.992157,0.851711,0.846459,0.846097,13656.0,360.0,360.0,377.105718
7c90be56d2ab,944363,4,318,6,0,278,124,46,"ваз 2110, 2003",все вопросы по телефону.,...,0.537255,0.615686,0.580392,0.449326,0.512986,0.485608,36710.0,640.0,360.0,557.35108


In [10]:
print('\nAll Data shape: {} Rows, {} Columns'.format(*new_df.shape))


All Data shape: 2011862 Rows, 46 Columns


In [11]:
new_df.columns[33]

'dullness'

# Deal Image quality NA

In [12]:
#33,45 column
for i in range(33,46):
    #print(i)
    new_df[new_df.columns[i]].fillna(-1, inplace=True)

In [13]:
new_df.drop("image",axis=1, inplace=True)
new_df.columns

Index(['user_id', 'region', 'city', 'parent_category_name', 'category_name',
       'param_1', 'param_2', 'param_3', 'title', 'description', 'price',
       'item_seq_number', 'user_type', 'image_top_1', 'item_id', 'Weekday',
       'desc_punc', 'description_num_words', 'description_num_unique_words',
       'description_words_vs_unique', 'description_num_letters',
       'description_num_alphabets', 'description_num_alphanumeric',
       'description_num_digits', 'title_num_words', 'title_num_unique_words',
       'title_words_vs_unique', 'title_num_letters', 'title_num_alphabets',
       'title_num_alphanumeric', 'title_num_digits', 'title_desc_len_ratio',
       'dullness', 'whiteness', 'average_pixel_width', 'dominant_red',
       'dominant_green', 'dominant_blue', 'average_red', 'average_green',
       'average_blue', 'image_size', 'width', 'height', 'blurrness'],
      dtype='object')

In [14]:
new_df.isnull().any()

user_id                         False
region                          False
city                            False
parent_category_name            False
category_name                   False
param_1                         False
param_2                         False
param_3                         False
title                           False
description                     False
price                           False
item_seq_number                 False
user_type                       False
image_top_1                     False
item_id                         False
Weekday                         False
desc_punc                       False
description_num_words           False
description_num_unique_words    False
description_words_vs_unique      True
description_num_letters         False
description_num_alphabets       False
description_num_alphanumeric    False
description_num_digits          False
title_num_words                 False
title_num_unique_words          False
title_words_

# SVD & Stat Feature

In [19]:
svd_all = pd.read_csv("input/svd_all.csv")
#svd_all = pd.concat([svd_all[svd_all.columns[1]],svd_all[svd_all.columns[range(17,36)]]],axis=1)

In [20]:
svd_all.head()
print('\nAll Data shape: {} Rows, {} Columns'.format(*svd_all.shape))


All Data shape: 2011862 Rows, 36 Columns


In [21]:
svd_all = svd_all[['item_id','category_name_price_median', 'category_name_deal_probability_median',
       'image_top_1_price_median', 'image_top_1_deal_probability_median',
        'category_name_price_mean',
       'category_name_deal_probability_mean', 'image_top_1_price_mean',
       'image_top_1_deal_probability_mean', 'category_name_price_std',
       'category_name_deal_probability_std', 'image_top_1_price_std',
       'image_top_1_deal_probability_std', 'category_price_skewness',
       'category_deal_prob_skewness', 'category_price_kurtosis',
       'category_deal_prob_kurtosis', 'image_top_1_price_skewness',
       'image_top_1_deal_prob_skewness', 'image_top_1_price_kurtosis',
       'image_top_1_deal_prob_kurtosis']]
svd_all.head()

Unnamed: 0,item_id,category_name_price_median,category_name_deal_probability_median,image_top_1_price_median,image_top_1_deal_probability_median,category_name_price_mean,category_name_deal_probability_mean,image_top_1_price_mean,image_top_1_deal_probability_mean,category_name_price_std,...,image_top_1_price_std,image_top_1_deal_probability_std,category_price_skewness,category_deal_prob_skewness,category_price_kurtosis,category_deal_prob_kurtosis,image_top_1_price_skewness,image_top_1_deal_prob_skewness,image_top_1_price_kurtosis,image_top_1_deal_prob_kurtosis
0,b912c3c6a6ad,-0.030557,0.0,-0.10025,0.0,-0.090534,0.198445,-0.131348,0.089599,0.511131,...,0.470083,0.220444,184.591529,1.246062,42148.342164,-0.286112,22.63408,2.652281,513.446586,5.669475
1,2dac0150717d,0.164431,0.0,0.275801,0.0,0.051252,0.191848,0.30143,0.15934,0.644374,...,0.756532,0.292832,171.904482,1.444086,31356.282854,0.364049,9.197589,1.635184,100.34428,0.939342
2,ba83aefab5dc,0.059309,0.0,0.312685,0.0,-0.011544,0.171572,0.277063,0.201906,0.618374,...,0.66014,0.297891,125.57018,1.424314,18136.60842,0.352266,10.826907,1.206054,134.817678,-0.209372
3,02996f1dd2ea,-0.030557,0.0,0.059309,0.11151,-0.090534,0.198445,0.082795,0.336084,0.511131,...,0.296475,0.371045,184.591529,1.246062,42148.342164,-0.286112,12.631593,0.385469,160.81431,-1.748527
4,7c90be56d2ab,1.542222,0.15342,1.41887,0.17209,1.448634,0.278427,1.256043,0.291826,0.633206,...,0.804934,0.300635,5.933887,0.60259,61.987845,-1.298523,9.487797,0.418632,250.672182,-1.419303


In [22]:
new_df = pd.merge(new_df,svd_all,how="left",on=["item_id"]).set_index(new_df.index)
print('\nAll Data shape: {} Rows, {} Columns'.format(*new_df.shape))
new_df.isnull().any()

Defaulting to column, but this will raise an ambiguity error in a future version
  exec(code_obj, self.user_global_ns, self.user_ns)



All Data shape: 2011862 Rows, 65 Columns


user_id                                  False
region                                   False
city                                     False
parent_category_name                     False
category_name                            False
param_1                                  False
param_2                                  False
param_3                                  False
title                                    False
description                              False
price                                    False
item_seq_number                          False
user_type                                False
image_top_1                              False
item_id                                  False
Weekday                                  False
desc_punc                                False
description_num_words                    False
description_num_unique_words             False
description_words_vs_unique               True
description_num_letters                  False
description_n

In [23]:
new_df.head()

Unnamed: 0_level_0,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,description,...,image_top_1_price_std,image_top_1_deal_probability_std,category_price_skewness,category_deal_prob_skewness,category_price_kurtosis,category_deal_prob_kurtosis,image_top_1_price_skewness,image_top_1_deal_prob_skewness,image_top_1_price_kurtosis,image_top_1_deal_prob_kurtosis
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
b912c3c6a6ad,884270,19,462,4,42,249,112,1217,кокоби(кокон для сна),"кокон для сна малыша,пользовались меньше месяц...",...,0.470083,0.220444,184.591529,1.246062,42148.342164,-0.286112,22.63408,2.652281,513.446586,5.669475
2dac0150717d,227908,17,1314,2,22,122,112,1217,стойка для одежды,"стойка для одежды, под вешалки. с бутика.",...,0.756532,0.292832,171.904482,1.444086,31356.282854,0.364049,9.197589,1.635184,100.34428,0.939342
ba83aefab5dc,576261,16,1290,0,2,84,112,1217,philips bluray,"в хорошем состоянии, домашний кинотеатр с blu ...",...,0.66014,0.297891,125.57018,1.424314,18136.60842,0.352266,10.826907,1.206054,134.817678,-0.209372
02996f1dd2ea,755087,21,950,4,42,38,112,1217,автокресло,продам кресло от0-25кг,...,0.296475,0.371045,184.591529,1.246062,42148.342164,-0.286112,12.631593,0.385469,160.81431,-1.748527
7c90be56d2ab,944363,4,318,6,0,278,124,46,"ваз 2110, 2003",все вопросы по телефону.,...,0.804934,0.300635,5.933887,0.60259,61.987845,-1.298523,9.487797,0.418632,250.672182,-1.419303


In [19]:
new_df[['category_name_price_median', 'category_name_deal_probability_median',
       'image_top_1_price_median', 'image_top_1_deal_probability_median',
        'category_name_price_mean',
       'category_name_deal_probability_mean', 'image_top_1_price_mean',
       'image_top_1_deal_probability_mean', 'category_name_price_std',
       'category_name_deal_probability_std', 'image_top_1_price_std',
       'image_top_1_deal_probability_std', 'category_price_skewness',
       'category_deal_prob_skewness', 'category_price_kurtosis',
       'category_deal_prob_kurtosis', 'image_top_1_price_skewness',
       'image_top_1_deal_prob_skewness', 'image_top_1_price_kurtosis',
       'image_top_1_deal_prob_kurtosis']]

Unnamed: 0_level_0,category_name_price_median,category_name_deal_probability_median,image_top_1_price_median,image_top_1_deal_probability_median,category_name_price_mean,category_name_deal_probability_mean,image_top_1_price_mean,image_top_1_deal_probability_mean,category_name_price_std,category_name_deal_probability_std,image_top_1_price_std,image_top_1_deal_probability_std,category_price_skewness,category_deal_prob_skewness,category_price_kurtosis,category_deal_prob_kurtosis,image_top_1_price_skewness,image_top_1_deal_prob_skewness,image_top_1_price_kurtosis,image_top_1_deal_prob_kurtosis
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
b912c3c6a6ad,-0.030557,0.00000,-0.100250,0.00000,-0.090534,0.198445,-0.131348,0.089599,0.511131,0.320879,0.470083,0.220444,184.591529,1.246062,42148.342164,-0.286112,22.634080,2.652281,513.446586,5.669475
2dac0150717d,0.164431,0.00000,0.275801,0.00000,0.051252,0.191848,0.301430,0.159340,0.644374,0.320883,0.756532,0.292832,171.904482,1.444086,31356.282854,0.364049,9.197589,1.635184,100.344280,0.939342
ba83aefab5dc,0.059309,0.00000,0.312685,0.00000,-0.011544,0.171572,0.277063,0.201906,0.618374,0.282541,0.660140,0.297891,125.570180,1.424314,18136.608420,0.352266,10.826907,1.206054,134.817678,-0.209372
02996f1dd2ea,-0.030557,0.00000,0.059309,0.11151,-0.090534,0.198445,0.082795,0.336084,0.511131,0.320879,0.296475,0.371045,184.591529,1.246062,42148.342164,-0.286112,12.631593,0.385469,160.814310,-1.748527
7c90be56d2ab,1.542222,0.15342,1.418870,0.17209,1.448634,0.278427,1.256043,0.291826,0.633206,0.297765,0.804934,0.300635,5.933887,0.602590,61.987845,-1.298523,9.487797,0.418632,250.672182,-1.419303
51e0962387f7,-0.030557,0.00000,0.059309,0.11151,-0.090534,0.198445,0.082795,0.336084,0.511131,0.320879,0.296475,0.371045,184.591529,1.246062,42148.342164,-0.286112,12.631593,0.385469,160.814310,-1.748527
c4f260a2b48a,-0.030557,0.00000,0.058526,0.00000,-0.225639,0.146211,0.018319,0.159649,0.903055,0.274868,0.317981,0.301083,153.638708,1.898789,27319.289176,2.176619,3.850791,1.448866,18.233020,0.223076
6b71309d6a8a,-0.207926,0.00000,-0.373521,0.00000,-0.214467,0.046447,-0.364262,0.039809,0.438881,0.155944,0.314883,0.147534,282.319121,4.109035,87195.464750,16.565773,9.802475,4.460623,99.314377,19.621860
c5b969cb63a2,-0.207926,0.00000,-0.406383,0.00000,-0.214467,0.046447,-0.434304,0.024007,0.438881,0.155944,0.343375,0.111054,282.319121,4.109035,87195.464750,16.565773,8.473389,5.977580,69.943695,37.548866
b1570962e68c,-0.373521,0.00000,-0.373521,0.00000,-0.422395,0.060834,-0.383654,0.061049,0.365909,0.189677,0.340234,0.189119,192.018228,3.379158,38565.857420,10.123731,64.640806,3.359346,4177.621587,10.029663


In [25]:
stat_feature = ['category_name_price_median', 'category_name_deal_probability_median',
       'image_top_1_price_median', 'image_top_1_deal_probability_median',
        'category_name_price_mean',
       'category_name_deal_probability_mean', 'image_top_1_price_mean',
       'image_top_1_deal_probability_mean', 'category_name_price_std',
       'category_name_deal_probability_std', 'image_top_1_price_std',
       'image_top_1_deal_probability_std', 'category_price_skewness',
       'category_deal_prob_skewness', 'category_price_kurtosis',
       'category_deal_prob_kurtosis', 'image_top_1_price_skewness',
       'image_top_1_deal_prob_skewness', 'image_top_1_price_kurtosis',
       'image_top_1_deal_prob_kurtosis']

In [26]:
for feature in stat_feature:
    new_df[feature].fillna((new_df[feature].mean(skipna=True)), inplace=True)

In [27]:
print('\nAll Data shape: {} Rows, {} Columns'.format(*new_df.shape))
new_df.isnull().any()


All Data shape: 2011862 Rows, 65 Columns


user_id                                  False
region                                   False
city                                     False
parent_category_name                     False
category_name                            False
param_1                                  False
param_2                                  False
param_3                                  False
title                                    False
description                              False
price                                    False
item_seq_number                          False
user_type                                False
image_top_1                              False
item_id                                  False
Weekday                                  False
desc_punc                                False
description_num_words                    False
description_num_unique_words             False
description_words_vs_unique               True
description_num_letters                  False
description_n

# TF-IDF 

In [28]:
print("\n[TF-IDF] Term Frequency Inverse Document Frequency Stage")
russian_stop = set(stopwords.words('russian'))

tfidf_para = {
    "stop_words": russian_stop,
    "analyzer": 'word',
    "token_pattern": r'\w{1,}',
    "sublinear_tf": True,
    "dtype": np.float32,
    "norm": 'l2',
    #"min_df":5,
    #"max_df":.9,
    "smooth_idf":False
}


[TF-IDF] Term Frequency Inverse Document Frequency Stage


In [None]:
def get_col(col_name): return lambda x: x[col_name]
##I added to the max_features of the description. It did not change my score much but it may be worth investigating
vectorizer = FeatureUnion([
        ('description',TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=17000,
            **tfidf_para,
            preprocessor=get_col('description'))),
        ('title',CountVectorizer(
            ngram_range=(1, 2),
            stop_words = russian_stop,
            #max_features=7000,
            preprocessor=get_col('title')))
    ])
    
start_vect=time.time()

#Fit my vectorizer on the entire dataset instead of the training rows
#Score improved by .0001
vectorizer.fit(new_df.to_dict('records'))

ready_df = vectorizer.transform(new_df.to_dict('records'))
tfvocab = vectorizer.get_feature_names()
print("Vectorization Runtime: %0.2f Minutes"%((time.time() - start_vect)/60))

# Drop Text Cols
textfeats = ["description", "title"]
new_df.drop(textfeats, axis=1,inplace=True)

In [None]:
print('\nAll Data shape: {} Rows, {} Columns'.format(*ready_df.shape))

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

ridge_params = {'alpha':30.0, 'fit_intercept':True, 'normalize':False, 'copy_X':True,
                'max_iter':None, 'tol':0.001, 'solver':'auto', 'random_state':SEED}

#Ridge oof method from Faron's kernel
#I was using this to analyze my vectorization, but figured it would be interesting to add the results back into the dataset
#It doesn't really add much to the score, but it does help lightgbm converge faster
ridge = SklearnWrapper(clf=Ridge, seed = SEED, params = ridge_params)
ridge_oof_train, ridge_oof_test = get_oof(ridge, ready_df[:ntrain], y, ready_df[ntrain:])

rms = sqrt(mean_squared_error(y, ridge_oof_train))
print('Ridge OOF RMSE: {}'.format(rms))

print("Modeling Stage")

ridge_preds = np.concatenate([ridge_oof_train, ridge_oof_test])

new_df['ridge_preds'] = ridge_preds

In [None]:
tmp = new_df 
new_df.drop("item_id",axis=1, inplace=True)

In [None]:
new_df.dtypes

In [None]:
# Combine Dense Features with Sparse Text Bag of Words Features
X = hstack([csr_matrix(new_df.loc[traindex,:].values),ready_df[0:traindex.shape[0]]]) # Sparse Matrix
testing = hstack([csr_matrix(new_df.loc[testdex,:].values),ready_df[traindex.shape[0]:]])
tfvocab = new_df.columns.tolist() + tfvocab
for shape in [X,testing]:
    print("{} Rows and {} Cols".format(*shape.shape))
print("Feature Names Length: ",len(tfvocab))

gc.collect();

print("\nModeling Stage")

del ridge_preds,vectorizer,ready_df
gc.collect();
df.head()

In [None]:
print("Light Gradient Boosting Regressor")
lgbm_params =  {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    # 'max_depth': 15,
    'num_leaves': 270,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.75,
    'bagging_freq': 2,
    'learning_rate': 0.0175,
    'verbose': 0
}  


if VALID == True:
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.10, random_state=2018)
        
    # LGBM Dataset Formatting 
    lgtrain = lgb.Dataset(X_train, y_train,
                    feature_name=tfvocab,
                    categorical_feature = categorical)
    lgvalid = lgb.Dataset(X_valid, y_valid,
                    feature_name=tfvocab,
                    categorical_feature = categorical)
    del X, X_train; gc.collect()
    
    # Go Go Go
    lgb_clf = lgb.train(
        lgbm_params,
        lgtrain,
        num_boost_round=20000,
        valid_sets=[lgtrain, lgvalid],
        valid_names=['train','valid'],
        early_stopping_rounds=50,
        verbose_eval=100
    )
    print("Model Evaluation Stage")
    print('RMSE:', np.sqrt(metrics.mean_squared_error(y_valid, lgb_clf.predict(X_valid))))
    del X_valid ; gc.collect()

else:
    # LGBM Dataset Formatting 
    lgtrain = lgb.Dataset(X, y,
                    feature_name=tfvocab,
                    categorical_feature = categorical)
    del X; gc.collect()
    # Go Go Go
    lgb_clf = lgb.train(
        lgbm_params,
        lgtrain,
        num_boost_round=2250,
        verbose_eval=100
    )

In [None]:
# Feature Importance Plot
f, ax = plt.subplots(figsize=[7,10])
lgb.plot_importance(lgb_clf, max_num_features=20, ax=ax)
plt.title("Light GBM Feature Importance")
plt.savefig('feature_import.png')

print("Model Evaluation Stage")
lgpred = lgb_clf.predict(testing) 
print("Notebook Runtime: %0.2f Minutes"%((time.time() - notebookstart)/60))

In [None]:
#Mixing lightgbm with ridge. I haven't really tested if this improves the score or not
#blend = 0.95*lgpred + 0.05*ridge_oof_test[:,0]
lgsub = pd.DataFrame(lgpred,columns=["deal_probability"],index=testdex)
lgsub['deal_probability'].clip(0.0, 1.0, inplace=True) # Between 0 and 1
lgsub.to_csv("lgsub.csv",index=True,header=True)

In [None]:
lgsub.head()