In [1]:
import time
notebookstart= time.time()

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc

# Models Packages
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Gradient Boosting
import lightgbm as lgb
from sklearn.linear_model import Ridge
from sklearn.cross_validation import KFold

# Tf-Idf
#import nltk
#nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from scipy.sparse import hstack, csr_matrix
from nltk.corpus import stopwords 

# LDA
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

# Viz
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string

NFOLDS = 5
SEED = 1234
VALID = True



In [2]:
def cleanName(text):
    try:
        textProc = text.lower()
        textProc = re.sub('[!@#$_“”¨«»®´·º½¾¿¡§£₤‘’]', '', textProc)
        textProc = " ".join(textProc.split())
        return textProc
    except: 
        return "name error"
    
    
def rmse(y, y0):
    assert len(y) == len(y0)
    return np.sqrt(np.mean(np.power((y - y0), 2)))

In [3]:
print("\nData Load Stage")
training = pd.read_csv('/home/stanaya/.kaggle/competitions/avito-demand-prediction/train.csv', index_col = "item_id", parse_dates = ["activation_date"])
traindex = training.index
testing = pd.read_csv('/home/stanaya/.kaggle/competitions/avito-demand-prediction/test.csv', index_col = "item_id", parse_dates = ["activation_date"])
testdex = testing.index

ntrain = training.shape[0]
ntest = testing.shape[0]

kf = KFold(ntrain, n_folds=NFOLDS, shuffle=True, random_state=SEED)

y = training.deal_probability.copy()
training.drop("deal_probability",axis=1, inplace=True)
print('Train shape: {} Rows, {} Columns'.format(*training.shape))
print('Test shape: {} Rows, {} Columns'.format(*testing.shape))


Data Load Stage
Train shape: 1503424 Rows, 16 Columns
Test shape: 508438 Rows, 16 Columns


In [4]:
# 学習データとテストデータを統合
print("Combine Train and Test")
df = pd.concat([training,testing],axis=0)
del training, testing
gc.collect()
print('\nAll Data shape: {} Rows, {} Columns'.format(*df.shape))

Combine Train and Test

All Data shape: 2011862 Rows, 16 Columns


In [5]:
print("\nText Features")

# タイトルと説明文を正規化する
df['title'] = df['title'].apply(lambda x: cleanName(x))
df["description"]   = df["description"].apply(lambda x: cleanName(x))


Text Features


In [6]:
## 
df_text = df[["title", "description"]]

In [7]:
dic_replace_title = {',': ' ', '(':' ', ')': ' ', '.':' '}
dic_replace_desc = {',': ' ', '(':' ', ')': ' ', '.':' ', '\n': '', '\r': '', '/': ''}

def replaceTitleSomeCharSplit(s):
    return list(filter(None, s.translate(str.maketrans(dic_replace_title)).split(" ")))

def replaceDescSomeCharSplit(s):
    return list(filter(None, s.translate(str.maketrans(dic_replace_desc)).split(" ")))


def replaceTitleSomeChar(s):
    return s.translate(str.maketrans(dic_replace_title))

def replaceDescSomeChar(s):
    return s.translate(str.maketrans(dic_replace_desc))

In [8]:
df_text["title_split"] = df_text["title"].map(replaceTitleSomeChar)
df_text["desc_split"] = df_text["description"].map(replaceDescSomeChar)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [9]:
df_text_split = df_text[['title_split', 'desc_split']]

In [10]:
corpusTitle = df_text_split['title_split'].as_matrix()
corpusDesc = df_text_split['desc_split'].as_matrix()

In [11]:
print(len(corpusTitle))
print(len(corpusDesc))

2011862
2011862


In [12]:
gc.collect()

0

In [13]:
## Topic Model
print("\nTopic Modeling!")
NUM_TOPICS = 150
russian_stop = set(stopwords.words('russian'))
def get_col(col_name): return lambda x: x[col_name]

#vectorizer = CountVectorizer(min_df=5, max_df=0.9, 
#                            stop_words='russian', lowercase=True)
vectorizer = CountVectorizer(
            stop_words = russian_stop
            #,max_df=10
            #,min_df=9
            )
data_title_vectorized = vectorizer.fit_transform(corpusTitle)
data_desc_vectorized = vectorizer.fit_transform(corpusDesc)


Topic Modeling!


In [14]:
start_topic=time.time()
# Build a Latent Dirichlet Allocation Model
#lda_title_model = LatentDirichletAllocation(n_topics=NUM_TOPICS, max_iter=10, learning_method='online')
#lda_title_Z = lda_title_model.fit_transform(data_title_vectorized)
#end_titile_lda=time.time()
#print("LDA title Runtime: %0.2f Minutes"%((end_title_lda - start_topic)/60))
#print(lda_title_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
 
# Build a Latent Dirichlet Allocation Model
#lda_desc_model = LatentDirichletAllocation(n_topics=NUM_TOPICS, max_iter=10, learning_method='online')
#lda_desc_Z = lda_desc_model.fit_transform(data_desc_vectorized)
#end_desc_lda=time.time()
#print("LDA desc Runtime: %0.2f Minutes"%((end_desc_lda - start_topic)/60))
#print(lda_desc_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
    
# ----------------NMF______________________
# Build a Non-Negative Matrix Factorization Model
nmf_title_model = NMF(n_components=NUM_TOPICS)
nmf_title_Z = nmf_title_model.fit_transform(data_title_vectorized)
end_title_nmf=time.time()
print("NMF title Runtime: %0.2f Minutes"%((end_title_nmf - start_topic)/60))
print(nmf_title_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Build a Non-Negative Matrix Factorization Model
nmf_desc_model = NMF(n_components=NUM_TOPICS)
nmf_desc_Z = nmf_desc_model.fit_transform(data_desc_vectorized)
end_desc_nmf=time.time()
print("NMF desc Runtime: %0.2f Minutes"%((end_desc_nmf - start_topic)/60))
print(nmf_desc_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
    
# ----------------LSI______________________
# Build a Latent Semantic Indexing Model
lsi_title_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_title_Z = lsi_title_model.fit_transform(data_title_vectorized)
end_title_lsi=time.time()
print("LSI title Runtime: %0.2f Minutes"%((end_title_lsi - start_topic)/60))

lsi_desc_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_desc_Z = lsi_desc_model.fit_transform(data_desc_vectorized)
end_desc_lsi=time.time()
print("LSI desc Runtime: %0.2f Minutes"%((end_desc_lsi - start_topic)/60))

NMF title Runtime: 80.20 Minutes
(2011862, 150)
NMF desc Runtime: 473.53 Minutes
(2011862, 150)
LSI title Runtime: 477.62 Minutes
LSI desc Runtime: 485.40 Minutes


In [15]:
gc.collect()

0

In [35]:
df_text_split.reset_index(inplace=True)
df_text_split.head()

Unnamed: 0,item_id,title_split,desc_split
0,b912c3c6a6ad,кокоби кокон для сна,кокон для сна малыша пользовались меньше месяц...
1,2dac0150717d,стойка для одежды,стойка для одежды под вешалки с бутика
2,ba83aefab5dc,philips bluray,в хорошем состоянии домашний кинотеатр с blu ...
3,02996f1dd2ea,автокресло,продам кресло от0-25кг
4,7c90be56d2ab,ваз 2110 2003,все вопросы по телефону


## LSI

In [16]:
header_title_lsi = []
header_desc_lsi = []
for ele in range(NUM_TOPICS):
    name_title_header = 'topic_title_lsi_' + str(ele)
    name_desc_header= 'topic_desc_lsi_' + str(ele)
    header_title_lsi.append(name_title_header)
    header_desc_lsi.append(name_desc_header)

### raw

In [17]:
df_title_lsi_raw = pd.DataFrame(lsi_title_Z, columns=header_title_lsi)
df_desc_lsi_raw = pd.DataFrame(lsi_desc_Z, columns=header_desc_lsi)

In [32]:
df_title_lsi_raw.head()

Unnamed: 0,topic_title_lsi_0,topic_title_lsi_1,topic_title_lsi_2,topic_title_lsi_3,topic_title_lsi_4,topic_title_lsi_5,topic_title_lsi_6,topic_title_lsi_7,topic_title_lsi_8,topic_title_lsi_9,...,topic_title_lsi_140,topic_title_lsi_141,topic_title_lsi_142,topic_title_lsi_143,topic_title_lsi_144,topic_title_lsi_145,topic_title_lsi_146,topic_title_lsi_147,topic_title_lsi_148,topic_title_lsi_149
0,4.46695e-07,1e-06,0.000205,-3.1e-05,7.7e-05,2e-06,9.208965e-05,0.000281,4e-05,-8.128158e-07,...,-0.00023,-0.000141,0.00034,-9.3e-05,-0.000152,4.2e-05,7.5e-05,4.7e-05,0.000138,-0.000148
1,1.461599e-05,2.5e-05,0.001028,-4.6e-05,0.000703,3.8e-05,0.0007766625,0.000717,0.000709,-5.242536e-05,...,-0.011297,-0.005327,0.004596,0.013403,-0.008797,0.041857,0.008435,-0.005928,0.022667,0.008257
2,2.259193e-05,4.5e-05,0.001152,-0.000274,6.6e-05,8e-05,0.0002500923,0.000254,0.000673,-0.0003941915,...,-0.01983,0.003019,0.004891,-0.002084,-0.000401,-0.031497,-0.019775,-0.009301,0.012202,0.012413
3,2.571375e-05,4e-05,0.001875,-0.000137,-1.4e-05,6.8e-05,0.0002133406,-0.000236,6e-05,-0.0002694031,...,0.001331,-0.000614,0.059375,-0.004363,0.004447,-0.021,-0.032003,-0.015284,0.016629,-0.030513
4,2.96344e-06,6e-06,0.000165,-4.2e-05,8e-06,4.5e-05,9.6742e-07,3.1e-05,9.9e-05,-0.0001558822,...,0.017734,-0.052789,-0.072579,0.005977,-0.06338,-0.016005,-0.02395,-0.050888,-0.038587,-0.058424


In [36]:
df_text_split_lsi_title_raw = pd.concat([df_text_split,df_title_lsi_raw], axis=1, join_axes=[df_text_split.index])
df_text_split_lsi_desc_raw = pd.concat([df_text_split,df_desc_lsi_raw], axis=1, join_axes=[df_text_split.index])

In [37]:
df_text_split_lsi_title_raw.head()

Unnamed: 0,item_id,title_split,desc_split,topic_title_lsi_0,topic_title_lsi_1,topic_title_lsi_2,topic_title_lsi_3,topic_title_lsi_4,topic_title_lsi_5,topic_title_lsi_6,...,topic_title_lsi_140,topic_title_lsi_141,topic_title_lsi_142,topic_title_lsi_143,topic_title_lsi_144,topic_title_lsi_145,topic_title_lsi_146,topic_title_lsi_147,topic_title_lsi_148,topic_title_lsi_149
0,b912c3c6a6ad,кокоби кокон для сна,кокон для сна малыша пользовались меньше месяц...,4.46695e-07,1e-06,0.000205,-3.1e-05,7.7e-05,2e-06,9.208965e-05,...,-0.00023,-0.000141,0.00034,-9.3e-05,-0.000152,4.2e-05,7.5e-05,4.7e-05,0.000138,-0.000148
1,2dac0150717d,стойка для одежды,стойка для одежды под вешалки с бутика,1.461599e-05,2.5e-05,0.001028,-4.6e-05,0.000703,3.8e-05,0.0007766625,...,-0.011297,-0.005327,0.004596,0.013403,-0.008797,0.041857,0.008435,-0.005928,0.022667,0.008257
2,ba83aefab5dc,philips bluray,в хорошем состоянии домашний кинотеатр с blu ...,2.259193e-05,4.5e-05,0.001152,-0.000274,6.6e-05,8e-05,0.0002500923,...,-0.01983,0.003019,0.004891,-0.002084,-0.000401,-0.031497,-0.019775,-0.009301,0.012202,0.012413
3,02996f1dd2ea,автокресло,продам кресло от0-25кг,2.571375e-05,4e-05,0.001875,-0.000137,-1.4e-05,6.8e-05,0.0002133406,...,0.001331,-0.000614,0.059375,-0.004363,0.004447,-0.021,-0.032003,-0.015284,0.016629,-0.030513
4,7c90be56d2ab,ваз 2110 2003,все вопросы по телефону,2.96344e-06,6e-06,0.000165,-4.2e-05,8e-06,4.5e-05,9.6742e-07,...,0.017734,-0.052789,-0.072579,0.005977,-0.06338,-0.016005,-0.02395,-0.050888,-0.038587,-0.058424


In [38]:
df_text_split_lsi_title_raw.drop(["title_split", "desc_split"],axis=1, inplace=True)
df_text_split_lsi_desc_raw.drop(["title_split", "desc_split"],axis=1, inplace=True)

In [39]:
df_text_split_lsi_title_raw.to_csv("topic_feature_lsi_title_150_raw.csv")
df_text_split_lsi_desc_raw.to_csv("topic_feature_lsi_desc_150_raw.csv")

In [44]:
gc.collect()
del df_text_split_lsi_title_raw, df_text_split_lsi_desc_raw

### standardization

In [45]:
lsi_title_mean =  lsi_title_Z.mean(axis=None, keepdims=True)
lsi_title_std =  np.std(lsi_title_Z, axis=None, keepdims=True)
lsi_title_Z_standardized = (lsi_title_Z - lsi_title_mean)/lsi_title_std 

lsi_desc_mean =  lsi_desc_Z.mean(axis=None, keepdims=True)
lsi_desc_std =  np.std(lsi_desc_Z, axis=None, keepdims=True)
lsi_desc_Z_standardized = (lsi_desc_Z - lsi_desc_mean)/lsi_desc_std 

In [46]:
df_title_lsi_std = pd.DataFrame(lsi_title_Z_standardized, columns=header_title_lsi)
df_desc_lsi_std = pd.DataFrame(lsi_desc_Z_standardized, columns=header_desc_lsi)

In [47]:
df_desc_lsi_std.head()

Unnamed: 0,topic_desc_lsi_0,topic_desc_lsi_1,topic_desc_lsi_2,topic_desc_lsi_3,topic_desc_lsi_4,topic_desc_lsi_5,topic_desc_lsi_6,topic_desc_lsi_7,topic_desc_lsi_8,topic_desc_lsi_9,...,topic_desc_lsi_140,topic_desc_lsi_141,topic_desc_lsi_142,topic_desc_lsi_143,topic_desc_lsi_144,topic_desc_lsi_145,topic_desc_lsi_146,topic_desc_lsi_147,topic_desc_lsi_148,topic_desc_lsi_149
0,0.575693,-0.200597,0.147491,-0.065854,-0.246037,-0.308027,0.213128,0.144641,-0.06297,-0.454792,...,0.114932,0.19969,-0.055556,0.153896,0.158628,0.126903,-0.046211,-0.025009,0.027029,-0.054844
1,0.043156,-0.00757,-0.004664,-0.012814,-0.004345,-0.021396,0.03796,0.013845,-0.007403,-0.024744,...,0.002313,0.01577,0.003198,-0.001928,-0.039729,0.04514,-0.040571,-0.00576,-0.069099,0.030218
2,2.456613,-2.997672,-2.277076,2.783959,-1.591286,2.39797,-0.476348,-0.476099,-0.634329,-0.28607,...,-0.165793,-0.098852,0.397577,0.252602,-0.571445,-0.63905,0.164636,0.613233,-0.464188,-0.051101
3,0.966323,-0.929918,-0.754909,0.862697,0.149734,-1.951043,-0.712723,-1.022977,3.052847,0.799691,...,-0.083661,-0.046769,-0.025853,-0.03994,0.075409,-0.101272,0.066854,0.036339,-0.049221,0.001487
4,0.324021,-0.112061,-0.228051,-0.190532,-0.080047,0.03009,0.526186,-0.219282,0.187206,-0.008575,...,-0.112104,-0.203921,0.133356,-0.067169,0.055804,-0.221666,-0.071042,0.027818,-0.048152,0.038734


In [48]:
df_text_split_lsi_title_std = pd.concat([df_text_split,df_title_lsi_std], axis=1, join_axes=[df_text_split.index])
df_text_split_lsi_desc_std = pd.concat([df_text_split,df_desc_lsi_std], axis=1, join_axes=[df_text_split.index])

In [49]:
df_text_split_lsi_title_std.drop(["title_split", "desc_split"],axis=1, inplace=True)
df_text_split_lsi_desc_std.drop(["title_split", "desc_split"],axis=1, inplace=True)

In [50]:
df_text_split_lsi_title_std.to_csv("topic_feature_lsi_title_150_std.csv")
df_text_split_lsi_desc_std.to_csv("topic_feature_lsi_desc_150_std.csv")

In [51]:
gc.collect()
del df_text_split_lsi_title_std, df_text_split_lsi_desc_std

## NMF

In [52]:
header_title_nmf = []
header_desc_nmf = []
for ele in range(NUM_TOPICS):
    name_title_header = 'topic_title_nmf_' + str(ele)
    name_desc_header= 'topic_desc_nmf_' + str(ele)
    header_title_nmf.append(name_title_header)
    header_desc_nmf.append(name_desc_header)

### raw

In [53]:
df_title_nmf_raw = pd.DataFrame(nmf_title_Z, columns=header_title_nmf)
df_desc_nmf_raw = pd.DataFrame(nmf_desc_Z, columns=header_desc_nmf)

In [54]:
df_title_nmf_raw.head()

Unnamed: 0,topic_title_nmf_0,topic_title_nmf_1,topic_title_nmf_2,topic_title_nmf_3,topic_title_nmf_4,topic_title_nmf_5,topic_title_nmf_6,topic_title_nmf_7,topic_title_nmf_8,topic_title_nmf_9,...,topic_title_nmf_140,topic_title_nmf_141,topic_title_nmf_142,topic_title_nmf_143,topic_title_nmf_144,topic_title_nmf_145,topic_title_nmf_146,topic_title_nmf_147,topic_title_nmf_148,topic_title_nmf_149
0,0.0,0.0,6e-06,0.0,0.0,0.0,2e-06,0.0,6e-06,0.0,...,3e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,6e-06,0.0,0.0,0.0,1e-06,0.0,0.0,0.0,...,2.5e-05,0.0,0.000182,0.0,0.0,0.0,0.000111,0.0,0.0,0.0
2,0.0,0.0,7e-06,0.0,0.0,0.0,1e-05,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00154,0.0,0.000507,0.0,0.0
3,0.0,0.0,6.3e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001605,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000998,0.0,0.0,0.000247,0.0,0.0,0.0,0.0,0.0


In [55]:
df_text_split_nmf_title_raw = pd.concat([df_text_split,df_title_nmf_raw], axis=1, join_axes=[df_text_split.index])
df_text_split_nmf_desc_raw = pd.concat([df_text_split,df_desc_nmf_raw], axis=1, join_axes=[df_text_split.index])

In [56]:
df_text_split_nmf_title_raw.head()
df_text_split_nmf_desc_raw.head()

Unnamed: 0,item_id,title_split,desc_split,topic_desc_nmf_0,topic_desc_nmf_1,topic_desc_nmf_2,topic_desc_nmf_3,topic_desc_nmf_4,topic_desc_nmf_5,topic_desc_nmf_6,...,topic_desc_nmf_140,topic_desc_nmf_141,topic_desc_nmf_142,topic_desc_nmf_143,topic_desc_nmf_144,topic_desc_nmf_145,topic_desc_nmf_146,topic_desc_nmf_147,topic_desc_nmf_148,topic_desc_nmf_149
0,b912c3c6a6ad,кокоби кокон для сна,кокон для сна малыша пользовались меньше месяц...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000128,0.0,0.00107
1,2dac0150717d,стойка для одежды,стойка для одежды под вешалки с бутика,0.0,1.5e-05,0.0,0.0,0.0,0.00015,0.000413,...,0.0,0.001162,0.000867,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ba83aefab5dc,philips bluray,в хорошем состоянии домашний кинотеатр с blu ...,0.004159,0.000118,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.4e-05
3,02996f1dd2ea,автокресло,продам кресло от0-25кг,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7c90be56d2ab,ваз 2110 2003,все вопросы по телефону,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
df_text_split_nmf_title_raw.drop(["title_split", "desc_split"],axis=1, inplace=True)
df_text_split_nmf_title_raw.to_csv("topic_feature_nmf_title_150_raw.csv")

df_text_split_nmf_desc_raw.drop(["title_split", "desc_split"],axis=1, inplace=True)
df_text_split_nmf_desc_raw.to_csv("topic_feature_nmf_desc_150_raw.csv")

### standardization

In [None]:
#start_time_norm = time.time()
#for idx in range(len(lsi_title_Z)):
    #lsi_title_Z[idx] = lsi_title_Z[idx]/sum(lsi_title_Z[idx])
    #lsi_desc_Z[idx] = lsi_desc_Z[idx]/sum(lsi_desc_Z[idx])
    #arr = np.append(arr, tmp, axis=0)
    #if idx% 100000 == 0:
    #    print(idx)
   #     print("LSI norm Runtime: %0.2f Minutes"%((time.time() - start_time_norm)/60))
   #     gc.collect()
        
