In [1]:
import time
notebookstart= time.time()

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc

# Models Packages
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Gradient Boosting
import lightgbm as lgb
from sklearn.linear_model import Ridge
from sklearn.cross_validation import KFold

# Tf-Idf
#import nltk
#nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from scipy.sparse import hstack, csr_matrix
from nltk.corpus import stopwords 

# LDA
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

# Viz
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string

NFOLDS = 5
SEED = 1234
VALID = True



In [2]:
def cleanName(text):
    try:
        textProc = text.lower()
        textProc = re.sub('[!@#$_“”¨«»®´·º½¾¿¡§£₤‘’]', '', textProc)
        textProc = " ".join(textProc.split())
        return textProc
    except: 
        return "name error"
    
    
def rmse(y, y0):
    assert len(y) == len(y0)
    return np.sqrt(np.mean(np.power((y - y0), 2)))

In [3]:
print("\nData Load Stage")
training = pd.read_csv('/home/stanaya/.kaggle/competitions/avito-demand-prediction/train.csv', index_col = "item_id", parse_dates = ["activation_date"])
traindex = training.index
testing = pd.read_csv('/home/stanaya/.kaggle/competitions/avito-demand-prediction/test.csv', index_col = "item_id", parse_dates = ["activation_date"])
testdex = testing.index

ntrain = training.shape[0]
ntest = testing.shape[0]

kf = KFold(ntrain, n_folds=NFOLDS, shuffle=True, random_state=SEED)

y = training.deal_probability.copy()
training.drop("deal_probability",axis=1, inplace=True)
print('Train shape: {} Rows, {} Columns'.format(*training.shape))
print('Test shape: {} Rows, {} Columns'.format(*testing.shape))


Data Load Stage
Train shape: 1503424 Rows, 16 Columns
Test shape: 508438 Rows, 16 Columns


In [4]:
# 学習データとテストデータを統合
print("Combine Train and Test")
df = pd.concat([training,testing],axis=0)
del training, testing
gc.collect()
print('\nAll Data shape: {} Rows, {} Columns'.format(*df.shape))

Combine Train and Test

All Data shape: 2011862 Rows, 16 Columns


In [5]:
print("\nText Features")

# Feature Engineering 

# Meta Text Features
textfeats = ["description", "title"]
# 句読点、括弧の数を特徴量にする
df['desc_punc'] = df['description'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))

# タイトルと説明文を正規化する
df['title'] = df['title'].apply(lambda x: cleanName(x))
df["description"]   = df["description"].apply(lambda x: cleanName(x))

# タイトルと説明文の全単語数とユニークな単語種類、さらにそれらの比を撮ったものを加える
for cols in textfeats:
    df[cols] = df[cols].astype(str) 
    df[cols] = df[cols].astype(str).fillna('missing') # FILL NA
    df[cols] = df[cols].str.lower() # Lowercase all text, so that capitalized words dont get treated differently
    df[cols + '_num_words'] = df[cols].apply(lambda comment: len(comment.split())) # Count number of Words
    df[cols + '_num_unique_words'] = df[cols].apply(lambda comment: len(set(w for w in comment.split())))
    df[cols + '_words_vs_unique'] = df[cols+'_num_unique_words'] / df[cols+'_num_words'] * 100 # Count Unique Words


Text Features


In [6]:
## 
df_text = df[["title", "description"]]

In [7]:
dic_replace_title = {',': ' ', '(':' ', ')': ' ', '.':' '}
dic_replace_desc = {',': ' ', '(':' ', ')': ' ', '.':' ', '\n': '', '\r': '', '/': ''}

def replaceTitleSomeCharSplit(s):
    return list(filter(None, s.translate(str.maketrans(dic_replace_title)).split(" ")))

def replaceDescSomeCharSplit(s):
    return list(filter(None, s.translate(str.maketrans(dic_replace_desc)).split(" ")))


def replaceTitleSomeChar(s):
    return s.translate(str.maketrans(dic_replace_title))

def replaceDescSomeChar(s):
    return s.translate(str.maketrans(dic_replace_desc))

In [8]:
df_text["title_split"] = df_text["title"].map(replaceTitleSomeChar)
df_text["desc_split"] = df_text["description"].map(replaceDescSomeChar)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [9]:
df_text_split = df_text[['title_split', 'desc_split']]

In [10]:
corpusTitle = df_text_split['title_split'].as_matrix()
corpusDesc = df_text_split['desc_split'].as_matrix()

In [11]:
print(len(corpusTitle))
print(len(corpusDesc))

2011862
2011862


In [12]:
gc.collect()

0

In [13]:
## Topic Model
print("\nTopic Modeling!")
NUM_TOPICS = 100
russian_stop = set(stopwords.words('russian'))
def get_col(col_name): return lambda x: x[col_name]

#vectorizer = CountVectorizer(min_df=5, max_df=0.9, 
#                            stop_words='russian', lowercase=True)
vectorizer = CountVectorizer(
            stop_words = russian_stop
            #,max_df=10
            #,min_df=9
            )
data_title_vectorized = vectorizer.fit_transform(corpusTitle)
data_desc_vectorized = vectorizer.fit_transform(corpusDesc)


Topic Modeling!


In [14]:
features = vectorizer.get_feature_names()
print(len(features))
#print(features)

782920


In [15]:
## Topic Model
#print("\nTopic Modeling!")
#NUM_TOPICS = 10
#russian_stop = set(stopwords.words('russian'))
#def get_col(col_name): return lambda x: x[col_name]

 
#vectorizer = CountVectorizer(min_df=5, max_df=0.9, 
#                            stop_words='russian', lowercase=True)
#vectorizer = CountVectorizer(
 #           ngram_range=(1, 2),
#            stop_words = russian_stop,
            #max_features=7000,
  #          preprocessor=get_col('title'))
#data_vectorized = vectorizer.fit_transform(df.to_dict('records'))

In [16]:
start_topic=time.time()
# Build a Latent Dirichlet Allocation Model
#lda_model = LatentDirichletAllocation(n_topics=NUM_TOPICS, max_iter=10, learning_method='online')
#lda_Z = lda_model.fit_transform(data_title_vectorized)
#end_lda=time.time()
#print("LDA Runtime: %0.2f Minutes"%((end_lda - start_topic)/60))
#print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
 
# Build a Non-Negative Matrix Factorization Model
#nmf_model = NMF(n_components=NUM_TOPICS)
#nmf_Z = nmf_model.fit_transform(data_title_vectorized)
#end_nmf=time.time()
#print("NMF Runtime: %0.2f Minutes"%((end_nmf - start_topic)/60))
#print(nmf_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
 
# Build a Latent Semantic Indexing Model
lsi_title_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_title_Z = lsi_title_model.fit_transform(data_title_vectorized)
end_title_lsi=time.time()
print("LSI title Runtime: %0.2f Minutes"%((end_title_lsi - start_topic)/60))

lsi_desc_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_desc_Z = lsi_desc_model.fit_transform(data_desc_vectorized)
end_desc_lsi=time.time()
print("LSI desc Runtime: %0.2f Minutes"%((end_desc_lsi - start_topic)/60))

LSI title Runtime: 2.53 Minutes
LSI desc Runtime: 7.42 Minutes


In [17]:
start_time_norm = time.time()
for idx in range(len(lsi_title_Z)):
    lsi_title_Z[idx] = lsi_title_Z[idx]/sum(lsi_title_Z[idx])
    lsi_desc_Z[idx] = lsi_desc_Z[idx]/sum(lsi_desc_Z[idx])
    #arr = np.append(arr, tmp, axis=0)
    if idx% 100000 == 0:
        print(idx)
        print("LSI norm Runtime: %0.2f Minutes"%((time.time() - start_time_norm)/60))
        gc.collect()
        


0
LSI norm Runtime: 0.00 Minutes


  after removing the cwd from sys.path.
  This is separate from the ipykernel package so we can avoid doing imports until


100000
LSI norm Runtime: 0.05 Minutes
200000
LSI norm Runtime: 0.09 Minutes
300000
LSI norm Runtime: 0.14 Minutes
400000
LSI norm Runtime: 0.19 Minutes
500000
LSI norm Runtime: 0.24 Minutes
600000
LSI norm Runtime: 0.28 Minutes
700000
LSI norm Runtime: 0.33 Minutes
800000
LSI norm Runtime: 0.38 Minutes
900000
LSI norm Runtime: 0.42 Minutes
1000000
LSI norm Runtime: 0.47 Minutes
1100000
LSI norm Runtime: 0.51 Minutes
1200000
LSI norm Runtime: 0.56 Minutes
1300000
LSI norm Runtime: 0.61 Minutes
1400000
LSI norm Runtime: 0.66 Minutes
1500000
LSI norm Runtime: 0.70 Minutes
1600000
LSI norm Runtime: 0.75 Minutes
1700000
LSI norm Runtime: 0.79 Minutes
1800000
LSI norm Runtime: 0.84 Minutes
1900000
LSI norm Runtime: 0.89 Minutes
2000000
LSI norm Runtime: 0.94 Minutes


In [18]:
header_title_lsi = []
header_desc_lsi = []
for ele in range(NUM_TOPICS):
    name_title_header = 'topic_title_lsi_' + str(ele)
    name_desc_header= 'topic_desc_lsi_' + str(ele)
    header_title_lsi.append(name_title_header)
    header_desc_lsi.append(name_desc_header)

df_title_lsi = pd.DataFrame(lsi_title_Z, columns=header_title_lsi)
df_desc_lsi = pd.DataFrame(lsi_desc_Z, columns=header_desc_lsi)

In [20]:
df_text_split = pd.concat([df_text_split,df_title_lsi], axis=1, join_axes=[df_text_split.index])
df_text_split = pd.concat([df_text_split,df_desc_lsi], axis=1, join_axes=[df_text_split.index])

In [23]:
df_text_split = df_text_split.reset_index()

In [24]:
df_text_split.drop("index",axis=1, inplace=True)

In [25]:
df_text_split.head()

Unnamed: 0,item_id,title_split,desc_split,topic_title_lsi_0,topic_title_lsi_1,topic_title_lsi_2,topic_title_lsi_3,topic_title_lsi_4,topic_title_lsi_5,topic_title_lsi_6,...,topic_desc_lsi_90,topic_desc_lsi_91,topic_desc_lsi_92,topic_desc_lsi_93,topic_desc_lsi_94,topic_desc_lsi_95,topic_desc_lsi_96,topic_desc_lsi_97,topic_desc_lsi_98,topic_desc_lsi_99
0,b912c3c6a6ad,кокоби кокон для сна,кокон для сна малыша пользовались меньше месяц...,4.1e-05,0.000135,0.018664,-0.002817,0.006994,0.000175,0.008364305,...,-0.320578,0.092686,-0.012933,0.006781,0.095726,-0.039784,0.024395,-0.008363,0.102374,0.159177
1,2dac0150717d,стойка для одежды,стойка для одежды под вешалки с бутика,0.000144,0.000248,0.010141,-0.000449,0.006938,0.000374,0.007666534,...,0.290425,-0.139372,0.321101,0.647477,-0.158279,0.187638,0.399396,0.04412,-0.27152,0.231213
2,ba83aefab5dc,philips bluray,в хорошем состоянии домашний кинотеатр с blu ...,0.000426,0.000846,0.021709,-0.005156,0.001238,0.00151,0.004733695,...,-0.096071,0.001399,-0.037947,0.019197,0.044987,-0.038936,-0.027052,0.044617,0.094011,0.045286
3,02996f1dd2ea,автокресло,продам кресло от0-25кг,0.000409,0.000637,0.029797,-0.002182,-0.00022,0.001087,0.003410935,...,0.117723,-0.074474,0.14615,0.003283,0.10473,0.289469,0.08007,-0.104985,-0.011115,0.075279
4,7c90be56d2ab,ваз 2110 2003,все вопросы по телефону,4e-06,9e-06,0.000223,-5.7e-05,9e-06,5.6e-05,4.392105e-07,...,-0.06062,0.108844,0.131563,0.044364,0.04647,0.065159,-0.030323,0.033243,0.170228,-0.031962


In [26]:
df_text_split.drop(["title_split", "desc_split"],axis=1, inplace=True)

In [27]:
df_text_split.to_csv("topic_feature_lsi.csv")