In [42]:
import pandas as pd
import numpy as np
# progress bar for long-running loops (tqdm only reports progress, it does not make the loop faster)
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
import re
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import plot_importance
import matplotlib.pyplot as plt
import pickle 

In [2]:
# Load the training and testing CSV files (in that order) from the working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [3]:
# Stack the training rows on top of the test rows so that the feature
# engineering below is applied to both sets at once. The original row labels
# are kept, so index labels are not unique in the combined frame.
combined_data = pd.concat((train_data, test_data), axis='index')
combined_data.shape

(2011862, 18)

In [None]:
# Replace missing image_top_1 values with the sentinel -999.
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on a column selection is chained assignment and does not update the frame
# when pandas Copy-on-Write is active.
combined_data["image_top_1"] = combined_data["image_top_1"].fillna(-999)

In [4]:
# Text-like columns: fill missing values with the literal 'NaN' *before*
# casting to str. Casting first turns a missing value into the string 'nan',
# after which fillna has nothing left to fill.
text_feature = ['param_1','param_2','param_3',"description", "title"]
for cols in text_feature:
    combined_data[cols] = combined_data[cols].fillna('NaN').astype(str)

In [6]:
# price feature: natural log of (price + 0.001), then missing prices are
# imputed with the mean of the transformed column (computed over train and
# test rows together).
# NOTE: this cell is not idempotent - re-running it takes the log a second time.
log_price = np.log(combined_data["price"] + 0.001)
# Assign the filled series back instead of using fillna(inplace=True) on a
# column selection, which does not reach the frame under Copy-on-Write.
combined_data["price"] = log_price.fillna(log_price.mean())

In [8]:
# Calendar features (day of month, weekday, ISO week) derived from
# activation_date. train_data needs them for the group statistics computed
# later; combined_data needs them as model inputs.
def _add_calendar_features(frame):
    """Parse ``activation_date`` in place and add calendar columns to ``frame``.

    Adds ``day_of_month``, ``day_of_week`` (Monday == 0) and ``week_of_year``
    (ISO week number). Returns ``None`` so that calling it as the last
    statement of a cell does not display the whole frame.
    """
    dates = pd.to_datetime(frame['activation_date'])
    frame['activation_date'] = dates
    # Vectorised .dt accessors replace the per-row apply(lambda ...) calls.
    frame['day_of_month'] = dates.dt.day
    frame['day_of_week'] = dates.dt.weekday
    # Series.dt.week was removed in pandas 2.0; isocalendar().week is the
    # documented replacement. It returns a nullable UInt32 column, so cast to a
    # plain integer dtype (assumes no missing activation dates - the cast
    # raises if any are present).
    frame['week_of_year'] = dates.dt.isocalendar().week.astype('int64')


_add_calendar_features(train_data)
# converted for whole dataset
_add_calendar_features(combined_data)

In [12]:
category_column = ['region','city','parent_category_name','category_name','user_type','image_top_1','item_seq_number','day_of_month','day_of_week','week_of_year']

# For every grouping column, compute the per-group mean and standard deviation
# of deal_probability and of price on the training rows, then map those
# statistics onto every row of combined_data.
#
# Group values that do not occur in train_data, and groups whose std is
# undefined, map to NaN and are replaced with 0.
#
# Notes:
# * train_data['price'] is the raw price read from the CSV; only
#   combined_data['price'] is transformed elsewhere in this notebook.
# * NOTE(review): the deal_probability statistics are computed from all
#   training rows, including rows that are later held out for validation, so
#   validation scores may be optimistic - confirm whether that is acceptable.
for target in ('deal_probability', 'price'):
    for item in tqdm(category_column):
        groupBy = train_data.groupby(item)[target]
        for stat_name, stat in (('mean', groupBy.mean()), ('std', groupBy.std())):
            feature = item + '_' + target + '_' + stat_name
            # Assign the filled series directly; fillna(inplace=True) on a
            # column selection does not update the frame under Copy-on-Write.
            combined_data[feature] = combined_data[item].map(stat).fillna(0)

100%|██████████| 10/10 [00:05<00:00,  1.80it/s]
100%|██████████| 10/10 [00:04<00:00,  2.05it/s]


In [14]:
# Compute description tf-idf for whole dataset

# Size of the tf-idf vocabulary; try different values here.
N_DESCRIPTION_TERMS = 100

# fill NaN as blank
combined_data['description'] = combined_data['description'].fillna(' ')
tfidf_vectorizer = TfidfVectorizer(max_features=N_DESCRIPTION_TERMS,
                                   stop_words=stopwords.words('russian'))
# toarray() yields a plain ndarray (todense() returns the discouraged
# np.matrix type); the values are then stored as float16.
tfidf_matrix = (
    tfidf_vectorizer.fit_transform(combined_data['description'])
    .toarray()
    .astype(np.float16)
)
# assign tf-idf into dataframe; iterate over the columns actually produced,
# which can be fewer than max_features when the vocabulary is small
for i in range(tfidf_matrix.shape[1]):
    combined_data['tf_idf_' + str(i)] = tfidf_matrix[:, i]

In [15]:
# Compute title tf-idf for whole dataset

# Size of the tf-idf vocabulary; try different values here.
N_TITLE_TERMS = 50

# fill NaN as blank
combined_data['title'] = combined_data['title'].fillna(' ')
tfidf_vectorizer = TfidfVectorizer(max_features=N_TITLE_TERMS,
                                   stop_words=stopwords.words('russian'))
# toarray() yields a plain ndarray (todense() returns the discouraged
# np.matrix type); the values are then stored as float16.
tfidf_matrix = (
    tfidf_vectorizer.fit_transform(combined_data['title'])
    .toarray()
    .astype(np.float16)
)
# assign tf-idf into dataframe; iterate over the columns actually produced,
# which can be fewer than max_features when the vocabulary is small
for i in range(tfidf_matrix.shape[1]):
    combined_data['tf_idf_title_' + str(i)] = tfidf_matrix[:, i]

In [16]:
# Encode labels with value between 0 and n-classes-1 for whole dataset.
# astype(str) turns a missing value into the string 'nan', so missing entries
# are encoded as a class of their own and no separate fillna step is needed
# (a bare `column.fillna(...)` whose result is not assigned has no effect).
category_column_new = ['region','city','parent_category_name','category_name','user_type','param_1','param_2','param_3','image_top_1']
for item in category_column_new:
    combined_data[item] = LabelEncoder().fit_transform(combined_data[item].astype(str))

In [19]:
# Special characters stripped from titles and descriptions.
_SPECIAL_CHARS = re.compile('[!@#$_“”¨«»®´·º½¾¿¡§£₤‘’]')


def clean_word(text):
    """Lower-case ``text``, remove special characters and collapse whitespace.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text, or the placeholder ``"name error"`` when ``text`` is
        not a string (for example ``None`` or a float NaN).
    """
    # An explicit type check replaces the former bare ``except:``, which also
    # swallowed KeyboardInterrupt and any genuine programming error.
    if not isinstance(text, str):
        return "name error"
    stripped = _SPECIAL_CHARS.sub('', text.lower())
    # split() + join collapses runs of whitespace and trims both ends.
    return " ".join(stripped.split())

In [31]:
# Normalise both free-text columns with clean_word (lower-cased, special
# characters removed, whitespace collapsed).
combined_data['title'] = combined_data['title'].apply(clean_word)
combined_data["description"] = combined_data["description"].apply(clean_word)

text_feature = ["description", "title"]

# Per-column word statistics: word count, distinct-word count and the share of
# distinct words in percent.
for cols in text_feature:
    text = combined_data[cols].astype(str)
    combined_data[cols] = text
    words = text.str.split()
    n_words = words.str.len()
    n_unique = words.apply(lambda tokens: len(set(tokens)))
    combined_data[cols + '_length'] = n_words
    combined_data[cols + '_num_unique_words'] = n_unique
    # Empty text gives 0 / 0 = NaN; report 0 percent instead. The filled
    # series is assigned directly because fillna(inplace=True) on a column
    # selection does not update the frame under Copy-on-Write.
    combined_data[cols + '_unique_percentage'] = (n_unique / n_words * 100).fillna(0)

In [22]:
import pickle 

def combine_dataframe(df1, df2):
    """Stack the rows of ``df2`` underneath ``df1`` and return a new object.

    Neither input is modified, and the original index labels of both inputs
    are kept.
    """
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the equivalent that works on every pandas version.
    return pd.concat([df1, df2])

def combine_feature(d1,d2):
    """Place the columns of ``d2`` to the right of ``d1``, aligned on the index."""
    side_by_side = (d1, d2)
    return pd.concat(side_by_side, axis='columns')

In [23]:
# Load the pickled "period" frames for train and test (presumably features
# prepared by another notebook - verify their origin) and stack them train-first.
# Pickle files can execute code when loaded; only read files you trust.
train_period, test_period = (
    pd.read_pickle(pickle_name) for pickle_name in ('period.p', 'test_period.p')
)
combined_period = combine_dataframe(train_period, test_period)

In [24]:
# Load the pickled "price rank" frames for train and test (presumably features
# prepared by another notebook - verify their origin) and stack them train-first.
# Pickle files can execute code when loaded; only read files you trust.
train_price_rank, test_price_rank = (
    pd.read_pickle(pickle_name)
    for pickle_name in ('price_rank_train.p', 'price_rank_test.p')
)
combined_price_rank = combine_dataframe(train_price_rank, test_price_rank)

In [25]:
# Load the pickled frames 'df_train' / 'df_test' (presumably image-derived
# features, judging by the variable names - verify) and stack them train-first.
# Pickle files can execute code when loaded; only read files you trust.
train_IMG, test_IMG = (
    pd.read_pickle(pickle_name) for pickle_name in ('df_train', 'df_test')
)
combined_IMG = combine_dataframe(train_IMG, test_IMG)

In [32]:
# Join all feature frames column-wise. The join aligns on the index, so the
# frames must share the same row labels in the same order.
feature_frames = [combined_data, combined_period, combined_price_rank, combined_IMG]
result = pd.concat(feature_frames, axis='columns')

In [36]:
# Drop identifier and raw-text columns that are not fed to the model.
final_data = result.drop(
    columns=['user_id', 'description', 'image', 'item_id', 'title'],
)

In [34]:
#a = result.drop(['deal_probability','image'],axis=1)

In [30]:
#a.isnull().sum()

activation_date                                    0
category_name                                      0
city                                               0
description                                        0
image                                         155197
image_top_1                                        0
item_id                                            0
item_seq_number                                    0
param_1                                            0
param_2                                            0
param_3                                            0
parent_category_name                               0
price                                              0
region                                             0
title                                              0
user_id                                            0
user_type                                          0
day_of_month                                       0
day_of_week                                   

In [39]:
# True if any column other than deal_probability and image still holds a missing value.
result.drop(columns=['deal_probability', 'image']).isna().values.any()

False

In [37]:
# Per-column flag: does the column contain at least one missing value?
final_data.isnull().any(axis=0)

activation_date                               False
category_name                                 False
city                                          False
deal_probability                               True
image_top_1                                   False
item_seq_number                               False
param_1                                       False
param_2                                       False
param_3                                       False
parent_category_name                          False
price                                         False
region                                        False
user_type                                     False
day_of_month                                  False
day_of_week                                   False
week_of_year                                  False
region_deal_probability_mean                  False
region_deal_probability_std                   False
city_deal_probability_mean                    False
city_deal_pr

In [40]:
# Persist the assembled feature table so later runs can skip feature engineering.
final_data.to_pickle(path='final_data_V1.p')

In [43]:
# Rows activated on or before 2017-04-07 form the modelling set; rows from
# 2017-04-08 onwards are set aside as X_test.
# NOTE(review): this presumably separates the labelled training rows from the
# unlabelled test rows by date - confirm against the data.
X = final_data.loc[final_data.activation_date<=pd.to_datetime('2017-04-07')]
X_test = final_data.loc[final_data.activation_date>=pd.to_datetime('2017-04-08')]

# Target, then remove the target and the raw date from both feature matrices.
Y = X.deal_probability
X = X.drop(['activation_date', 'deal_probability'], axis=1)
X_test = X_test.drop(['activation_date', 'deal_probability'], axis=1)

# split train and validation; a fixed random_state makes the split - and every
# score reported below - reproducible across kernel restarts
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=0.2, random_state=42)

In [44]:
from sklearn.linear_model import LinearRegression

# Baseline regressor: linear regression with scikit-learn's default settings.
clf = LinearRegression()

In [45]:
# Train the model on the training split (features X_train, target Y_train).
clf.fit(X=X_train, y=Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [47]:
# Coefficient of determination (R^2) on the training split.
clf.score(X=X_train, y=Y_train)

0.19712851149796884

In [48]:
# Coefficient of determination (R^2) on the validation split.
clf.score(X=X_validation, y=Y_validation)

0.19709747791083443

In [49]:
# Model predictions for the rows it was trained on.
Y_train_predict = clf.predict(X=X_train)

In [52]:
from sklearn.metrics import mean_squared_error
# Training RMSE. scikit-learn's signature is mean_squared_error(y_true, y_pred),
# so the ground truth is passed first (the value is the same either way).
np.sqrt(mean_squared_error(Y_train, Y_train_predict))

0.2330299656985343

In [53]:
# Model predictions for the held-out validation rows.
Y_validat_predict = clf.predict(X=X_validation)

In [54]:
# Validation RMSE. scikit-learn's signature is mean_squared_error(y_true, y_pred),
# so the ground truth is passed first (the value is the same either way).
np.sqrt(mean_squared_error(Y_validation, Y_validat_predict))

0.2330762616375686