# Create the features DF
* using by_postal_code dataset

# Import

In [23]:
import pandas as pd
import json
import numpy as np
from collections import Counter
import pickle
import math 
from sklearn.model_selection import train_test_split

from scipy import sparse as sp
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

In [24]:
RANDOM_STATE = 24

In [32]:
res_df = pd.read_pickle ('../data/ORI_by_postal_code_s_dropped.pkl')
postal_code_feature_dict = pickle.load(open("../data/postal_dict.pkl", "rb"))
all_df = pd.read_pickle('../Data/restaurant_only.pkl')

In [28]:
print(res_df.shape , all_df.shape)

(1848, 5) (387723, 20)


In [29]:
len(Counter(res_df.name))

405

In [9]:
len(postal_code_feature_dict)

750

In [10]:
# create relevance score
new_df = pd.DataFrame()
for i in Counter(res_df.name):
    tmp = res_df[res_df.name==i].sort_values(by=['review_count'] ,ascending=False)
    tmp['relevance']=''
    score = 12
    for idx , row in tmp.iterrows():
        tmp.relevance[idx] = score
        score -=1
    new_df = pd.concat([new_df,tmp])

# check the shape
if new_df.shape[0] != res_df.shape[0]:
    print(f'There is a mistake creating the relevance score')
else:
    print(f'Relevance score added')
res_df = new_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Relevance score added


In [None]:
# unhash if necessary
# create a overall postal_code features
# postal code and feature dict
postal_code_feature_dict = {}

for postal in Counter(all_df.postal_code):
    
    postal_code_feature_dict[postal]={}
    tmp = all_df[all_df.postal_code == postal].reset_index(drop=True)
    
    # density 
    postal_code_feature_dict[postal]['density'] = len(Counter(tmp.name))
    
    # neighborhood_entropy
    entropy_sum = 0
    for category in Counter(tmp.categories):
        entropy_sum+=(len(Counter(tmp[tmp.categories==category].name))/len(Counter(tmp.name)))\
        *np.log(len(Counter(tmp[tmp.categories==category].name))/len(Counter(tmp.name)))
    postal_code_feature_dict[postal]['entropy'] = -entropy_sum
    
    # area popularity
    postal_code_feature_dict[postal]['area_pop'] = len(tmp)
pickle.dump(postal_code_feature_dict, open("../data/postal_dict.pkl" , 'wb'))

In [12]:
# # # to check the features
# postal_df = pd.DataFrame(postal_code_feature_dict).T.reset_index().rename(columns={'index': 'postal_code'})
# postal_df
# # Counter(postal_df.complementary)

# Train Test Split with Feature Engineering

In [13]:
def create_new (new ,category, all_df, postal_df , res_name,postal):
    
    new = new.drop(columns=['density', 'entropy','area_pop'])
    
    new['postal_code'] = postal
    new = pd.DataFrame(new.merge(postal_df, on='postal_code', how='left'))
    new['relevance'] = 0
    # add competitiveness
    new['competitiveness']=''
    place = all_df[all_df.postal_code == postal].reset_index(drop=True)
    new['competitiveness'] = -(len(Counter(place[place.categories == category].name))\
                                  /len(Counter(place.name)))
    return new

## Pointwise

In [19]:
# # POINTWISE v3
# 每間正樣本 都取 9個負樣本
# # split train test 

train_df = pd.DataFrame()
test_df = pd.DataFrame()

postal_df = pd.DataFrame(postal_code_feature_dict).T.reset_index().rename(columns={'index': 'postal_code'})
postal_df = postal_df.sort_values(['postal_code'],ascending = [True]).reset_index(drop=True)

cnt = 0 

for i in Counter(res_df.name):
    
    if cnt%10 == 0:
        print(f'Now progress ... {cnt}')
    cnt +=1
    
    tmp = res_df[res_df.name==i].sort_values(['postal_code'],ascending = [False]).reset_index(drop=True)
    category = tmp.categories[0]
    
    postal_codes_list = list(postal_df.postal_code)
    
    # remove the known postal codes
    for postal in Counter(tmp.postal_code):
        postal_codes_list.remove(postal)

    # merge with location features
    tmp = pd.DataFrame(tmp.merge(postal_df, on='postal_code', how='left')).reset_index(drop=True)
    
    
    # add competitiveness and review 
    tmp['competitiveness'] = ''
    
    for j in range(len(tmp)):
        
        # competitiveness
        place = all_df[all_df.postal_code == tmp['postal_code'][j]]
        tmp['competitiveness'][j] = -(len(Counter(place[place.categories == category].name))/len(Counter(place.name)))

    tmp_train , tmp_test = train_test_split(tmp, test_size=0.33, shuffle= True,random_state=RANDOM_STATE)
    
    # Train
    neg_df = pd.DataFrame()
    for index, row in tmp_train.iterrows():
        cnt = 0 
        category = row.categories

        # add the negative samples
        for postal in postal_codes_list:
            if cnt <9:
                if postal in postal_codes_list and\
                abs(int(postal)- int(row['postal_code'])) <= 500:
                    cnt+=1
                    new = pd.DataFrame(row).T
                    new = create_new(new ,category, all_df, postal_df , i,postal)
                    neg_df = pd.concat([neg_df ,new ])
            else:
                break
    tmp_train = pd.concat([tmp_train ,neg_df ])
    
    # Test
    neg_df = pd.DataFrame()
    for index, row in tmp_test.iterrows():
        cnt = 0 
        category = row.categories

        # add the negative samples
        for postal in postal_codes_list:
            if cnt <9:
                if postal in postal_codes_list and\
                abs(int(postal)- int(row['postal_code'])) <= 500:
                    cnt+=1
                    new = pd.DataFrame(row).T
                    new = create_new(new ,category, all_df, postal_df , i,postal)
                    neg_df = pd.concat([neg_df ,new ])
            else:
                break
    tmp_test = pd.concat([tmp_test ,neg_df ])
    
    if len(tmp_test) + len(tmp_train) < len(tmp)*3:
        print(f'There is sth wrong with ... {i}')
        break
    
    train_df = pd.concat([train_df,tmp_train])
    test_df = pd.concat([test_df,tmp_test])
print(f'Finished constructing....  train res cnt  : {len(Counter(train_df.name))} test res cnt : {len(Counter(test_df.name))}')

Now progress ... 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Finished constructing....  train res cnt  : 405 test res cnt : 405


In [20]:
train_df.shape , test_df.shape

((11328, 10), (7101, 10))

In [16]:
for i in ['density', 'entropy', 'competitiveness','area_pop', 'relevance']:
    train_df[i] = train_df[i].astype('float')
    test_df[i] = test_df[i].astype('float')
train_df.to_pickle('../data/Train_by_postoal_code_without_review_pointwise_v3_3.pkl')
test_df.to_pickle('../data/Test_by_postoal_code_without_review_pointwise_v3_3.pkl')