# Create the features DF
* using by_postal_code dataset

# Import

In [1]:
import pandas as pd
import json
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import pickle
import math 
from sklearn.model_selection import train_test_split

from scipy import sparse as sp
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

In [2]:
RANDOM_STATE = 24

In [3]:
res_df = pd.read_pickle ('/home/adam/Steph_C/my_thesis/data/ORI_by_postal_code_s_dropped.pkl').reset_index(drop=True)
venue_df = pd.read_pickle ('../Data/yelp/other_venues.pkl').reset_index(drop=True)
all_df = pd.read_pickle('../Data/yelp/restaurant_only.pkl')

In [4]:
print(res_df.shape , venue_df.shape , all_df.shape)

(192547, 20) (700010, 20) (2112553, 20)


In [5]:
# drop duplicates
df = pd.concat([all_df,venue_df]) # for check in calculation
res_df = res_df.drop_duplicates(subset=['name','postal_code']).reset_index(drop=True)
venue_df = venue_df.drop_duplicates(subset=['name','postal_code']).reset_index(drop=True)
all_df = all_df.drop_duplicates(subset=['name','postal_code']).reset_index(drop=True)

In [6]:
# drop postal_codes not in U.S
drop_postal = [i for i in range(len(all_df)) if not all_df.postal_code[i].isdigit()]
all_df = all_df.drop(drop_postal).reset_index(drop=True)

In [7]:
print(res_df.shape , venue_df.shape , df.shape , all_df.shape)

(1848, 20) (14791, 20) (2812563, 20) (19901, 20)


In [8]:
len(Counter(res_df.name))

405

In [9]:
# create relevance score
cnt = 0
new_df = pd.DataFrame()
for i in Counter(res_df.name):
    tmp = res_df[res_df.name==i].reset_index(drop=True)
    tmp['relevance']=''
    score = 12
    for j in range(len(tmp)):
        tmp['relevance'][j]=score
        score -=1
    new_df = pd.concat([new_df,tmp])

# check the shape
if new_df.shape[0] != res_df.shape[0]:
    print(f'There is a mistake creating the relevance score')
else:
    print(f'Relevance score added')
res_df = new_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Relevance score added


In [10]:
# create a overall postal_code features
# postal code and feature dict
postal_code_feature_dict = {}

for postal in Counter(all_df.postal_code):
    
    postal_code_feature_dict[postal]={}
    tmp = df[df.postal_code == postal].reset_index(drop=True)
    tmp_venue = venue_df[venue_df == postal].reset_index(drop=True)
    
    # density 
    postal_code_feature_dict[postal]['density'] = len(Counter(tmp.name))
    
    # neighborhood_entropy
    entropy_sum = 0
    for category in Counter(tmp.categories):
        entropy_sum+=(len(Counter(tmp[tmp.categories==category].name))/len(Counter(tmp.name)))\
        *np.log(len(Counter(tmp[tmp.categories==category].name))/len(Counter(tmp.name)))
    postal_code_feature_dict[postal]['entropy'] = -entropy_sum
    
    # area popularity
    postal_code_feature_dict[postal]['area_pop'] = len(tmp)
    
    # traffic accessibility + complementary
    transportation_cnt = 0
    dep_cnt = 0
    parking_cnt =0
    
    store_cnt = len(Counter(tmp_venue.name))
    
    for i in range(len(tmp_venue)):
        try:
            if 'Public Transportation' in tmp_venue.categories[i]:
                transportation_cnt+=1
            if 'Department Stores' in tmp_venue.categories[i]:
                dep_cnt+=1
            if 'Parking' in tmp_venue.categories[i]:
                parking_cnt+=1
        except:
            pass
    
    postal_code_feature_dict[postal]['accessibility'] = transportation_cnt
    if store_cnt >1:
        postal_code_feature_dict[postal]['complementary'] = (2*(dep_cnt+parking_cnt) )/(store_cnt*(store_cnt-1)) 
    else:
        postal_code_feature_dict[postal]['complementary']=0

In [11]:
len(postal_code_feature_dict)

750

In [12]:
# # # to check the features
# postal_df = pd.DataFrame(postal_code_feature_dict).T.reset_index().rename(columns={'index': 'postal_code'})
# postal_df
# # Counter(postal_df.complementary)

# Train Test Split with Feature Engineering

In [13]:
def create_new (new ,category,postal, all_df, postal_df):
    
    new = new.drop(columns=['density', 'entropy','area_pop', 'accessibility', 'complementary'])
    new['postal_code'] = postal
    new = pd.DataFrame(new.merge(postal_df, on='postal_code', how='left'))
    new['relevance'] = 0
    # add competitiveness
    new['competitiveness']=''
    place = all_df[all_df.postal_code == postal].reset_index(drop=True)
    new['competitiveness'] = -(len(Counter(place[place.categories == category].name))\
                                  /len(Counter(place.name)))
    # TO-DO Add reviews
    
    return new

## Pointwise

In [14]:
# # POINTWISE v1
# # split train test 
# # 拿每一個餐廳一半的分店當作 testing set (雖然大部分只有兩家分店)

# cnt = 0
# train_df = pd.DataFrame()
# test_df = pd.DataFrame()
# pos_train_post = Counter()
# neg_train_post ={}

# for i in Counter(res_df.name):
    
    
#     tmp = res_df[res_df.name==i]
#     tmp_train , tmp_test = train_test_split(tmp, test_size=0.33, shuffle= True,random_state=RANDOM_STATE)
    
#     postal_df = pd.DataFrame(postal_code_feature_dict).T.reset_index().rename(columns={'index': 'postal_code'})
#     postal_df = postal_df.sort_values(['postal_code'],
#               ascending = [True]).reset_index(drop=True)
    
    
#     # merge with location features
#     tmp_train = pd.DataFrame(tmp_train.merge(postal_df, on='postal_code', how='left')).reset_index(drop=True)
#     tmp_test = pd.DataFrame(tmp_test.merge(postal_df, on='postal_code', how='left')).reset_index(drop=True)
    
#     pos_train_post += Counter(tmp_train.postal_code)
#     ori_train, ori_test = tmp_train.shape , tmp_test.shape
    
#     ############################## reconstruct ################################
#     postal_codes_list_train = list(postal_df.postal_code)
#     postal_codes_list_test= list(postal_df.postal_code)
    
    
#     for j in Counter(tmp_train.postal_code):
#         postal_codes_list_train.remove(j)
#         postal_codes_list_test.remove(j)
    
#     for j in Counter(tmp_test.postal_code):
#         postal_codes_list_train.remove(j)
#         postal_codes_list_test.remove(j)
    
    
#     ## train 
#     tmp_train['competitiveness'] =''
#     neg_df = pd.DataFrame()
    
#     for j in range(len(tmp_train)):
        
#         cnt =0 
#         category = tmp_train.categories[j]

        
#         # add competitiveness
#         place = all_df[all_df.postal_code == tmp_train['postal_code'][j]]
#         tmp_train['competitiveness'][j] = -(len(Counter(place[place.categories == category].name))/len(Counter(place.name)))
    
#         ## TO-DO Add reviews 
        
#         # add the negative samples
#         for k in range(len(postal_df)):
#             if cnt<2:
#                 if postal_df.postal_code[k] in postal_codes_list_train and\
#                 abs(int(postal_df.postal_code[k])- int(tmp_train['postal_code'][j])) <= 1000:
#                     cnt+=1
#                     new = pd.DataFrame(tmp_train.iloc[j]).T.reset_index(drop=True)
#                     new = create_new (new ,category, all_df, postal_df)
#                     neg_df = pd.concat([neg_df ,new ])
    
#                     ##### create a negative sample dict
#                     if postal_df.postal_code[k] in neg_train_post:
#                         neg_train_post[postal_df.postal_code[k]] +=1
#                     else:
#                         neg_train_post[postal_df.postal_code[k]]=1
#             else:
#                 break
    
#     tmp_train = pd.concat([tmp_train ,neg_df ])
    
#     ## test 
    
#     tmp_test['competitiveness'] =''
    
#     neg_df = pd.DataFrame()
#     for j in range(len(tmp_test)):
#         cnt =0
#         category = tmp_test.categories[j]
#         # add competitiveness
#         place = all_df[all_df.postal_code == tmp_test['postal_code'][j]]
#         tmp_test['competitiveness'][j] = -(len(Counter(place[place.categories == tmp_test.categories[j]].name))/len(Counter(place.name)))
        
#         # TO-DO add review 
        
#         # add negative samples
#         for k in range(len(postal_df)):
#             if cnt<2:
#                 if postal_df.postal_code[k] in postal_codes_list_test and\
#                 abs(int(postal_df.postal_code[k])-int(tmp_test['postal_code'][j])) <= 1000:
#                     cnt+=1
#                     new = pd.DataFrame(tmp_test.iloc[j]).T.reset_index(drop=True)
#                     new = create_new (new ,category, all_df, postal_df)
#                     neg_df = pd.concat([neg_df ,new ])
#             else:
#                 break
                
#     tmp_test = pd.concat([tmp_test ,neg_df ])   
#     if int(tmp_train.shape[0]) != int(ori_train[0]*3) or int(tmp_test.shape[0]) != int(ori_test[0]*3):
#         print('Original: ' ,ori_train, ori_test )
#         print('After : ',tmp_train.shape[0] , tmp_test.shape[0])
#         print(f'There is sth wrong !!!!!!!!')
#         break
    
#     train_df =  pd.concat([train_df ,tmp_train ])
#     test_df =  pd.concat([test_df ,tmp_test ])
# print(f'Finished constructing....  train : {len(Counter(train_df.name))} test : {len(Counter(test_df.name))}')


In [15]:
# # # POINTWISE v2
# # # 取每間餐廳分店中郵遞區號最大，取其 +-500 的所有地區 再去切 train test
# # # split train test 
# train_df = pd.DataFrame()
# test_df = pd.DataFrame()

# for i in Counter(res_df.name):
    
#     tmp = res_df[res_df.name==i].sort_values(['postal_code'],ascending = [False]).reset_index(drop=True)
#     max_postal = max(tmp.postal_code.astype(int))
#     category = tmp.categories[0]
#     postal_df = pd.DataFrame(postal_code_feature_dict).T.reset_index().rename(columns={'index': 'postal_code'})
#     postal_df = postal_df.sort_values(['postal_code'],ascending = [True]).reset_index(drop=True)
#     postal_codes_list = list(postal_df.postal_code)
    
#     # remove the known postal codes
#     for postal in Counter(tmp.postal_code):
#         postal_codes_list.remove(postal)

#     # merge with location features
#     tmp = pd.DataFrame(tmp.merge(postal_df, on='postal_code', how='left')).reset_index(drop=True)
    
#     # add competitiveness and review (TO-DO)
#     tmp['competitiveness']= ''
# #     tmp['review']= ''
#     for j in range(len(tmp)):
        
#         # competitiveness
#         place = all_df[all_df.postal_code == tmp['postal_code'][j]]
#         tmp['competitiveness'][j] = -(len(Counter(place[place.categories == category].name))/len(Counter(place.name)))

    
#     # create negative 
#     neg_df = pd.DataFrame()
#     cnt = 0 
    
#     for j in range(len(postal_df)):
#         if cnt > 20 :
#             break
#         if postal_df.postal_code[j] in postal_codes_list and \
#         abs(max_postal - int(postal_df.postal_code[j])) <= 500:
#             cnt+=1
#             new = pd.DataFrame(tmp.iloc[0]).T.reset_index(drop=True)
#             postal_codes_list.remove(postal_df.postal_code[j])
#             new = create_new (new ,category, all_df, postal_df)
#             neg_df = pd.concat([neg_df ,new ])

#     if len(neg_df)<3:
#         print(f'Something is wrong with the shape of the neg sample for {i}')
#         break
    
#     # train test split
#     tmp_train , tmp_test = train_test_split(tmp, test_size=0.33, shuffle= True,random_state=RANDOM_STATE)
#     neg_train , neg_test = train_test_split(neg_df, test_size=0.33, shuffle= True,random_state=RANDOM_STATE)
    
    
#     # merge with neg
#     tmp_train = pd.concat([tmp_train,neg_train])
#     tmp_test = pd.concat([tmp_test,neg_test])
    
#     train_df = pd.concat([train_df,tmp_train])
#     test_df = pd.concat([test_df,tmp_test])


In [71]:
# # POINTWISE v3
# 2-4 間分店用方法一取負樣本，其他參照方法二（解決2-4間分店開在非常遙遠的地方）
# # split train test 

train_df = pd.DataFrame()
test_df = pd.DataFrame()
wrong_list =[]

for i in Counter(res_df.name):
    
    category = tmp.categories[0]
    
    tmp = res_df[res_df.name==i].sort_values(['postal_code'],ascending = [False]).reset_index(drop=True)
    postal_df = pd.DataFrame(postal_code_feature_dict).T.reset_index().rename(columns={'index': 'postal_code'})
    postal_df = postal_df.sort_values(['postal_code'],ascending = [True]).reset_index(drop=True)
    postal_codes_list = list(postal_df.postal_code)
    
    # remove the known postal codes
    for postal in Counter(tmp.postal_code):
        postal_codes_list.remove(postal)

    # merge with location features
    tmp = pd.DataFrame(tmp.merge(postal_df, on='postal_code', how='left')).reset_index(drop=True)
    
    # add competitiveness and review (TO-DO)
    tmp['competitiveness']= ''
#     tmp['review']= ''
    for j in range(len(tmp)):
        
        # competitiveness
        place = all_df[all_df.postal_code == tmp['postal_code'][j]]
        tmp['competitiveness'][j] = -(len(Counter(place[place.categories == category].name))/len(Counter(place.name)))
        
        # TO-DO add review

    # 看差距過大的店家
    if max(tmp.postal_code.astype(int))- min(tmp.postal_code.astype(int)) > 1000:

        # train test split
        tmp_train , tmp_test = train_test_split(tmp, test_size=0.33, shuffle= True,random_state=RANDOM_STATE)
        tmp_train = tmp_train.reset_index(drop=True)
        tmp_test = tmp_test.reset_index(drop=True)
        
        ## train 
        neg_df = pd.DataFrame()

        for j in range(len(tmp_train)):

            cnt =0 
            category = tmp_train.categories[j]

            # add the negative samples
            for k in range(len(postal_df)):
                if cnt<4:
                    if postal_df.postal_code[k] in postal_codes_list and\
                    abs(int(postal_df.postal_code[k])- int(tmp_train['postal_code'][j])) <= 500:
                        cnt+=1
                        new = pd.DataFrame(tmp_train.iloc[j]).T.reset_index(drop=True)
                        postal_codes_list.remove(postal_df.postal_code[k])
                        new = create_new (new ,category,postal_df.postal_code[k], all_df, postal_df)
                        neg_df = pd.concat([neg_df ,new ])
                else:
                    break
        
        if len(neg_df)< 4:
            wrong_list.append(4 - len(neg_df))
        tmp_train = pd.concat([tmp_train ,neg_df ])

        ## test 

        neg_df = pd.DataFrame()
        for j in range(len(tmp_test)):
            cnt =0
            category = tmp_test.categories[j]

            # add negative samples
            for k in range(len(postal_df)):
                if cnt<4:
                    if postal_df.postal_code[k] in postal_codes_list and\
                    abs(int(postal_df.postal_code[k])-int(tmp_test['postal_code'][j])) <= 500:
                        cnt+=1
                        new = pd.DataFrame(tmp_test.iloc[j]).T.reset_index(drop=True)
                        postal_codes_list.remove(postal_df.postal_code[k])
                        new = create_new (new ,category,postal_df.postal_code[k], all_df, postal_df)
                        neg_df = pd.concat([neg_df ,new ])
                else:
                    break
        if len(neg_df)< 4:
            wrong_list.append(4 - len(neg_df))
        tmp_test = pd.concat([tmp_test ,neg_df ])
        
    else:
        # create negative 
        neg_df = pd.DataFrame()
        cnt = 0 

        for j in range(len(postal_df)):
            if cnt > 4*len(tmp) :
                break
            if postal_df.postal_code[j] in postal_codes_list and \
            abs(max(tmp.postal_code.astype(int)) - int(postal_df.postal_code[j])) <= 500:
                cnt+=1
                new = pd.DataFrame(tmp.iloc[0]).T.reset_index(drop=True)
                new = new.drop(columns=['density', 'entropy','area_pop', 'accessibility', 'complementary'])
                new['postal_code'] = postal_df.postal_code[j]
                postal_codes_list.remove(postal_df.postal_code[j])
                new = pd.DataFrame(new.merge(postal_df, on='postal_code', how='left'))
                new['relevance'] = 0
                # add competitiveness
                new['competitiveness']=''
                place = all_df[all_df.postal_code == postal_df.postal_code[j]].reset_index(drop=True)
                new['competitiveness'] = -(len(Counter(place[place.categories ==category].name))/len(Counter(place.name)))

                # TO-DO add review 
                neg_df = pd.concat([neg_df ,new ])
        

        if len(neg_df)< 4*len(tmp):
            wrong_list.append(4*len(tmp) - len(neg_df))
        
        # train test split
        tmp_train , tmp_test = train_test_split(tmp, test_size=0.33, shuffle= True,random_state=RANDOM_STATE)
        neg_train , neg_test = train_test_split(neg_df, test_size=0.33, shuffle= True,random_state=RANDOM_STATE)
        # merge with neg
        tmp_train = pd.concat([tmp_train,neg_train])
        tmp_test = pd.concat([tmp_test,neg_test])

    train_df = pd.concat([train_df,tmp_train])
    test_df = pd.concat([test_df,tmp_test])
print(f'Finished constructing....  train res cnt  : {len(Counter(train_df.name))} test res cnt : {len(Counter(test_df.name))}')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Finished constructing....  train res cnt  : 405 test res cnt : 405


In [72]:
train_df.shape , test_df.shape

((5766, 27), (3473, 27))

In [73]:
print(len(wrong_list), np.mean(wrong_list))

16 9.9375


In [74]:
wrong_list

[5, 4, 20, 5, 1, 23, 5, 19, 20, 9, 10, 5, 20, 5, 4, 4]

In [75]:
for i in ['density', 'entropy', 'competitiveness','area_pop', 'accessibility','complementary','relevance']:
    train_df[i] = train_df[i].astype('float')
    test_df[i] = test_df[i].astype('float')
train_df.to_pickle('/home/adam/Steph_C/my_thesis/data/Train_by_postoal_code_without_review_pointwise_v3_3.pkl')
test_df.to_pickle('/home/adam/Steph_C/my_thesis/data/Test_by_postoal_code_without_review_pointwise_v3_3.pkl')