# Data Splitting

## Import 

In [1]:
import pandas as pd
import json
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import pickle
import math 

In [2]:
df = pd.read_pickle('/home/adam/Steph_C/my_thesis/data/FE_by_postoal_code_without_review.pkl')

In [3]:
df.shape

(1450, 24)

In [4]:
df.columns

Index(['business_id', 'stars_x', 'useful', 'funny', 'cool', 'text', 'date',
       'name', 'address', 'city', 'state', 'postal_code', 'stars_y',
       'review_count', 'is_open', 'attributes', 'categories', 'hours',
       'density', 'entropy', 'competitiveness', 'area_pop', 'accessibility',
       'complementary'],
      dtype='object')

In [5]:
len(Counter(df.name))

566

In [6]:
# remove the restaurant that opens out of usa

for i in Counter(df.name):
    tmp = df[df.name==i]
    
    for j in Counter(tmp.postal_code):
        if not j.isdigit():
            print(f'Found it ... {i}')
            df = df[~df.name.isin(tmp.name)]
            break

Found it ... Cinnaholic
Found it ... Cactus Club Cafe


In [7]:
len(Counter(df.name))

564

## DF Construction
* add relevance

In [8]:
# sort by name and review count 
df = df.sort_values(['name', 'review_count'],
              ascending = [True, False]).reset_index(drop=True)

In [9]:
# create relevance score
cnt = 0
new_df = pd.DataFrame()
for i in Counter(df.name):
    tmp = df[df.name==i].reset_index(drop=True)
    tmp['relevance']=''
    score = 6
    for j in range(len(tmp)):
        tmp['relevance'][j]=score
        score -=1
    new_df = pd.concat([new_df,tmp])

# check the shape
if new_df.shape[0] != df.shape[0]:
    print(f'There is a mistake creating the relevance score')
else:
    print(f'Relevance score added')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Relevance score added


In [10]:
new_df = new_df.drop(columns=[ 'business_id','city','state', 'review_count','categories','stars_x', 'useful', 'funny', 'cool', 'text',
       'date','address','is_open', 'stars_y','attributes','hours'])

## Create Dataset

In [11]:
new_df.columns

Index(['name', 'postal_code', 'density', 'entropy', 'competitiveness',
       'area_pop', 'accessibility', 'complementary', 'relevance'],
      dtype='object')

In [12]:
# postal code and feature dict
postal_code_feature_dict = {}

for postal in Counter(df.postal_code):
    postal_code_feature_dict[postal]={}
    postal_code_feature_dict[postal]['density'] = df.loc[df['postal_code']==postal]['density'].iloc[0]
    postal_code_feature_dict[postal]['entropy'] = df.loc[df['postal_code']==postal]['entropy'].iloc[0]
    postal_code_feature_dict[postal]['competitiveness'] = df.loc[df['postal_code']==postal]['competitiveness'].iloc[0]
    postal_code_feature_dict[postal]['area_pop'] = df.loc[df['postal_code']==postal]['area_pop'].iloc[0]
    postal_code_feature_dict[postal]['accessibility'] = df.loc[df['postal_code']==postal]['accessibility'].iloc[0]
    postal_code_feature_dict[postal]['complementary'] = df.loc[df['postal_code']==postal]['complementary'].iloc[0]

pd.DataFrame(postal_code_feature_dict).T.reset_index().rename(columns={'index': 'postal_code'})

Unnamed: 0,postal_code,density,entropy,competitiveness,area_pop,accessibility,complementary
0,46225,6.0,1.791759,-0.166667,1371.0,0.0,0.0
1,46037,6.0,1.791759,-0.166667,1813.0,0.0,0.0
2,33618,13.0,2.564949,-0.076923,2939.0,0.0,0.0
3,33609,11.0,2.397895,-0.090909,3440.0,0.0,0.0
4,63126,3.0,1.098612,-0.333333,369.0,0.0,0.0
...,...,...,...,...,...,...,...
331,18969,1.0,-0.000000,-1.000000,83.0,0.0,0.0
332,46235,1.0,-0.000000,-1.000000,85.0,0.0,0.0
333,08055,1.0,-0.000000,-1.000000,116.0,0.0,0.0
334,18915,1.0,-0.000000,-1.000000,90.0,0.0,0.0


In [14]:
# split train test 
# 拿每一個餐廳一半的分店當作 testing set (雖然大部分只有兩家分店)

cnt = 0
train_df = pd.DataFrame()
test_df = pd.DataFrame()
other_places = pd.DataFrame(postal_code_feature_dict).T.reset_index().rename(columns={'index': 'postal_code'})

for i in Counter(df.name):
    tmp = new_df[new_df.name==i]
    if len(tmp) %2 ==0:
        row_count = int(len(tmp)/2)
    else:
        row_count = int(len(tmp)/2)+1
    train_df = pd.concat([train_df ,tmp.iloc[:-row_count,:]]).reset_index(drop = True)
    test_df = pd.concat([test_df ,tmp.iloc[-row_count:,:]]).reset_index(drop = True)
    
    # reconstruct
    postal_codes = list(other_places.postal_code)
    
    ## train 
    for j in Counter(train_df.postal_code):
        cnt = 0
        concat_index =[]
        for k in range(len(other_places)):
            if cnt <5:
                if other_places[k].postal_code in postal_codes and abs(int(other_places[k].postal_code)-int(j)) <= 5000:
                    postal_codes.remove(other_places[k].postal_code)
                    cnt+=1
                    concat_index.append(k)
        others = other_places.iloc[[concat_index]]
        others['relevance'] = 0
        others['name'] = train_df.name[j]
        train_df = pd.concat([train_df ,others ])
    
    ## test 
    for j in Counter(test_df.postal_code):
        cnt = 0
        concat_index =[]
        for k in range(len(other_places)):
            if cnt <5:
                if other_places[k].postal_code in postal_codes and abs(int(other_places[k].postal_code)-int(j)) <= 5000:
                    postal_codes.remove(other_places[k].postal_code)
                    cnt+=1
                    concat_index.append(k)
        others = other_places.iloc[[concat_index]]
        others['relevance'] = 0
        others['name'] = test_df.name[j]
        test_df = pd.concat([test_df ,others ])

KeyError: 0

In [None]:
# reconstruct testing set

new_test = pd.DataFrame()

for i in Counter(df.name):
#     print(i)
    subset = pd.DataFrame(postal_code_feature_dict).T.reset_index().rename(columns={'index': 'postal_code'})
    subset['relevance']=0
    subset = subset[~subset['postal_code'].isin(train_df[train_df.name==i].postal_code)]
#     print(f'After deleting train : {subset.shape}')
    subset = subset[~subset['postal_code'].isin(test_df[test_df.name==i].postal_code)]
#     print(f'After deleting test : {subset.shape}')
    subset['name']=i
    subset = pd.concat([subset , test_df[test_df.name==i][['name','postal_code','density', 'entropy',\
                                                           'competitiveness','area_pop', 'accessibility',\
                                                           'complementary','relevance']]])
#     print(f'After adding test : {subset.shape}')
    
    # check if the test is constructed properly
    if len(subset) + len(Counter(train_df[train_df.name==i].postal_code)) != len(Counter(df.postal_code)):
        print(f'{i} has some problem constructing the testing dataset')
        print(subset.shape , Counter(df[df.name==i].postal_code))
        break
     
    
    new_test = pd.concat([new_test , subset])

In [None]:
for i in ['density', 'entropy', 'competitiveness','area_pop', 'accessibility','complementary','relevance']:
    train_df[i] = train_df[i].astype('float')
    new_test[i] = new_test[i].astype('float')