In [8]:
import os
import re
from decimal import Decimal, ROUND_HALF_UP
from collections import defaultdict
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import matplotlib

import seaborn as sns
from sklearn.preprocessing import LabelEncoder

import lightgbm as lgbm
import xgboost as xgb

import joblib
from sklearn.metrics import mean_squared_error

import pandas_profiling as pdp

import warnings
warnings.filterwarnings('ignore')

In [9]:
# Adjust the directories and file names below to match your environment.

# Directory holding train.csv / test.csv
INPUT_DIR = './input'
# Destination for models / submissions. Kept relative to the notebook, like
# INPUT_DIR, so it is the same './output' directory the later cells write
# amenities.pickle into (an absolute '/output' points at the filesystem root).
OUTPUT_DIR = './output'

# File names
MODEL_NAME = 'first_commit'
CSV_NAME = 'first_commit'
# Create the output directory up front so later writes do not fail on a fresh checkout.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [10]:
# Load the data
train = pd.read_csv(f'{INPUT_DIR}/train.csv', index_col='id')
test = pd.read_csv(f'{INPUT_DIR}/test.csv', index_col='id')
# Stack the train rows and the test rows into one frame. ignore_index=True
# already produces a fresh 0..n-1 index, so no extra reset_index is needed.
df = pd.concat([train, test], ignore_index=True)

# fillna returns a new frame; assign it back so the fill is actually kept in df
# instead of only appearing in the cell output.
df = df.fillna({'thumbnail_url': 0})

df

Unnamed: 0,accommodates,amenities,bathrooms,bed_type,bedrooms,beds,cancellation_policy,city,cleaning_fee,description,...,longitude,name,neighbourhood,number_of_reviews,property_type,review_scores_rating,room_type,thumbnail_url,zipcode,y
0,6,"{TV,""Wireless Internet"",Kitchen,""Free parking ...",2.0,Real Bed,1.0,4.0,flexible,LA,t,My place is meant for family and a few friends...,...,-118.154761,The Penthouse,,1,Apartment,60.0,Private room,0,90804,138.0
1,2,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",1.0,Real Bed,1.0,1.0,strict,DC,t,This is a new listing for a lovely guest bedro...,...,-76.978190,Guest Bedroom in Brookland,Brookland,9,House,100.0,Private room,https://a0.muscache.com/im/pictures/e4d8b51f-6...,20018,42.0
2,2,"{TV,Internet,""Wireless Internet"",Kitchen,""Indo...",2.0,Real Bed,1.0,1.0,strict,NYC,t,We're looking forward to your stay at our apt....,...,-73.926240,Clean Modern Room in Lux Apt 1 Block From J Train,Bushwick,27,Apartment,83.0,Private room,https://a0.muscache.com/im/pictures/5ffecc9b-d...,,65.0
3,2,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",1.0,Real Bed,1.0,1.0,strict,SF,t,BEST CITY VIEWS - - ROOF DECK W/ BBQ & WiFi - ...,...,-122.411906,BEST views + reviews! 5/5 stars*****,Nob Hill,38,Apartment,95.0,Private room,0,94133,166.0
4,2,"{TV,Internet,""Wireless Internet"",""Air conditio...",1.0,Real Bed,1.0,1.0,strict,NYC,t,Charming Apartment on the upper west side of M...,...,-73.974691,Charming 1-bedroom - UWS Manhattan,Upper West Side,5,Apartment,100.0,Entire home/apt,https://a0.muscache.com/im/pictures/92879730/5...,10024,165.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74106,4,"{TV,Internet,""Wireless Internet"",""Air conditio...",1.0,Real Bed,2.0,2.0,strict,NYC,t,"The Greenhouse, located on Green Street, is a ...",...,-73.954892,Spacious 2BR Greenpoint Getaway,Greenpoint,0,Apartment,,Entire home/apt,https://a0.muscache.com/im/pictures/57338613/6...,11222,
74107,2,"{TV,""Wireless Internet"",""Air conditioning"",Kit...",1.0,Real Bed,2.0,1.0,flexible,Chicago,f,"Two bedroom, one bathroom with large dining/li...",...,-87.672018,Walk up Apartment in Lakeview/Wrigleyville,Lakeview,9,Apartment,90.0,Entire home/apt,0,60657,
74108,5,"{TV,""Wireless Internet"",""Air conditioning"",Kit...",1.5,Real Bed,2.0,2.0,flexible,Chicago,t,Happy Holidays! If you're looking for a big op...,...,-87.708087,Beautiful Logan Square Home,Avondale,0,House,,Entire home/apt,https://a0.muscache.com/im/pictures/361642af-e...,60618,
74109,2,"{Internet,""Wireless Internet"",""Air conditionin...",1.0,Real Bed,1.0,2.0,strict,NYC,t,This is a cozy one-bedroom apartment a few blo...,...,-73.947358,Charming 1 BR apartment east of Central Park,East Harlem,4,Apartment,95.0,Entire home/apt,https://a0.muscache.com/im/pictures/49c8e83f-d...,10029.0,


In [11]:
def amenities2onehot(frame=None):
   '''
   One-hot encode the `amenities` column.

   frame: DataFrame with an `amenities` column whose values are strings such
          as '{TV,"Wireless Internet",Kitchen}'. When omitted, the
          notebook-level `df` is used.

   returns:
      DataFrame with one row per record of `frame` (same order, default
      0..n-1 index) and one 0/1 column per distinct amenity. Columns are
      labelled 0..k-1 in order of first appearance of each amenity.

   side effect:
      `frame['amenities']` is replaced by the parsed lists of amenity names.
      Values that are already lists are kept as they are, so re-running the
      cell on an already processed frame does not fail.
   '''
   def shapeAmenity(x):
      '''
      shape amenities as a list of amenities for each record
      '''
      if isinstance(x, list):
         # already parsed by an earlier run
         return x
      if not isinstance(x, str):
         # missing value (NaN / None): treat as "no amenities"
         return []
      tmp_str = x.strip()
      # Remove only the enclosing braces. Slicing with [1:-2] also cut the last
      # character of an unquoted final amenity ('{TV,Kitchen}' -> 'Kitche').
      if tmp_str.startswith('{'):
         tmp_str = tmp_str[1:]
      if tmp_str.endswith('}'):
         tmp_str = tmp_str[:-1]
      tmp_str = tmp_str.replace('"', '')
      tmp_str = tmp_str.replace("'", "")
      # '{}' would otherwise give [''] and create an amenity with an empty name
      return [amenity for amenity in tmp_str.split(',') if amenity]

   def makeAmenityOnehot(x:list, amenity_index:dict) -> list:
      '''
      x: record of amenities formatted as a list
      amenity_index: maps every amenity name in the table to its column position

      description:
         execute onehot encoding for a single record
      '''
      category = [0] * len(amenity_index)
      for amenity in x:
         category[amenity_index[amenity]] = 1
      return category

   if frame is None:
      frame = df

   #format amenities column
   frame['amenities'] = frame['amenities'].map(shapeAmenity)

   #assign a column position to every amenity, in order of first appearance
   amenity_index = {}
   for amenities in frame['amenities']:
      for amenity in amenities:
         amenity_index.setdefault(amenity, len(amenity_index))

   #make onehot-encoded list
   tmp_onehot = [makeAmenityOnehot(amenities, amenity_index) for amenities in frame['amenities']]

   # Columns stay positional (0..k-1); list(amenity_index) gives the amenity
   # name that belongs to each position.
   amenity_category_df = pd.DataFrame(tmp_onehot)

   return amenity_category_df

# Build the amenity one-hot frame from the notebook-level df.
out_df = amenities2onehot()

In [12]:
# Persist the amenity one-hot frame as a pickle file.
pd.to_pickle(out_df, './output/amenities.pickle')

In [13]:
# NOTE(review): this import sits outside the notebook's import cell.
import pickle
# Write out_df to './output/amenities.pickle' with the standard-library pickle module.
# NOTE(review): the previous cell already pickles out_df to this same path, so this
# overwrites that file — one of the two writes looks redundant; confirm which to keep.
with open('./output/amenities.pickle', 'wb') as f:
   pickle.dump(out_df, f)