## Import required packages

In [4]:
import numpy as np
import pandas as pd
import json

## Open and load the JSON file

In [6]:
# Read the hotel dataset from the current working directory and parse it.
# The original literal '.\hotels.json' contained the invalid escape "\h" and
# only resolved on Windows; a bare relative name works on every platform.
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# instead of relying on the platform default.
with open('hotels.json', encoding='utf-8') as f:
    data = json.load(f)

## Extract field names from the JSON

In [17]:
# Gather the "label" of every entry under data["fields"], preserving order;
# these labels are used as the column names further down.
fields = [field_meta["label"] for field_meta in data["fields"]]
print(fields)

['HOTEL NAME', 'ADDRESS', 'STATE', 'PHONE', 'FAX', 'EMAIL ID', 'WEBSITE', 'TYPE', 'Rooms']


## Build a DataFrame from the JSON data

In [23]:
# Turn the records stored under data["data"] into a DataFrame whose columns
# are named by the labels collected in `fields`, then preview the first rows.
rows = data["data"]
df = pd.DataFrame(data=rows, columns=fields)
df.head()

Unnamed: 0,HOTEL NAME,ADDRESS,STATE,PHONE,FAX,EMAIL ID,WEBSITE,TYPE,Rooms
0,The Oberoi Cecil,"""Ambedkar ChowkChaura Maidan, Shimla - 171004,...",HIMACHAL PRADESH,0177-2804848,0177-2811024,reservations@oberoigroup.com,www.oberoihotels.com,Heritage Grand,75
1,The Lalit Grand Palace,"""Gupkar Road, Srinagar, JAMMU AND KASHMIR""",JAMMU AND KASHMIR,1942501001,1942501003,srinagar@thelalit.com,,Heritage Grand,112
2,Hotel Fateh Prakash Palace,"""The City Palace Complex, Udaipur - 313001, RA...",RAJASTHAN,0294-2528016-9,0294-2528006,mgrfpp@udaipur.hrhindia.com,www.hrhindia.com,Heritage Grand,30
3,Shiv Niwas Palace,"""The City Palace ComplexUdaipur, Udaipur, RAJA...",RAJASTHAN,2942528016,2942528006,maltidutta50@yahoo.com,,Heritage Grand,36
4,Savoy Hotel,"""77, Sylks Road, Ootacamund, TAMIL NADU""",TAMIL NADU,423244142,423443318,savoy.ooty@tajhotels.com,,Heritage Grand,40


## Inspect data types

In [24]:
# Display the dtype pandas assigned to each column of df.
df.dtypes

HOTEL NAME    object
ADDRESS       object
STATE         object
PHONE         object
FAX           object
EMAIL ID      object
WEBSITE       object
TYPE          object
Rooms         object
dtype: object

In [27]:
# Rooms was loaded with object dtype; convert it to integers so room counts
# can be compared numerically, then show the resulting dtype.
# NOTE(review): 'int' maps to a platform-dependent width (the recorded run
# produced int32) — confirm whether a fixed width is wanted.
rooms_as_int = df['Rooms'].astype('int')
df['Rooms'] = rooms_as_int
df['Rooms'].dtype

dtype('int32')

## Find the hotels with the most rooms in each type

In [63]:
# For every row, compute the largest room count within its TYPE group and
# flag the rows whose own count equals that maximum (ties are all kept).
# The string 'max' selects pandas' built-in group reduction; passing the
# Python builtin `max` to transform is deprecated in recent pandas releases.
max_rooms_per_type = df.groupby('TYPE')['Rooms'].transform('max')
result_idx = max_rooms_per_type == df['Rooms']

# Keep only the reporting columns and renumber the rows from zero.
# drop=True discards the old row labels rather than creating an "index"
# column that would be thrown away immediately.
result = (
    df.loc[result_idx, ['HOTEL NAME', 'STATE', 'TYPE', 'Rooms']]
    .reset_index(drop=True)
)

In [64]:
# (rows, columns) of the per-type winners table.
result.shape

(6, 4)

In [65]:
# Show up to the first six rows of the winners table.
result.head(6)

Unnamed: 0,HOTEL NAME,STATE,TYPE,Rooms
0,The Lalit Grand Palace,JAMMU AND KASHMIR,Heritage Grand,112
1,Noor Us Sabah Palace,MADHYA PRADESH,Heritage Classic,57
2,Jehan Numa Palace Hotel,MADHYA PRADESH,Heritage Basic,98
3,The Taj Mahal Palace & Tower,MAHARASHTRA,5 Star Deluxe,565
4,Novotel Hyderabad Airport,ANDHRA PRADESH,5 Star,305
5,Hotel Trident,MAHARASHTRA,4 Star,436


## Handling large files

In [68]:
# For large files that do not fit into memory, we have two options.
# One is to process the file in chunks and do the computation sequentially.
# The other is to distribute the data across several machines and do the computation in parallel.
# As this runs on a single machine, the implementation below follows the first approach.

In [67]:
CHUNK_SIZE = 100  # number of rows covered by each chunk

# Step through the rows in CHUNK_SIZE-sized strides and print the starting
# offset of every chunk.
# The original called len() with no argument, which raises TypeError.
# NOTE(review): df is assumed to be the object whose length was intended —
# confirm against the recorded offsets.
for chunk_start in range(0, len(df), CHUNK_SIZE):
    print(chunk_start)

0
100
200
