# Hbase in Python
## Requirements
- Hbase Installed
- Start Hbase: ./start-hbase.sh
    - This requires a running Zookeeper or, in standalone mode, the corresponding settings in the conf file
- Start the Hbase thrift server
    - ./hbase thrift start

## Util: Create Table: Creates Table if not already present

In [74]:
import happybase

# Host passed to happybase.Connection by every helper in this notebook
hostName='localhost'
# Name of the HBase table the helpers create, write to and read from
tableName='HBase-Test'
# The single column family created for the table
column_family_name='cerebral-cortex'
# Column qualifiers stored under the family (presumably accelerometer x/y/z values)
column_qualifier_name1='accx'
column_qualifier_name2='accy'
column_qualifier_name3='accz'

# Creating the Table in which we will insert the data
def CreateTable():
    """Create the HBase table ``tableName`` unless it already exists.

    Connects to ``hostName``, lists the existing tables and, when
    ``tableName`` is not among them, creates it with the single column
    family ``column_family_name`` using default family options.
    The connection is always closed before returning.
    """
    print("-"*200)
    print("CreateTable Called")
    connection = happybase.Connection(hostName)
    try:
        connection.open()
        tables = connection.tables()
        # The listing holds byte strings, so compare against the encoded name
        if tableName.encode() in tables:
            print("Table", tableName, "already exist")
            print("Current Tables in the Hbase:\n", tables)
        else:
            # Pass the list as a separate argument: concatenating it to a str
            # raises TypeError and would abort before the table is created
            print("Current Tables in the Hbase:\n", tables)
            connection.create_table(
                tableName,
                {
                    column_family_name: dict(),  # use defaults
                },
            )
            print('Table', tableName, 'created')
    finally:
        connection.close()

# Util: Inserts Data

In [163]:
# Inserts a single data point
# data=[TimeStamp,ax,ay,az] (all strings), TimeStamp is used as rowkey
def InsertData_Point(data):
    """Write one data point as a single row of ``tableName``.

    data: sequence ``[rowkey, ax, ay, az]`` of strings. ``data[0]`` becomes
    the row key; the other three values are stored under the three
    qualifiers of ``column_family_name``.
    """
    print("-"*200)
    #print("InsertData_Point Called")
    column_name1 = '{fam}:{qual}'.format(fam=column_family_name, qual=column_qualifier_name1)
    column_name2 = '{fam}:{qual}'.format(fam=column_family_name, qual=column_qualifier_name2)
    column_name3 = '{fam}:{qual}'.format(fam=column_family_name, qual=column_qualifier_name3)

    # HBase does not have any notion of data types;
    # all row keys, column names and column values are simply treated as raw byte strings.
    # This means that data must be converted to byte strings
    # in application before you pass it to HappyBase
    rowkey = data[0].encode('utf-8')
    ax = data[1].encode('utf-8')
    ay = data[2].encode('utf-8')
    az = data[3].encode('utf-8')

    connection = happybase.Connection(hostName)
    try:
        connection.open()
        table = connection.table(tableName)
        # Format of Put:
        # table.put(b'row-key', {b'family:qual1': b'value1',
        #                        b'family:qual2': b'value2'})
        # The values are already bytes after encode(), so no extra bytes() wrap
        table.put(rowkey, {column_name1: ax, column_name2: ay, column_name3: az})
    finally:
        # Release the Thrift connection even when the put fails
        connection.close()
    print("InsertData_Point Done")
    
# Insert a batch of data points
# datalist=list([rowkey,ax,ay,az]), every element a String
def InsertData_Batch(datalist):
    """Write several data points to ``tableName`` through a happybase batch.

    datalist: iterable of ``[rowkey, ax, ay, az]`` string quadruples. Each
    quadruple becomes one row whose key is ``rowkey`` and whose three values
    are stored under the three qualifiers of ``column_family_name``.
    """
    print("-"*200)
    #print("InsertData_Batch Called")
    column_name1 = '{fam}:{qual}'.format(fam=column_family_name, qual=column_qualifier_name1)
    column_name2 = '{fam}:{qual}'.format(fam=column_family_name, qual=column_qualifier_name2)
    column_name3 = '{fam}:{qual}'.format(fam=column_family_name, qual=column_qualifier_name3)

    connection = happybase.Connection(hostName)
    try:
        connection.open()
        table = connection.table(tableName)

        # batch_size is a threshold on the number of queued mutations (not
        # bytes): once it is reached the batch is sent to the HBase server.
        # Whatever is still queued is sent when the with-block exits.
        with table.batch(batch_size=1000) as b:
            for rowkey, ax, ay, az in datalist:
                #print(rowkey,ax,ay,az)
                b.put(
                    rowkey.encode('utf-8'),
                    {
                        column_name1: ax.encode('utf-8'),
                        column_name2: ay.encode('utf-8'),
                        column_name3: az.encode('utf-8'),
                    },
                )
    finally:
        # Release the Thrift connection even when a put fails
        connection.close()

    print("InsertData_Batch Done")

# Util: GetData

In [176]:
# rowkey: is String
def GetData(rowkey):
    """Fetch the row stored under ``rowkey`` in ``tableName`` and print it.

    rowkey: string row key; it is UTF-8 encoded before the lookup.
    The row is printed exactly as happybase returns it; nothing is returned.
    """
    print("-"*200)
    print("GetData Called")
    connection = happybase.Connection(hostName)
    try:
        connection.open()
        table = connection.table(tableName)
        row = table.row(rowkey.encode('utf-8'))
    finally:
        # Release the Thrift connection even when the lookup fails
        connection.close()
    print(row)

def GetData_list(rowkeylist):
    """Fetch several rows of ``tableName`` in one call and print each of them.

    rowkeylist: iterable of string row keys; each is UTF-8 encoded before
    the lookup. Every returned (key, data) pair is printed on its own line;
    nothing is returned.
    """
    print("-"*200)
    print("GetData_list Called")
    # Encode up front so a bad key fails before a connection is opened
    encoded_keys = [rowkey.encode('utf-8') for rowkey in rowkeylist]

    connection = happybase.Connection(hostName)
    try:
        connection.open()
        table = connection.table(tableName)
        rows = table.rows(encoded_keys)
        for key, data in rows:
            print(key, data)
    finally:
        # Release the Thrift connection even when the lookup fails
        connection.close()

### Simple Inserting Point Data and Querying it

In [177]:
# Set up the table, write one row under the key 'rowKey', then read that row back
CreateTable()
InsertData_Point(['rowKey','2.45','1.32','-9.43'])
GetData('rowKey')

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
CreateTable Called
Table HBase-Test already exist
Current Tables in the Hbase:
 [b'Geomesa', b'Geomesa_GeoMesaTable_id', b'Geomesa_GeoMesaTable_z2', b'Geomesa_GeoMesaTable_z3', b'Geomesa_MetroInsight_id', b'Geomesa_MetroInsight_z2', b'Geomesa_MetroInsight_z3', b'HBase-Test']
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
InsertData_Point Done
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
GetData Called
{b'cerebral-cortex:accz': b'-9.43', b'cerebral-cortex:accy': b'1.32', b'cerebral-cor

### Insertion using Batch in Hbase


In [178]:
# Write four rows (rowKey1..rowKey4) through a single batch
InsertData_Batch([['rowKey1','2.45','1.32','-9.43'],
                 ['rowKey2','3.45','2.32','-10.43'],
                 ['rowKey3','4.45','3.32','-11.43'],
                 ['rowKey4','5.45','4.32','-12.43']])

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
InsertData_Batch Done


### Querying a Batch of Data

In [181]:
# Read back the four rows written by the batch insert above
GetData_list(['rowKey1',
             'rowKey2',
             'rowKey3',
             'rowKey4'])

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
GetData_list Called
b'rowKey1' {b'cerebral-cortex:accz': b'-9.43', b'cerebral-cortex:accy': b'1.32', b'cerebral-cortex:accx': b'2.45'}
b'rowKey2' {b'cerebral-cortex:accz': b'-10.43', b'cerebral-cortex:accy': b'2.32', b'cerebral-cortex:accx': b'3.45'}
b'rowKey3' {b'cerebral-cortex:accz': b'-11.43', b'cerebral-cortex:accy': b'3.32', b'cerebral-cortex:accx': b'4.45'}
b'rowKey4' {b'cerebral-cortex:accz': b'-12.43', b'cerebral-cortex:accy': b'4.32', b'cerebral-cortex:accx': b'5.45'}


# BenchMarking:
  - Create a synthetic dataset
  - Batch Insert the data
  - Perform different kinds of queries.

### Create a synthetic dataset

In [221]:
# Create a CSV file with random data points

# Data format: 'timestamp', 'acc_x','acc_y','acc_z'
# acc_x in [0,10], acc_y in [0,10], acc_z in [0,10]
# timestamp: within one year starting 2017-01-01 (2017 to 2018)
import csv
import random
import datetime 
import time

# name: name of the CSV file to write
# count: number of data items (rows) to generate
def CreateData(name,count):
    """Write ``count`` random data points to the CSV file ``name``.

    The file starts with the header ``timestamp,acc_x,acc_y,acc_z``. Each row
    holds three values drawn uniformly from [0, 10] and a timestamp picked
    uniformly from the 365 days following 2017-01-01 00:00:00, stored as the
    float returned by ``time.mktime`` (i.e. interpreted in the local timezone).
    An existing file of the same name is overwritten.
    """
    # newline='' lets the csv module control line endings itself; without it
    # every row is followed by a blank line on platforms that translate '\n'
    with open(name, 'w', newline='') as csvfile:
        fieldnames = ['timestamp', 'acc_x','acc_y','acc_z']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        #datetime(year, month, day[, hour[, minute[, second[, microsecond[,tzinfo]]]]])
        startDate = datetime.datetime(2017, 1, 1,0,0,0)
        seconds_per_year=365*24*60*60

        for _ in range(count):
            acc_x=random.uniform(0, 10)
            acc_y=random.uniform(0, 10)
            acc_z=random.uniform(0, 10)
            Date = startDate + datetime.timedelta(seconds=random.uniform(0, seconds_per_year))
            # timetuple() drops the microseconds, so the timestamp is whole seconds
            TimeStamp = time.mktime(Date.timetuple())
            writer.writerow({'timestamp': TimeStamp, 'acc_x': acc_x,'acc_y': acc_y,'acc_z': acc_z})
            
# Generate the synthetic dataset: 10000 random rows written to test.csv
CreateData('test.csv',10000)

### Insert the Synthetic DataSet into the Hbase

In [224]:
def InsertData_BenchMark(name):
    print()