In [3]:
import pandas as pd
import numpy
import profile
from models import engine

dtype = {
    '交易id': numpy.str,
    '資料日期': numpy.str,
    '資料時間': numpy.str,
    '餐別帶': numpy.str,
    '縣市別': numpy.str,
    '店舖代號': numpy.uint32,
    '主商圈': numpy.str,
    '品號-品名稱': numpy.str,
    '群號-群名稱': numpy.str,
    '單品名稱': numpy.str,
    '銷售數量': numpy.uint16,
    '銷售單價': numpy.float,
    '交易金額': numpy.float
}

USE_COLUMNS = ['交易id', '資料日期', '資料時間', '餐別帶', '縣市別', '店舖代號', '主商圈', '品號-品名稱',
               '群號-群名稱', '單品名稱', '銷售數量', '銷售單價', '交易金額']
PARSE_DATES = {
    '資料日期與時間': [
        '資料日期',
        '資料時間'
    ]
}

def query_location_func(df):
    return df['縣市別'] == '台中市'

def query_time_phase_func(df):
    return df['餐別帶'] == '早餐時間帶'

def query_both_func(df):
    return (query_location_func(df)) & (query_time_phase_func(df))

def read_from_csv(limit_records, query_func):
    records = pd.read_csv('customer_data(utf-8).csv',
                                 index_col=1,
                                 nrows=limit_records,
                                 usecols=USE_COLUMNS,
                                 dtype=dtype,
                                 parse_dates=PARSE_DATES,
                                 )
    for record in records:
        pass
    return records

def read_from_database(query):
    records = pd.read_sql_query(query, con=engine, chunksize=40000)
    for record in records:
        pass
    return records

read_query = """
SELECT *
FROM transaction_item
JOIN transactions ON transaction_item.transaction_id = transactions.id
JOIN items ON items.id = transaction_item.item_id;
"""

query_location = """
SELECT *
FROM transaction_item
JOIN transactions ON transaction_item.transaction_id = transactions.id
JOIN items ON items.id = transaction_item.item_id
WHERE transactions.location = '台中市';
"""

query_location_time_phase = """
SELECT *
FROM transaction_item
JOIN transactions ON transaction_item.transaction_id = transactions.id
JOIN items ON items.id = transaction_item.item_id
WHERE transactions.location = '台中市' AND transactions.time_phase = '午餐時間帶'
LIMIT 1000;
"""

In [5]:
profiler = profile.Profile()
profiler.runcall(read_from_csv, 1000, query_both_func)
profiler.print_stats()

         8570 function calls (8502 primitive calls) in 0.048 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      9/3    0.000    0.000    0.002    0.001 :0(__import__)
        7    0.000    0.000    0.000    0.000 :0(__new__)
       36    0.000    0.000    0.000    0.000 :0(acquire_lock)
        5    0.000    0.000    0.000    0.000 :0(all)
       24    0.000    0.000    0.000    0.000 :0(allocate_lock)
        2    0.000    0.000    0.000    0.000 :0(any)
       32    0.000    0.000    0.000    0.000 :0(append)
        6    0.000    0.000    0.000    0.000 :0(arange)
       52    0.000    0.000    0.000    0.000 :0(array)
        3    0.000    0.000    0.000    0.000 :0(array_equivalent_object)
        1    0.000    0.000    0.000    0.000 :0(array_to_datetime)
        4    0.000    0.000    0.000    0.000 :0(callable)
        3    0.000    0.000    0.002    0.001 :0(clean_index_list)
        1    0.000    0.000    0.000

In [7]:
profiler = profile.Profile()
profiler.runcall(read_from_database, query_location_time_phase)
profiler.print_stats()

         4156 function calls (4124 primitive calls) in 0.027 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      3/1    0.000    0.000    0.001    0.001 :0(__import__)
       19    0.000    0.000    0.000    0.000 :0(__new__)
        1    0.000    0.000    0.000    0.000 :0(_distill_params)
        2    0.000    0.000    0.000    0.000 :0(_is_owned)
        2    0.000    0.000    0.000    0.000 :0(acquire)
       12    0.000    0.000    0.000    0.000 :0(acquire_lock)
        1    0.000    0.000    0.000    0.000 :0(add)
        8    0.000    0.000    0.000    0.000 :0(allocate_lock)
        2    0.000    0.000    0.000    0.000 :0(any)
       63    0.000    0.000    0.000    0.000 :0(append)
        5    0.000    0.000    0.000    0.000 :0(arange)
       58    0.000    0.000    0.000    0.000 :0(array)
        1    0.000    0.000    0.000    0.000 :0(array_to_datetime)
        1    0.000    0.000    0.001    0.001 :0(cle

In [18]:
def read_by_chunk(chunksize):
    records = pd.read_csv('customer_data(utf-8).csv',
                                 index_col=1,
                                 chunksize=chunksize,
                                 usecols=USE_COLUMNS,
                                 dtype=dtype,
                                 parse_dates=PARSE_DATES,
                                 )
    for record in records:
        pass
    return records

def read():
    records = pd.read_csv('customer_data(utf-8).csv',
                                 index_col=1,
                                 usecols=USE_COLUMNS,
                                 dtype=dtype,
                                 parse_dates=PARSE_DATES,
                                 )
    return records   

In [20]:
profiler = profile.Profile()
profiler.runcall(read_by_chunk, 40000)
profiler.print_stats()

         20536383 function calls (20529191 primitive calls) in 103.315 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
  759/253    0.004    0.000    0.165    0.001 :0(__import__)
      632    0.002    0.000    0.002    0.000 :0(__new__)
     3036    0.006    0.000    0.006    0.000 :0(acquire_lock)
      630    0.003    0.000    0.048    0.000 :0(all)
     2024    0.005    0.000    0.005    0.000 :0(allocate_lock)
      252    0.001    0.000    0.006    0.000 :0(any)
     3907    0.011    0.000    0.011    0.000 :0(append)
      756    0.003    0.000    0.003    0.000 :0(arange)
     6552    0.134    0.000    0.134    0.000 :0(array)
      378    0.003    0.000    0.003    0.000 :0(array_equivalent_object)
      126    1.058    0.008    1.058    0.008 :0(array_to_datetime)
      129    0.000    0.000    0.000    0.000 :0(callable)
      378    0.009    0.000    0.214    0.001 :0(clean_index_list)
        1    0.000    0.00

In [19]:
profiler = profile.Profile()
profiler.runcall(read)
profiler.print_stats()

         20035587 function calls (20035520 primitive calls) in 99.915 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      9/3    0.000    0.000    0.002    0.001 :0(__import__)
        7    0.000    0.000    0.000    0.000 :0(__new__)
       36    0.000    0.000    0.000    0.000 :0(acquire_lock)
        5    0.000    0.000    0.000    0.000 :0(all)
       24    0.000    0.000    0.000    0.000 :0(allocate_lock)
        2    0.000    0.000    0.000    0.000 :0(any)
       32    0.000    0.000    0.000    0.000 :0(append)
        6    0.000    0.000    0.000    0.000 :0(arange)
       52    0.094    0.002    0.094    0.002 :0(array)
        3    0.000    0.000    0.000    0.000 :0(array_equivalent_object)
        1    1.051    1.051    1.051    1.051 :0(array_to_datetime)
        4    0.000    0.000    0.000    0.000 :0(callable)
        3    0.000    0.000    0.002    0.001 :0(clean_index_list)
        1    0.000    0.000