In [1]:
# Draw matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension in mode 2, so edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas_profiling as pdp
import gc, os, csv, subprocess, pathlib, sqlite3, logging, time, psycopg2, \
    warnings
from sqlalchemy import create_engine, MetaData, Table, Column, INTEGER, \
    TEXT, FLOAT, VARCHAR, CHAR, NUMERIC, REAL, func, TIMESTAMP, distinct
from sqlalchemy.orm import scoped_session, sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.ext.automap import automap_base
from contextlib import closing
from plotnine import *
from collections import defaultdict
import seaborn as sns

# Plot text uses the IPAGothic font family (presumably so Japanese labels
# render correctly -- TODO confirm the font is installed on the target machine).
plt.rcParams["font.family"] = "IPAGothic"
# Apply the 'once' warning filter so repeated warnings do not flood the output.
warnings.filterwarnings('once')

# Project paths, all relative to the notebook's working directory.
PATH_DB = pathlib.Path("../data/interim/suzuki_skilltransfer.sqlite3")
PATH_INTERIM_DATA = pathlib.Path("../data/interim/")
# Raw-data root. The CSV dumps loaded later sit directly in this directory;
# the former value "../data/raw/dbhaa0001_tbhaa0006/" does not appear in the
# directory listing shown further down in this notebook.
PATH_RAW_DATA = pathlib.Path("../data/raw")
PATH_REPORTS = pathlib.Path("../reports/")

In [13]:
# Database plumbing: a single engine on the project SQLite file, with the
# session factory, metadata and automap base that hang off it.
engine = create_engine(f'sqlite:///{PATH_DB.resolve().as_posix()}')
Session = sessionmaker(bind=engine)  # session factory bound to the engine

meta = MetaData()  # starts empty; the table-creation cell registers tables on it

Base = automap_base()
Base.prepare(engine, reflect=True)  # reflect the tables currently in the database

In [14]:
# Table-format workbook. sheet_name=None loads every sheet at once; the result
# is iterated below with .items() as (sheet name, DataFrame) pairs.
format_xlsx = pd.read_excel(
    '../data/external/20190528_data_format_warranty_production.xlsx',
    sheet_name=None,
)

  for elem in self.tree.iter() if Element_has_iter else self.tree.getiterator():


In [15]:
# Create (or re-create) one SQLite table per sheet of the format workbook.
#
# Every sheet row describes one column through the cells "name",
# "Type in sqlite3", "Primary Key" and "Not Null"; a blank (NaN) cell in the
# last two means the flag is not set.
table_list = {}
for sheet_name, sheet in format_xlsx.items():
    # NOTE(review): eval() executes whatever text is in the "Type in sqlite3"
    # cell (expected to be the name of a SQLAlchemy type imported at the top,
    # e.g. TEXT). Only acceptable while the workbook is a trusted project file.
    columns = [Column(row["name"], eval(row["Type in sqlite3"]),
                      primary_key=not pd.isna(row["Primary Key"]),
                      nullable=pd.isna(row["Not Null"]))
               for _, row in sheet.iterrows()]

    # Drop the table as it was reflected from the database. The lookup goes
    # through the reflected metadata rather than Base.classes because automap
    # only generates classes for tables that have a primary key; a key-less
    # table would otherwise be skipped and keep its old schema.
    old_table = Base.metadata.tables.get(sheet_name)
    if old_table is not None:
        print(f'Deleting old {sheet_name} table')
        old_table.drop(engine, checkfirst=True)

    # Forget a definition left on `meta` by an earlier run of this cell;
    # without this, Table() refuses to define the same table name twice.
    stale_definition = meta.tables.get(sheet_name)
    if stale_definition is not None:
        meta.remove(stale_definition)
    table_list[sheet_name] = Table(sheet_name, meta, *columns)

# Emit CREATE TABLE for every table registered on `meta`.
meta.create_all(engine)

Deleting old warranty table
Deleting old fcok table


In [5]:
# Raw-data root directory; the cells below list it and read the CSV dumps from it.
PATH_RAW_DATA = pathlib.Path("../data/raw")

In [6]:
# List the entries of the raw-data directory.
os.listdir(PATH_RAW_DATA)

['warranty_fcok',
 'CAN',
 '.DS_Store',
 '.gitkeep',
 'tbhaa_all_201907030445.csv',
 'dbhaa_all_201907030443.csv',
 'FTIR']

In [7]:
# Peek at the first lines of the tbhaa dump: the header row and a few records.
!head ../data/raw/tbhaa_all_201907030445.csv

﻿"V_BASE_CD","V_PROD_DIV","V_VIN","V_ENGINE_NO","V_MOTOR_NO","V_MISSON_NO","V_KEY_NO","V_FCOK","V_FCOK_TIME","V_ABOK","V_ABOK_TIME","V_PROD_MODEL_CD","V_SALES_MODEL_CD","V_SALES_MODEL_NM","V_SPEC","V_COLOR","V_PLANT_CD","V_DIST_CD","V_BRANCH_CD","V_SHIPPING_DATE","V_SHIPPING_FCOK","V_SHIPPING_ABOK","V_INVOICE_NO","V_SALES_NO","V_CASE_NO","V_SRC_DIV","V_CRT_TIME_STAMP"
"100100",A,BA43A-111775,A407-111815,,,,"20170111","174400","20170111","110232",XD074,FB50K8,,"000",YHV,,,,,,,,,,"1","18-01-31 10:51:00.986668000"
"100100",A,BA43A-111776,A407-111814,,,,"20170118","094600","20170111","111310",XD074,FB50K8,,"000",YHV,,,,,,,,,,"1","18-01-31 10:51:03.350036000"
"100100",A,BA43A-111777,A407-111812,,,,"20170111","174500","20170111","112238",XD074,FB50K8,,"000",YHV,,,,,,,,,,"1","18-01-31 10:51:00.988499000"
"100100",A,BA43A-111778,A407-111811,,,,"20170117","131000","20170111","113314",XD074,FB50K8,,"000",YHV,,,,,,,,,,"1","18-01-31 10:51:03.089003000"
"100100",A,BA43A-111779,A407-111809,,,,"

In [20]:
# Show the header of the tbhaa dump as csv.reader yields it when the file is
# decoded as plain UTF-8: the file starts with a byte-order mark, so the first
# column name comes back as '\ufeff"V_BASE_CD"' instead of 'V_BASE_CD'.
# The header is read right here so that this cell runs on a fresh kernel and
# does not rely on `col_str` being left behind by the import cell further down.
with open(PATH_RAW_DATA / "tbhaa_all_201907030445.csv",
          encoding="utf-8", newline="") as f:
    col_str = ",".join(next(csv.reader(f)))
col_str

'\ufeff"V_BASE_CD",V_PROD_DIV,V_VIN,V_ENGINE_NO,V_MOTOR_NO,V_MISSON_NO,V_KEY_NO,V_FCOK,V_FCOK_TIME,V_ABOK,V_ABOK_TIME,V_PROD_MODEL_CD,V_SALES_MODEL_CD,V_SALES_MODEL_NM,V_SPEC,V_COLOR,V_PLANT_CD,V_DIST_CD,V_BRANCH_CD,V_SHIPPING_DATE,V_SHIPPING_FCOK,V_SHIPPING_ABOK,V_INVOICE_NO,V_SALES_NO,V_CASE_NO,V_SRC_DIV,V_CRT_TIME_STAMP'

In [22]:
%%time
# SQLAlchemy is awkward for inserting large amounts of data, so the rows are
# loaded with the sqlite3 module directly.
error_row = []  # list for recording rows with errors (nothing in this cell appends to it)

for filename in [file for file in os.listdir(PATH_RAW_DATA) if file.endswith(".csv")]:
    start_time = time.time()
    print(f"proccesing {filename}...", end="")
    # The file-name prefix selects the target table:
    # "dbhaa..." -> warranty, anything else -> fcok.
    table_name = "warranty" if filename[:5] == "dbhaa" else "fcok"

    # utf-8-sig strips a leading byte-order mark when there is one (the tbhaa
    # dump has it) and reads plain UTF-8 otherwise, so the header no longer
    # needs a hard-coded replacement. newline="" is what the csv module asks
    # for so that newlines inside quoted fields are parsed correctly.
    with open(PATH_RAW_DATA / filename, encoding="utf-8-sig", newline="") as f, \
            closing(sqlite3.connect(PATH_DB.as_posix())) as conn:
        reader = csv.reader(f)
        header = next(reader, None)
        if header is None:
            # Completely empty file: nothing to insert.
            print("empty file, skipped.")
            continue

        # Column list and placeholders are both derived from the header, so
        # they always agree with the rows that follow it. Column names are
        # double-quoted because identifiers cannot be bound as parameters.
        col_str = ",".join('"' + name.replace('"', '""') + '"' for name in header)
        question_str = ",".join(["?"] * len(header))  # ex. ?,?,?,...,?
        sql = f'insert into {table_name} ({col_str}) values ({question_str})'

        c = conn.cursor()
        try:
            for i, val in enumerate(reader, start=1):
                # Empty CSV fields are stored as NULL.
                c.execute(sql, [x if x != "" else None for x in val])
                # For speed, commit only once every 100,000 rows.
                if i % 100_000 == 0:
                    conn.commit()
        except KeyboardInterrupt:
            # Interrupting stops this file only: the rows inserted so far are
            # committed below and the loop carries on with the next file.
            # Any other exception propagates without the final commit.
            print("interrupted, keeping rows inserted so far...", end="")
        conn.commit()
    print(f"Done. {time.time() - start_time: .1f} s elappsed.")

proccesing tbhaa_all_201907030445.csv...Done.  75.0 s elappsed.
proccesing dbhaa_all_201907030443.csv...Done.  17.1 s elappsed.
CPU times: user 1min 28s, sys: 2.53 s, total: 1min 30s
Wall time: 1min 32s


In [23]:
# Preview the first five rows of the warranty table.
pd.read_sql("select * from warranty limit 5;", engine)

Unnamed: 0,W_KEY,W_BASE_CD,W_PROD_BASE_CD,W_FIELD_CLAIM_DIV,W_PROD_DIV,W_DMEX_DIV,W_PROCESS_MONTH,W_DIST_CD,W_DIST_COUNTRY_CD,W_DIST_ISSUE_NO,...,W_SUBLET_AMT,W_SHIPPING_AMT,W_TOTAL_AMT,W_CFC_COMPANY_CD,W_CFC_RATE,W_CFC_AMT,W_FC_TO_FAILURE,W_SALES_TO_FAILURE,W_SB_NO,W_CRT_TIME_STAMP
0,WRAEJP201702A000001,100100,100100,2,A,1,201702,2505A,AE,00A10709,...,0.0,679.0,4145.0,P58X,0,0.0,107,102,,17-08-02 18:34:49.631885000
1,WRAEJP201702C000001,100100,100100,2,C,1,201702,2505C,AE,00MB2189,...,0.0,0.0,1358.0,3223,100,1358.0,39,36,,17-08-02 18:38:14.568529000
2,WRAEJP201702C000002,100100,100100,2,C,1,201702,2505C,AE,00MB2188,...,0.0,0.0,1358.0,3223,100,1358.0,39,36,,17-08-02 18:38:14.570739000
3,WRAEJP201702C000003,100100,100100,1,C,1,201702,2505C,AE,00MB0864,...,0.0,287.0,5660.0,3962,50,2656.0,20,17,,17-08-02 18:38:15.016027000
4,WRAEJP201702C000004,100100,100100,1,C,1,201702,2505C,AE,0MB2252A,...,0.0,1171.0,9152.0,3479,50,3674.0,28,23,,17-08-02 18:38:16.974580000


In [24]:
# Preview the first five rows of the fcok table.
pd.read_sql("select * from fcok limit 5;", engine)

Unnamed: 0,V_BASE_CD,V_PROD_DIV,V_VIN,V_ENGINE_NO,V_MOTOR_NO,V_MISSON_NO,V_KEY_NO,V_FCOK,V_FCOK_TIME,V_ABOK,...,V_DIST_CD,V_BRANCH_CD,V_SHIPPING_DATE,V_SHIPPING_FCOK,V_SHIPPING_ABOK,V_INVOICE_NO,V_SALES_NO,V_CASE_NO,V_SRC_DIV,V_CRT_TIME_STAMP
0,100100,A,BA43A-111775,A407-111815,,,,20170111,174400,20170111,...,,,,,,,,,1,18-01-31 10:51:00.986668000
1,100100,A,BA43A-111776,A407-111814,,,,20170118,94600,20170111,...,,,,,,,,,1,18-01-31 10:51:03.350036000
2,100100,A,BA43A-111777,A407-111812,,,,20170111,174500,20170111,...,,,,,,,,,1,18-01-31 10:51:00.988499000
3,100100,A,BA43A-111778,A407-111811,,,,20170117,131000,20170111,...,,,,,,,,,1,18-01-31 10:51:03.089003000
4,100100,A,BA43A-111779,A407-111809,,,,20170111,193400,20170111,...,,,,,,,,,1,18-01-31 10:51:00.990269000


In [25]:
# Number of rows in the warranty table.
pd.read_sql("select count(*) from warranty;", engine)

Unnamed: 0,count(*)
0,805134


In [26]:
# Number of rows in the fcok table.
pd.read_sql("select count(*) from fcok;", engine)

Unnamed: 0,count(*)
0,3679940
