### Imports

In [1]:
# Stretch the notebook container to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
# Suggestions:
#     * Send the file over - uploaded it to GitHub
#     * Send data to Postgres in the AWS cloud - remote database
#     * Add links to where the data is available
#     * Create a notebook with decorators for interactivity - present if possible

In [3]:
## imports
from __future__ import print_function  # __future__ imports must come before any other statement

import os
import sys

import numpy as np
import pandas as pd
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual

sys.path.insert(0, "../")  # for the utility module
from utility import converter, exporter, missing

### Setup

#### CSV, Numpy, Parquet

In [21]:
## Edit this cell to choose which file is loaded and where the converted copy is written.
## Formats: csv, numpy, parquet
FILE_NAME, FILE_FORMAT = "../data/new_merchant_transactions.csv", "csv"
NEW_FILE_NAME, NEW_FILE_FORMAT = "../data/new_merchant_transactions.npy", "numpy"

In [22]:
# Load the source dataset. FILE_NAME and FILE_FORMAT come from the configuration cell;
# the cells below treat the result as a pandas DataFrame.
data = exporter.load_file(FILE_NAME, FILE_FORMAT)


#### Database - Postgres

In [23]:
# Connection settings for the optional Postgres source.
# Leave table_name empty to keep the file-based `data`; the read cell only
# queries the database when table_name is non-empty.
table_name = ""
username = "sahil"
database = "spark_demo_db"
# Never store the password in the notebook: it is read from the PGPASSWORD
# environment variable and falls back to an empty string when unset.
password = os.environ.get("PGPASSWORD", "")
host = "localhost"

In [24]:
## Optionally replace `data` with the contents of a Postgres table.
from sqlalchemy import create_engine

database_requested = table_name != ""
if database_requested:
    # Build the connection URL from the settings defined in the previous cell.
    url_template = 'postgresql://{}:{}@{}:5432/{}'
    connection_url = url_template.format(username, password, host, database)
    engine = create_engine(connection_url)
    data = pd.read_sql_table(table_name, engine)

### Basic Sanity Check
* Convert column to lower case - replace spaces with underscore
* Shape of dataframe
* Column and their data types
* Missing value counts
* General statistics about the data

In [25]:
# Normalise every column label: lower-case it, trim surrounding whitespace,
# then swap the remaining spaces for underscores.
data.columns = data.columns.map(
    lambda label: label.lower().strip().replace(" ", "_")
)

In [26]:
## Pretty-print the column names in alphabetical order
from pprint import pprint

alphabetical_columns = sorted(data.columns)
pprint(alphabetical_columns)

['authorized_flag',
 'card_id',
 'category_1',
 'category_2',
 'category_3',
 'city_id',
 'installments',
 'merchant_category_id',
 'merchant_id',
 'month_lag',
 'purchase_amount',
 'purchase_date',
 'state_id',
 'subsector_id']


In [27]:
## Shape of the dataframe as (rows, columns)
data.shape

(1963031, 14)

In [28]:
def dataframe_dtypes(data):
    """Summarise the dtype of every column in ``data``.

    Returns a one-column DataFrame named ``datatype``, indexed by column
    name and sorted alphabetically by that index.
    """
    dtype_table = data.dtypes.to_frame(name="datatype")
    return dtype_table.sort_index()
# Show the dtype of each column as loaded, before any type conversion.
dataframe_dtypes(data)

Unnamed: 0,datatype
authorized_flag,object
card_id,object
category_1,object
category_2,float64
category_3,object
city_id,int64
installments,int64
merchant_category_id,int64
merchant_id,object
month_lag,int64


In [29]:
## Preview the first five rows of the loaded data
data.head(5)

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_415bb3a509,107,N,1,B,307,M_ID_b0c793002c,1,-0.557574,2018-03-11 14:57:36,1.0,9,19
1,Y,C_ID_415bb3a509,140,N,1,B,307,M_ID_88920c89e8,1,-0.56958,2018-03-19 18:53:37,1.0,9,19
2,Y,C_ID_415bb3a509,330,N,1,B,507,M_ID_ad5237ef6b,2,-0.551037,2018-04-26 14:08:44,1.0,9,14
3,Y,C_ID_415bb3a509,-1,Y,1,B,661,M_ID_9e84cda3b1,1,-0.671925,2018-03-07 09:43:21,,-1,8
4,Y,C_ID_ef55cf8d4b,-1,Y,1,B,166,M_ID_3c86fa3831,1,-0.659904,2018-03-22 21:07:53,,-1,29


In [30]:
## TODO: add more description of how this was achieved.
## TODO: spend time understanding how decorators work - object-oriented Python, TDD.
@interact
def show_articles_more_than(column=sorted(data.columns)):
    """Preview a single column of ``data``.

    The default for ``column`` is the alphabetically sorted list of column
    names, which ``interact`` exposes as the set of selectable values.

    Returns the first 15 entries of the selected column.
    """
    selected_values = data[column]
    return selected_values.head(15)

### Convert to correct type

In [31]:
# Ask the converter helper to treat purchase_date as a date column.
# The return value is discarded, so the helper presumably modifies `data`
# in place - TODO confirm in utility.converter.
converter.convert_to_date(data, ["purchase_date"])
# Category conversion is currently disabled:
# converter.convert_to_category(data, ["authorized_flag", "category_1", "category_3", 'state_id'])

In [32]:
# Preview the first five rows again after the type conversion step.
data.head(5)

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_415bb3a509,107,N,1,B,307,M_ID_b0c793002c,1,-0.557574,2018-03-11 14:57:36,1.0,9,19
1,Y,C_ID_415bb3a509,140,N,1,B,307,M_ID_88920c89e8,1,-0.56958,2018-03-19 18:53:37,1.0,9,19
2,Y,C_ID_415bb3a509,330,N,1,B,507,M_ID_ad5237ef6b,2,-0.551037,2018-04-26 14:08:44,1.0,9,14
3,Y,C_ID_415bb3a509,-1,Y,1,B,661,M_ID_9e84cda3b1,1,-0.671925,2018-03-07 09:43:21,,-1,8
4,Y,C_ID_ef55cf8d4b,-1,Y,1,B,166,M_ID_3c86fa3831,1,-0.659904,2018-03-22 21:07:53,,-1,29


In [33]:
# Re-check the column dtypes after the type conversion step.
dataframe_dtypes(data)

Unnamed: 0,datatype
authorized_flag,object
card_id,object
category_1,object
category_2,float64
category_3,object
city_id,int64
installments,int64
merchant_category_id,int64
merchant_id,object
month_lag,int64


### Missing Value Statistics

In [34]:
# Per-column missing-value summary from the utility module; the rendered table
# has a `missing` count and a `percent_missing` column for each data column.
missing.missing_stats(data)

Unnamed: 0,missing,percent_missing
category_2,111745,5.692473
category_3,55922,2.848758
merchant_id,26216,1.335486
authorized_flag,0,0.0
card_id,0,0.0
city_id,0,0.0
category_1,0,0.0
installments,0,0.0
merchant_category_id,0,0.0
month_lag,0,0.0


### General Statistics

In [35]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,city_id,installments,merchant_category_id,month_lag,purchase_amount,category_2,state_id,subsector_id
count,1963031.0,1963031.0,1963031.0,1963031.0,1963031.0,1851286.0,1963031.0,1963031.0
mean,134.3867,0.6829643,430.9701,1.476515,-0.550969,2.197841,10.88067,25.97624
std,101.5152,1.584069,246.3385,0.4994483,0.6940043,1.528125,6.038542,10.12908
min,-1.0,-1.0,-1.0,1.0,-0.7468928,1.0,-1.0,-1.0
25%,69.0,0.0,278.0,1.0,-0.7166294,1.0,9.0,19.0
50%,110.0,1.0,367.0,1.0,-0.6748406,1.0,9.0,29.0
75%,212.0,1.0,683.0,2.0,-0.5816162,3.0,15.0,34.0
max,347.0,999.0,891.0,2.0,263.1575,5.0,24.0,41.0


### Reduce Memory Usage if possible

In [36]:
# Rebind `data` to the frame returned by the memory-reduction helper.
# NOTE(review): the dtype listing after this step shows category_2 stored as
# float16 - confirm the reduced precision is acceptable downstream.
data = exporter.reduce_mem_usage(data)

Memory usage of dataframe is 209.67 MB
Memory usage after optimization is: 114.20 MB
Decreased by 45.5%


In [37]:
# Inspect the column dtypes after the memory-reduction step.
dataframe_dtypes(data)

Unnamed: 0,datatype
authorized_flag,object
card_id,object
category_1,object
category_2,float16
category_3,object
city_id,int16
installments,int16
merchant_category_id,int16
merchant_id,object
month_lag,int8


### Export file to the required format

In [38]:
# Hand the processed frame to the export helper, together with the target path
# and format chosen in the configuration cell (NEW_FILE_NAME / NEW_FILE_FORMAT).
exporter.save_file(data, NEW_FILE_NAME, NEW_FILE_FORMAT)