# Imports

In [170]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn import preprocessing

In [171]:
# loading data
data_frame = pd.read_csv(r'./DATA/H2HBABBA3195.csv')

In [172]:
data_frame.head()

Unnamed: 0,business_code,cust_number,name_customer,clear_date,buisness_year,doc_id,posting_date,document_create_date,document_create_date.1,due_in_date,invoice_currency,document type,posting_id,area_business,total_open_amount,baseline_create_date,cust_payment_terms,invoice_id,isOpen
0,U001,200779906,BOZZU corporation,2019-03-28 00:00:00,2019.0,1928941000.0,2019-03-12,20190310,20190312,20190327.0,USD,RV,1.0,,44468.83,20190312.0,NAA8,1928941000.0,0
1,U001,200769623,WAL-MAR associates,2019-04-03 00:00:00,2019.0,1929015000.0,2019-03-24,20190323,20190324,20190408.0,USD,RV,1.0,,4013.75,20190324.0,NAH4,1929015000.0,0
2,U001,200772670,ASSOCIAT corp,2019-05-22 00:00:00,2019.0,1929268000.0,2019-05-06,20190506,20190506,20190521.0,USD,RV,1.0,,77909.09,20190506.0,NAU5,1929268000.0,0
3,U001,100044041,DEF. F co,2020-03-02 00:00:00,2020.0,1930548000.0,2020-02-20,20200220,20200220,20200310.0,USD,RV,1.0,,2459.37,20200216.0,NAM4,1930548000.0,0
4,U001,200705742,DOT corporation,2019-05-15 00:00:00,2019.0,1929234000.0,2019-04-30,20190430,20190430,20190603.0,USD,RV,1.0,,10073.68,20190430.0,NAAW,1929234000.0,0


In [173]:
data_frame.nunique(axis=0)

business_code                 6
cust_number                1394
name_customer              4195
clear_date                  402
buisness_year                 2
doc_id                    48806
posting_date                507
document_create_date        509
document_create_date.1      507
due_in_date                 545
invoice_currency              2
document type                 2
posting_id                    1
area_business                 0
total_open_amount         44262
baseline_create_date        509
cust_payment_terms           75
invoice_id                48802
isOpen                        2
dtype: int64

In [174]:
data_frame['cust_payment_terms'].value_counts()

NAA8    20052
NAH4    13697
CA10     3726
NAC6     1774
NAM4     1369
        ...  
NAV2        1
NAUI        1
B112        1
MC15        1
NATW        1
Name: cust_payment_terms, Length: 75, dtype: int64

In [175]:
# Select the rows whose clear_date is missing and show how many there are
# (row count, column count).
df = data_frame[data_frame.clear_date.isnull()]
df.shape

(10000, 19)

In [176]:
# Pairwise correlation between the numeric columns only.
# The numeric subset is selected explicitly: from pandas 2.0 onwards DataFrame.corr()
# no longer drops non-numeric columns on its own and raises on the string columns
# (business_code, name_customer, ...) present in this frame.
data_frame.select_dtypes(include='number').corr()

Unnamed: 0,buisness_year,doc_id,document_create_date,document_create_date.1,due_in_date,posting_id,area_business,total_open_amount,baseline_create_date,invoice_id,isOpen
buisness_year,1.0,-0.007851,0.97712,0.98362,0.988944,,,0.001232,0.983986,-0.006627,0.75055
doc_id,-0.007851,1.0,-0.004729,-0.006722,-0.010245,,,0.191287,-0.004197,1.0,-0.006992
document_create_date,0.97712,-0.004729,1.0,0.993404,0.973043,,,0.004664,0.99266,-0.00332,0.760251
document_create_date.1,0.98362,-0.006722,0.993404,1.0,0.978904,,,0.003409,0.99925,-0.005299,0.76035
due_in_date,0.988944,-0.010245,0.973043,0.978904,1.0,,,0.000899,0.979473,-0.008889,0.750627
posting_id,,,,,,,,,,,
area_business,,,,,,,,,,,
total_open_amount,0.001232,0.191287,0.004664,0.003409,0.000899,,,1.0,0.004271,0.197607,0.00985
baseline_create_date,0.983986,-0.004197,0.99266,0.99925,0.979473,,,0.004271,1.0,-0.002698,0.760312
invoice_id,-0.006627,1.0,-0.00332,-0.005299,-0.008889,,,0.197607,-0.002698,1.0,-0.006106


## Dropping blank and unwanted columns

In [177]:
# Columns removed here, with the reason for each (figures come from the
# nunique() and corr() outputs above):
#   document_create_date.1 - correlates > 0.99 with document_create_date and
#                            baseline_create_date
#   name_customer          - the customer is already identified by cust_number, and the
#                            name text contains anomalies (4195 distinct names for
#                            1394 customer numbers)
#   invoice_id             - correlation of 1.0 with doc_id, so only doc_id is kept
#   document type          - almost constant (2 distinct values; per the original note
#                            only about 0.008% of rows differ)
#   posting_id             - constant (1 distinct value)
#   area_business          - empty (0 distinct values)
#   isOpen                 - per the original note, open invoices are the rows with a
#                            NULL clear_date
data_frame.drop(
    columns=[
        'document_create_date.1',
        'name_customer',
        'invoice_id',
        'document type',
        'posting_id',
        'area_business',
        'isOpen',
    ],
    inplace=True,
)

In [178]:
# removing duplicate rows: rows sharing a doc_id count as duplicates and only the
# first occurrence is kept (invoice_id has already been dropped, so doc_id is the key)
data_frame.drop_duplicates(subset='doc_id',keep='first',inplace=True)

In [179]:
# checking Nulls in columns for Null Imputation
data_frame.isna().sum()

business_code              0
cust_number                0
clear_date              9690
buisness_year              0
doc_id                     0
posting_date               0
document_create_date       0
due_in_date                0
invoice_currency           0
total_open_amount          0
baseline_create_date       0
cust_payment_terms         0
dtype: int64

In [180]:
# Split the data on whether clear_date is present:
#   final_test - rows with a NULL clear_date
#   main_train - rows that have a clear_date
# .copy() makes each subset an independent frame, so the column assignments made on
# main_train in the following cells are ordinary writes instead of writes to a
# filtered selection of data_frame (which pandas reports as SettingWithCopyWarning).
final_test = data_frame[data_frame['clear_date'].isna()].copy()
main_train = data_frame[data_frame['clear_date'].notna()].copy()

In [193]:
# Count how often each doc_id occurs in main_train; value_counts() lists the most
# frequent value first, so a top count of 1 means every doc_id is unique.
main_train['doc_id'].value_counts()

1.929488e+09    1
1.929562e+09    1
1.929021e+09    1
1.930516e+09    1
1.930384e+09    1
               ..
1.929173e+09    1
1.929223e+09    1
1.930352e+09    1
1.928565e+09    1
1.930303e+09    1
Name: doc_id, Length: 39116, dtype: int64

In [182]:
# Reduce business_code to three levels: 'U001' and 'CA02' are kept as they are,
# every other code is relabelled 'OTHR'.
main_train['business_code'] = main_train['business_code'].where(
    main_train['business_code'].isin(['U001', 'CA02']),
    'OTHR',
)


In [183]:
# clear_date and posting_date are stored as date strings in the CSV
# (e.g. '2019-03-28 00:00:00' and '2019-03-12'), so they are parsed without an
# explicit format.
main_train['clear_date'] = pd.to_datetime(main_train['clear_date'])
main_train['posting_date'] = pd.to_datetime(main_train['posting_date'])

In [184]:
def convert_to_date(df, attribute):
    """Parse a YYYYMMDD-encoded column of ``df`` into datetimes, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    attribute : str
        Name of the column to convert. Values may be integers or strings such
        as 20190312, or floats such as 20190312.0.

    Returns
    -------
    None
        The parsed values are written back to ``df[attribute]``.

    Raises
    ------
    ValueError
        If a non-missing value does not match the ``%Y%m%d`` format.
    """
    values = df[attribute]
    if pd.api.types.is_float_dtype(values):
        # A float such as 20190327.0 stringifies with a trailing ".0", which the
        # strict %Y%m%d parse in pandas >= 2.0 rejects. Render the integer part
        # as text instead; missing values stay missing and end up as NaT.
        values = values.map(lambda v: None if pd.isna(v) else str(int(v)))
    df[attribute] = pd.to_datetime(values, format='%Y%m%d')
    

In [185]:
# These three columns hold dates encoded as YYYYMMDD numbers (e.g. 20190310 or
# 20190327.0 in the preview above), so they go through convert_to_date, which
# parses them with an explicit '%Y%m%d' format and overwrites the column in place.
convert_to_date(main_train,'document_create_date')
convert_to_date(main_train,'baseline_create_date')
convert_to_date(main_train,'due_in_date')


In [186]:
print(main_train.dtypes)

business_code                   object
cust_number                     object
clear_date              datetime64[ns]
buisness_year                  float64
doc_id                         float64
posting_date            datetime64[ns]
document_create_date    datetime64[ns]
due_in_date             datetime64[ns]
invoice_currency                object
total_open_amount              float64
baseline_create_date    datetime64[ns]
cust_payment_terms              object
dtype: object


In [187]:
# Keep the five payment terms listed below (the five most frequent in the
# value_counts() output above) and pool every other term under 'OTHR'.
main_train['cust_payment_terms'] = main_train['cust_payment_terms'].where(
    main_train['cust_payment_terms'].isin(['NAA8', 'NAH4', 'CA10', 'NAC6', 'NAM4']),
    'OTHR',
)

In [188]:
# Rows invoiced in USD keep total_open_amount unchanged; every other row is
# multiplied by a fixed factor of 0.81. invoice_currency itself is not updated,
# so re-running this cell applies the factor a second time.
# NOTE(review): 0.81 is presumably a hard-coded exchange rate into USD for the
# second currency in the data - confirm the rate and the date it refers to.
main_train['total_open_amount'] = main_train['total_open_amount'].where(
    main_train['invoice_currency'].isin(['USD']),
    main_train['total_open_amount'] * 0.81,
)

In [189]:
# target: elapsed time from document_create_date to clear_date, stored as a timedelta
main_train['target'] = main_train['clear_date'].sub(main_train['document_create_date'])

In [191]:
# expected: elapsed time from document_create_date to due_in_date, stored as a timedelta
main_train['expected'] = main_train['due_in_date'].sub(main_train['document_create_date'])

In [194]:
main_train.head()

Unnamed: 0,business_code,cust_number,clear_date,buisness_year,doc_id,posting_date,document_create_date,due_in_date,invoice_currency,total_open_amount,baseline_create_date,cust_payment_terms,target,expected
0,U001,200779906,2019-03-28,2019.0,1928941000.0,2019-03-12,2019-03-10,2019-03-27,USD,44468.83,2019-03-12,NAA8,18 days,17 days
1,U001,200769623,2019-04-03,2019.0,1929015000.0,2019-03-24,2019-03-23,2019-04-08,USD,4013.75,2019-03-24,NAH4,11 days,16 days
2,U001,200772670,2019-05-22,2019.0,1929268000.0,2019-05-06,2019-05-06,2019-05-21,USD,77909.09,2019-05-06,OTHR,16 days,15 days
3,U001,100044041,2020-03-02,2020.0,1930548000.0,2020-02-20,2020-02-20,2020-03-10,USD,2459.37,2020-02-16,NAM4,11 days,19 days
4,U001,200705742,2019-05-15,2019.0,1929234000.0,2019-04-30,2019-04-30,2019-06-03,USD,10073.68,2019-04-30,OTHR,15 days,34 days


In [195]:
from sklearn.preprocessing import LabelEncoder