### B2B- INVOICE PAYMENT DATE PREDICTION MODEL - BY MISHA DEY 

In [1]:
import gc
import math
import PIL 
import pandas as pd
import numpy as np
import seaborn 
import datetime
import random
import warnings
import xgboost as xgb
from scipy import stats
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split , GridSearchCV,cross_val_score,cross_val_predict,cross_validate,RandomizedSearchCV
from sklearn.metrics import mean_squared_error,mean_absolute_error,explained_variance_score,max_error,r2_score,median_absolute_error,mean_squared_log_error
from sklearn.feature_selection import VarianceThreshold,SelectKBest,f_regression
from sklearn.preprocessing import MinMaxScaler,normalize,StandardScaler,RobustScaler
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.decomposition import PCA
import featuretools as ft
from sklearn.svm import SVR
from mlxtend.feature_selection import SequentialFeatureSelector,ExhaustiveFeatureSelector

ModuleNotFoundError: No module named 'featuretools'

In [None]:
# Reading the csv file
data = pd.read_csv("dataset.csv")

### EXPLORATORY  DATA   ANALYSIS

In [None]:
data

In [None]:
data.shape

In [None]:
# Data Description
data.describe()

In [None]:
# Dataset columns
data.columns

#### Frequency Table

In [2]:
coun_t  = data.nunique().to_frame(name = 'Counts')
coun_t

#### Bar Plot Representing the Frequency Counts

In [None]:
plt.style.use('dark_background')

x_val = coun_t.index
y_val = coun_t.values.reshape(coun_t.shape[0])
plt.figure(figsize = (35,10))
plt.bar(x_val,y_val,color='maroon')
plt.show()

#### Mode - Most Frequently Occurring Element

In [None]:
data[['business_code','buisness_year','invoice_currency','document type','cust_payment_terms','isOpen']].agg(pd.Series.mode)        

#### Median - Ordinal Variables ( 'invoice_id' ,buisness_year, isOpen )

In [None]:
# Median of the numeric/ordinal columns only. 'invoice_currency' holds currency
# codes (strings such as 'CAD'), for which a median is undefined, so it is left out.
data[['invoice_id','buisness_year','isOpen']].median().to_frame()

#### Mean and Standard Deviation - interval level variables (total_open_amount)

In [None]:
data['total_open_amount'].agg(['mean','std']).to_frame()

### DATA PRE-PROCESSING

#### REMOVING 'NULL' VALUES FROM THE DATAFRAME

In [None]:
null_vals = data.isnull().sum().to_frame(name = 'NULL_COUNTS')
null_vals = null_vals[null_vals['NULL_COUNTS']>0]
null_features = null_vals.index
null_vals

In [None]:
plt.title("NULL VALUES IN DATA")
seaborn.barplot(x=null_features,y=null_vals['NULL_COUNTS'])

In [None]:
# Analysis: "area_business" has all NaN values --> hence we drop that first
data=data.drop(columns=['area_business'])
data.shape

In [None]:
test_dataset = data[data['clear_date'].isnull() == True]
data = data.dropna()

In [None]:
test_dataset.shape

In [None]:
data.isnull().sum()  # All the rows with NULL values are dropped

#### REMOVING DUPLICATE VALUES FROM DATAFRAME 

In [None]:
print("Percentage of data lost due to dropping the NULL values in the dataframe : ",((50000-data.shape[0])/50000)*100 ,"%")  

In [None]:
#Dealing with the duplicates in the dataframe
data.duplicated().sum() # No Duplicate Values

In [None]:
test_dataset.duplicated().sum()

In [None]:
#looking for duplicated columns
data_trans = data.T
data_trans.duplicated()

In [None]:
#looking for duplicated columns
test_dataset_trans = test_dataset.T
test_dataset_trans.duplicated()

In [None]:
(data['doc_id'] != data['invoice_id']).sum() # Hence column 'doc_id' and 'invoice_id' are identical

In [None]:
# We are hence dropping the 'doc_id' column and some other unnecessary columns 
data = data.drop(columns=['doc_id'])  

In [None]:
test_dataset = test_dataset.drop(columns=['doc_id'])

In [None]:
gc.collect()
data.shape

In [3]:
test_dataset.shape

### Removing the column unique to each row : 'invoice_id'

In [None]:
data = data.drop(columns = 'invoice_id')
test_dataset = test_dataset.drop(columns = 'invoice_id')

### Removing the insignificant Columns

In [None]:
data1 = data.copy()
test_dataset1 = test_dataset.copy()

In [None]:
data.columns

In [None]:
data = data.drop(columns = ['name_customer', 'posting_date','document_create_date'])


In [None]:
test_dataset = test_dataset.drop(columns =  ['name_customer', 'posting_date','document_create_date'])

In [None]:
list(data.columns)

### FILTERING OUT THE CONSTANT FEATURES

In [None]:
const_feature = []
uniq_val_count = []
unique_cols = dict()
for col in list(data.columns):
    uniq_val_count.append(data[col].nunique())
    if(data[col].nunique()==1):
        const_feature.append(col)
    
print('\n\n\nConstant Features are   :',const_feature)
print('\n\nALL FEATURES WITH UNIQUE VALUES : \n')
pd.DataFrame({'COLUMN NAMES':list(data.columns) ,'UNIQUE VALUES COUNT':uniq_val_count})


# Removing the constant feature
data=data.drop(columns=const_feature)

# Removing the constant feature
test_dataset=test_dataset.drop(columns=const_feature)

In [None]:
list(data.columns)

### Label Encoding across categorical columns : [ 'business_code' ,'cust_number'  , 'cust_payment_terms']

In [None]:
data['business_code'].unique()

In [None]:
data['cust_number'].unique()

In [None]:
data['cust_payment_terms'].unique()

In [None]:
data['buisness_year'].unique()

In [None]:
### Assign 0 for unknown classes

class LabelEncoderExt(object):
    """Label encoder that tolerates categories unseen during ``fit``.

    An extra ``'Unknown'`` class is registered at fit time; any value passed to
    ``transform`` that was not seen during ``fit`` is encoded as that class
    instead of raising.
    """

    def __init__(self):
        self.label_encoder = LabelEncoder()

    def fit(self, data_list):
        """Fit the encoder on ``data_list`` plus the ``'Unknown'`` placeholder.

        Returns ``self`` so calls can be chained.
        """
        self.label_encoder = self.label_encoder.fit(list(data_list) + ['Unknown'])
        self.classes_ = self.label_encoder.classes_
        return self

    def transform(self, data_list):
        """Encode ``data_list``, mapping every unseen value to ``'Unknown'``.

        The known classes are collected into a set once and the input is
        rewritten in a single pass (the previous version rebuilt the whole list
        once per unseen value). Values that never compare equal to a known
        class, NaN included, are also treated as unknown.
        """
        known_classes = set(self.label_encoder.classes_)
        new_data_list = [
            item if item in known_classes else 'Unknown' for item in data_list
        ]
        return self.label_encoder.transform(new_data_list)
    
    

In [None]:
# Columns that get an integer label encoding.
list_cust_details = ['buisness_year','cust_number','business_code','cust_payment_terms']

# One fitted encoder per column, kept so the same mapping can be reused later.
label_enc_list = {}
for feature in list_cust_details:
    encoder = LabelEncoderExt().fit(data[feature])
    data[feature] = encoder.transform(data[feature])
    label_enc_list[feature] = encoder


In [None]:
test_dataset.columns

In [None]:
# Encode the test set with the encoders that were fitted on the training data.
for feature in list_cust_details:
    fitted_encoder = label_enc_list[feature]
    test_dataset[feature] = fitted_encoder.transform(test_dataset[feature])

In [None]:
for col in list_cust_details:
    print(col,"   :     \n",data[col].unique(),"   \n     \n")

#### DATE-TIME CONVERSION

In [None]:
data

In [None]:
def str_to_date(time_s, st=''):
    """Reformat a compact ``YYYYMMDD`` string as ``YYYY-MM-DD``.

    Parameters
    ----------
    time_s : str
        Date digits; only the first eight characters are used.
    st : str, optional
        Ignored. Kept only so existing two-argument calls keep working.

    Returns
    -------
    str
        ``time_s[:4] + '-' + time_s[4:6] + '-' + time_s[6:8]``.

    Raises
    ------
    ValueError
        If ``time_s`` has fewer than eight characters.
    """
    if len(time_s) < 8:
        raise ValueError(
            "expected a date string of at least 8 characters (YYYYMMDD), got {!r}".format(time_s)
        )
    return '-'.join((time_s[:4], time_s[4:6], time_s[6:8]))

In [None]:
dt_lis = ['baseline_create_date','clear_date','due_in_date','document_create_date.1']

st = ''
# Turn the numeric YYYYMMDD columns of the training frame into 'YYYY-MM-DD' strings.
for date_col in ('baseline_create_date', 'due_in_date', 'document_create_date.1'):
    data[date_col] = data[date_col].map(lambda raw: str_to_date(str(int(raw)), st))

In [None]:
st = ''
# Same YYYYMMDD -> 'YYYY-MM-DD' rewrite for the test frame.
for date_col in ('baseline_create_date', 'due_in_date', 'document_create_date.1'):
    test_dataset[date_col] = test_dataset[date_col].map(lambda raw: str_to_date(str(int(raw)), st))

In [None]:
dt_lis = ['baseline_create_date','clear_date','due_in_date','document_create_date.1']

for col in dt_lis:
    data[col] = pd.to_datetime(data[col],format='%Y-%m-%d')
data.shape

In [None]:
for col in dt_lis:
    test_dataset[col] = pd.to_datetime(test_dataset[col],format='%Y-%m-%d')
test_dataset.shape

### FILTERING OUT THE QUASI-CONSTANT FEATURES

In [None]:
data.columns

In [None]:
# Features whose variance is at or below 0.01 are treated as quasi-constant and removed.
selection = VarianceThreshold(threshold=0.01)

# Date columns and the currency code are excluded from the variance check.
excluded_cols = set(dt_lis) | set(['invoice_currency'])
cols = list(set(data.columns) - excluded_cols)

selection.fit(data[cols])

print("No. of Features that are Quasi-Constant : ",(len(cols) - sum(selection.get_support())))

quasi_ = list(selection.get_support())

# Names of the columns the selector did not keep, in the order of `cols`.
low_variance_cols = [name for name, kept in zip(cols, quasi_) if kept == False]

for name in low_variance_cols:
    print("The Quasi-Constant Feature in train data is  :",name)
    data = data.drop(columns=[name])

for name in low_variance_cols:
    print("The Quasi-Constant Feature in test data is  :",name)
    test_dataset = test_dataset.drop(columns=[name])


### DROPPING THE ROWS WHERE THE INVOICE DOCUMENT WAS CREATED BEFORE THE BASELINE CREATE DATE

In [None]:
temp = pd.Series(data['document_create_date.1'] - data['baseline_create_date']).dt.days

In [None]:
rows_to_drop = temp.value_counts().to_frame(name = 'count')

In [None]:
# No. of rows to drop satisfying the condition
rows_to_drop[(rows_to_drop.index <0)].sum()

In [None]:
data = data[((temp==0) | (temp>0))].reset_index(drop=True)

In [None]:
data.shape

In [None]:
data = data.drop(columns = ['document_create_date.1'])

In [None]:
test_dataset = test_dataset.drop(columns = ['document_create_date.1'])

### FEATURE ENGINEERING / FEATURE GENERATION

In [None]:
dates_list = ['due_in_date','baseline_create_date'] #payment date is excluded  

In [4]:
data.columns

In [None]:
data['payment_term']=pd.Series(data['clear_date'] - data['baseline_create_date']).dt.days
data['due_term']=pd.Series(data['due_in_date'] - data['baseline_create_date']).dt.days
data['delay'] = data['payment_term'] - data['due_term']
test_dataset['due_term']=pd.Series(test_dataset['due_in_date'] - test_dataset['baseline_create_date']).dt.days

### BUCKET COMPUTATION - > BASED ON DUE TERM, PAYMENT TERM , AND DELAY

In [None]:
def bucketization(x):
    """Return the day-range bucket label for a number of days ``x``.

    Boundaries are checked with comparisons rather than ``range`` membership,
    so non-integer values land in the bucket that contains them, and exactly
    60 days is reported as ``'(45-60) Days'`` as the label states (the
    ``range``-based version sent both to ``'(> 60) Days'``).

    A value of 45 stays in ``'(45-60) Days'``, as before. NaN fails every
    comparison and therefore still falls through to ``'(> 60) Days'``.
    """
    if x < 0:
        return '(< 0) Days'
    if x < 16:
        return '(0-15) Days'
    if x < 31:
        return '(16-30) Days'
    if x < 45:
        return '(31-45) Days'
    if x <= 60:
        return '(45-60) Days'
    return '(> 60) Days'

In [None]:
gc.collect()

data = data.reset_index(drop=True)

# Human-readable bucket label for the delay and for the due term of every invoice.
data['DELAY BUCKET(DAYS)'] = data['delay'].map(bucketization)
data['DUE TERM BUCKET(DAYS)'] = data['due_term'].map(bucketization)


# Integer id assigned to each bucket label (note that id 3 is not used).
bucket_mapper = {'(< 0) Days':0,'(0-15) Days':1,'(16-30) Days':2,'(31-45) Days':4,'(45-60) Days':5,'(> 60) Days':6}

for label_col, id_col in (('DELAY BUCKET(DAYS)', 'delay_bucket_id'),
                          ('DUE TERM BUCKET(DAYS)', 'due_term_bucket_id')):
    data[id_col] = data[label_col].map(bucket_mapper)

data.columns

In [None]:
test_dataset = test_dataset.reset_index(drop=True)
test_dataset['DUE TERM BUCKET(DAYS)']= pd.Series([bucketization(x=test_dataset['due_term'][i])  for i in range(len(test_dataset))])
test_dataset['due_term_bucket_id'] = test_dataset['DUE TERM BUCKET(DAYS)'].map(bucket_mapper)

In [None]:
test_dataset.columns

### Grouping on the basis of features: 'cust_number'and generating new features

In [None]:
test_dataset=test_dataset.drop(columns=['clear_date'])

In [None]:
# Split each date column into day / year / month features on both frames.
for col1 in ['baseline_create_date','due_in_date']:
    for frame in (data, test_dataset):
        for part in ('day', 'year', 'month'):
            frame['{}.{}'.format(col1, part)] = getattr(frame[col1].dt, part)
    # NOTE(review): day_of_week is only added to the test frame, not to the
    # training frame - confirm whether that asymmetry is intended.
    test_dataset['{}.day_of_week'.format(col1)] = test_dataset[col1].dt.dayofweek

In [None]:
# Per-customer totals and averages of the invoice amount and the due term.
# Only the two columns that are needed are aggregated: summing / averaging the
# whole frame also touches the datetime and string columns, which is wasted
# work and raises TypeError on recent pandas versions.
per_customer = data.groupby('cust_number')[['total_open_amount', 'due_term']]
cust_sums = per_customer.sum().rename(
    columns={'total_open_amount': 'Sum_base_amount', 'due_term': 'Sum_due_term'})
cust_means = per_customer.mean().rename(
    columns={'total_open_amount': 'mean_base_amount', 'due_term': 'mean_due_term'})

# Attach the customer-level statistics to every invoice row of that customer.
data = pd.merge(data, cust_sums, on='cust_number')
data = pd.merge(data, cust_means, on='cust_number')

# Ratio features relating each invoice to its customer's typical behaviour.
data['amount/mean_amount'] = data['total_open_amount']/data['mean_base_amount']
data['amount-/mean_amount'] = (data['total_open_amount']-data['mean_base_amount'])/data['mean_base_amount']
data['due_term/amount'] = data['due_term']/data['total_open_amount']
data['mean_due_term/amount'] = data['mean_due_term']/data['total_open_amount']
data['mean_due_term/Sum_base_amount'] = data['mean_due_term']/data['Sum_base_amount']

In [None]:
# Per-customer totals and averages for the test frame, computed from the test
# rows themselves. Only the two needed columns are aggregated: aggregating the
# whole frame also touches datetime/string columns and raises TypeError on
# recent pandas versions.
per_customer = test_dataset.groupby('cust_number')[['total_open_amount', 'due_term']]
cust_sums = per_customer.sum().rename(
    columns={'total_open_amount': 'Sum_base_amount', 'due_term': 'Sum_due_term'})
cust_means = per_customer.mean().rename(
    columns={'total_open_amount': 'mean_base_amount', 'due_term': 'mean_due_term'})

# Attach the customer-level statistics to every invoice row of that customer.
test_dataset = pd.merge(test_dataset, cust_sums, on='cust_number')
test_dataset = pd.merge(test_dataset, cust_means, on='cust_number')

# Same ratio features as on the training frame.
test_dataset['amount/mean_amount'] = test_dataset['total_open_amount']/test_dataset['mean_base_amount']
test_dataset['amount-/mean_amount'] = (test_dataset['total_open_amount']- test_dataset['mean_base_amount'])/test_dataset['mean_base_amount']
test_dataset['due_term/amount'] = test_dataset['due_term']/test_dataset['total_open_amount']
test_dataset['mean_due_term/amount'] = test_dataset['mean_due_term']/test_dataset['total_open_amount']
test_dataset['mean_due_term/Sum_base_amount'] = test_dataset['mean_due_term']/test_dataset['Sum_base_amount']

In [None]:
# Number of training invoices per customer, keyed by cust_number.
cust_counts = data['cust_number'].value_counts().to_dict()

data['cust_count'] = data['cust_number'].map(cust_counts)
data['cust_count/mean_amount'] = data['cust_count']/data['mean_base_amount']

# Test rows reuse the training counts; customers absent from training get NaN.
test_dataset['cust_count'] = test_dataset['cust_number'].map(cust_counts)
# Fixed: the ratio must use cust_count (as on the training frame), not the
# encoded cust_number id.
test_dataset['cust_count/mean_amount'] = test_dataset['cust_count']/test_dataset['mean_base_amount']

### Sorting the dataframe according to the'baseline_create_date'

In [None]:
data = data.sort_values(['baseline_create_date']).reset_index(drop=True)
data['cust_count*due_term/amount'] =(data['cust_count']*data['due_term'])/data['total_open_amount']

In [None]:
test_dataset = test_dataset.sort_values(['baseline_create_date']).reset_index(drop=True)
test_dataset['cust_count*due_term/amount'] =(test_dataset['cust_count']*test_dataset['due_term'])/test_dataset['total_open_amount']

In [None]:
data.isnull().sum()

### MANIPULATING THE 'AMOUNT' COLUMN - CONVERTING TO USD
##### CAD -> CANADIAN DOLLAR 
##### USD -> US DOLLAR 

##### Conversion:

##### 1 CAD = 0.79 USD

In [None]:
# 1 CAD = 0.79 USD. The rate is applied exactly once to the CAD invoices
# (the previous version multiplied by 0.79 twice), using a boolean mask with
# .loc instead of element-wise chained assignment.
CAD_TO_USD = 0.79

cad_rows = data['invoice_currency'] == 'CAD'
data.loc[cad_rows, 'total_open_amount'] = data.loc[cad_rows, 'total_open_amount'] * CAD_TO_USD
data = data.drop(columns=['invoice_currency'])

cad_rows_test = test_dataset['invoice_currency'] == 'CAD'
test_dataset.loc[cad_rows_test, 'total_open_amount'] = test_dataset.loc[cad_rows_test, 'total_open_amount'] * CAD_TO_USD
test_dataset = test_dataset.drop(columns=['invoice_currency'])

In [None]:
test_dataset = test_dataset.sort_values(['baseline_create_date']).reset_index(drop=True)
test_dataset

In [None]:
cols_drop1 = set(data.columns) - set(test_dataset.columns)
cols_drop1

In [None]:
data.corr()['payment_term']

In [None]:
# Dropping the date columns

#data = data.drop(columns = ['clear_date', 'due_in_date','baseline_create_date'])
#test_dataset = test_dataset.drop(columns = ['due_in_date','baseline_create_date'])

#### LOG TRANSFORMATION OF 'total_open_amount'

In [None]:
data['total_open_amount'] = np.log(data['total_open_amount'])

In [None]:
test_dataset['total_open_amount'] = np.log(test_dataset['total_open_amount'])

In [None]:
# Looking For the values of categorical features which are diffrent from train data
list_cust_details = ['cust_number','business_code','cust_payment_terms']

for col in list_cust_details:
    # Values that occur in the test set but never in the training set. Built
    # once per column instead of once per test row.
    unseen_values = set(test_dataset[col]) - set(data[col])
    # Number of test rows that carry one of those unseen values.
    coun = sum(1 for value in test_dataset[col] if value in unseen_values)
    print("\n\n\nColumn Name:    ",col,"\nTotal Unique Values in train-set:     ",data[col].nunique(),"\nNew Valuess introduced in Test:     ",len(set(test_dataset[col].unique()) - set(data[col].unique())),"\nValues are:",unseen_values,"\nCount :",coun)

In [None]:
data.columns

In [None]:
gc.collect()

### CORRELATION MATRIX

In [None]:
d_corr=data.corr()
d_corr

### COVARIANCE MATRIX

In [None]:
data.cov()

### PIE PLOT

In [None]:
for col in ['business_code','cust_payment_terms', 'delay_bucket_id','due_term_bucket_id']:
    plt.style.use('classic')
    # value_counts() orders categories by frequency, so the labels must come
    # from its index; unique() returns them in order of appearance and would
    # attach the wrong label to a wedge.
    counts = data[col].value_counts()
    fig,ax = plt.subplots(figsize=(10,10))
    plt.pie(counts,labels=counts.index,shadow=True,autopct='%1.1f%%')
    plt.title('\nCount of {}'.format(col))
    plt.show()

 #### BAR PLOT

In [None]:
features = ['business_code','cust_payment_terms', 'delay_bucket_id','due_term_bucket_id']

for col in features:
    plt.style.use('dark_background')
    fig,ax = plt.subplots(figsize=(10,8))
    plt.bar(list(data[col].value_counts().index),list(data[col].value_counts()),color = random.sample(['maroon','yellow'],1))            
    plt.title('\nCount of {}'.format(col))
    plt.figure(figsize=(20,20))
    plt.show()

#### BOX PLOT 

In [None]:
features = ['business_code','cust_payment_terms', 'delay_bucket_id','due_term_bucket_id']

for col in features:
    plt.style.use('dark_background')
    seaborn.boxplot(data[col],color='maroon',notch=True)
    plt.title('Box Plot of {}'.format(col))
    plt.figure(figsize=(20,20))
    plt.show()

#### Scatter Plot

In [None]:
data.columns

#### Scatter Plot

In [None]:
x_val = ['cust_number','business_code', 'total_open_amount', 'cust_payment_terms']
y_val = 'payment_term'
for col in x_val:
    # Plot the column named by y_val so the data matches the title and the
    # axis label (the previous version always plotted 'delay').
    # NOTE(review): if 'delay' was the intended target, change y_val instead.
    plt.scatter(data[col],data[y_val],color=random.sample(['yellow','maroon'],1),linewidth = .5)
    plt.title('{} V/S {}'.format(col,y_val))
    plt.xlabel(col)
    plt.ylabel(y_val)
    plt.show()

In [None]:
data.nunique()

#### DISTRIBUTION PLOTS

In [None]:
for col in list(set(data.columns) - set(['DELAY BUCKET(DAYS)' ,'DUE TERM BUCKET(DAYS)'])):
    seaborn.distplot(data[col],color='maroon')
    plt.show()

In [None]:
data_copy = data.copy()
test_dataset_copy = test_dataset.copy()

 ### OUTLIER DETECTION AND REMOVAL

clear_date-due_in_date(Payment term) > 100


15 > total_open_amount > 1 


-45 < delay <50


custumer_num  >50

In [None]:
data.describe()

In [None]:
pay_max = 80
data[data['payment_term']>=pay_max]['payment_term'].value_counts().sum()

In [None]:
data = data[(data['total_open_amount'] > 1) & (data['payment_term']<=pay_max )].reset_index(drop=True)

In [None]:
data.shape

In [5]:
test_dataset.shape

#### Distribution after removing outliers

In [None]:
for col in list(set(data.columns) - set(['DELAY BUCKET(DAYS)' ,'DUE TERM BUCKET(DAYS)'])):
    seaborn.distplot(data[col],color='maroon')
    plt.show()

###  Different Features

In [None]:
print(list(data.columns))
print("\n\n",len(list(data.columns)))

In [None]:
categorical_features = ['business_code', 'cust_payment_terms','due_term_bucket_id']

In [None]:
numerical_features = list(set(set(data.columns) - set(['DELAY BUCKET(DAYS)', 'DUE TERM BUCKET(DAYS)', 'delay_bucket_id', 'due_term_bucket_id','clear_date', 'due_in_date', 'baseline_create_date'])) - set(categorical_features))

In [None]:
len(numerical_features)

In [None]:
len(categorical_features)                                               

###  Scaling the Numerical features

In [None]:
infos_df = data[['clear_date', 'due_in_date', 'baseline_create_date','total_open_amount']]

In [None]:
train_num = data[list(set(numerical_features)   -  set(['payment_term','delay']))]
test_num = test_dataset[list(set(numerical_features)   -  set(['payment_term','delay']))]
train_cat = data[categorical_features]
test_cat = test_dataset[list(set(categorical_features))]

In [None]:
scaler = MinMaxScaler()
y_scaler = MinMaxScaler()

# One fixed column order shared by the train and test frames.
feature_cols = list(train_num.columns)

# Fit the scaler on the training data only, then apply that same scaling to
# the test data. Refitting on the test set would scale it by its own min/max
# and make the two frames incomparable.
final_train_n = pd.DataFrame(scaler.fit_transform(train_num[feature_cols]),columns=feature_cols)
final_test_n = pd.DataFrame(scaler.transform(test_num[feature_cols]),columns=feature_cols)

# The target is scaled with its own scaler so predictions can be inverted later.
data['delay'] = y_scaler.fit_transform(np.array(data['delay']).reshape(-1,1)).ravel()

In [None]:
num_cols = final_train_n.columns

In [None]:
final_train_n=pd.merge(final_train_n,data['cust_number'],left_index=True,right_index=True)
final_test_n=pd.merge(final_test_n,test_dataset['cust_number'],left_index=True,right_index=True)

In [None]:
# Attach the test set's own dates and amount. The previous version merged
# infos_df, which holds the TRAINING rows, onto the test rows by position.
final_test_n=pd.merge(final_test_n,test_dataset[['due_in_date','baseline_create_date','total_open_amount']],left_index=True,right_index=True)

In [6]:
final_train_n=pd.merge(final_train_n,infos_df,left_index=True,right_index=True)

In [None]:
final_test_n.shape

In [None]:
final_train_n.shape

#### One-Hot Encoding the Categorical Data

In [None]:
data_cat=pd.concat((train_cat,test_cat),sort=False).reset_index(drop=True)
data_cat

In [None]:
# One-hot encode the three categorical columns in one call. Each column keeps
# its own prefix, and the dummy columns come out grouped per source column in
# the order listed here.
data_cat1 = pd.get_dummies(
    data_cat,
    columns=['business_code', 'cust_payment_terms', 'due_term_bucket_id'],
    prefix=['enc_c_', 'enc_c___', 'enc_c'],
    drop_first=False,
)

In [None]:
data[['business_code','cust_payment_terms','due_term_bucket_id']].nunique().sum()

In [None]:
data_cat1.columns

In [None]:
final_train_c = data_cat1[:data.shape[0]].reset_index(drop=True)
final_test_c = data_cat1[data.shape[0]:].reset_index(drop=True)

In [None]:
#final_train_c = train_cat
#final_test_c = test_cat

In [None]:
final_train_c.shape

In [None]:
final_test_c.shape

### Merging the Numerical, Categorical and Numerical-Categorical Data

In [None]:
test_data = final_test_n.merge(final_test_c.reset_index(drop=True),left_index=True,right_index=True)
#test_data = test_data.merge(final_test_c.reset_index(drop=True),left_index=True,right_index=True)
test_data

In [None]:
train_data = final_train_n.merge(final_train_c.reset_index(drop=True),left_index=True,right_index=True)
train_data = train_data.merge(data['delay'].reset_index(drop=True),left_index=True,right_index=True)
train_data

In [7]:
set(train_data.columns) - set(test_data.columns)

### GENERATING THE TEST-TRAIN-EVALUATION DATA

In [None]:
train_data = train_data.copy()
data_copy = data.copy()

In [None]:
# Every column except the target. drop() keeps a deterministic column order;
# indexing with a set is rejected by recent pandas versions.
x_data = train_data.drop(columns=['delay'])
x_data.shape

In [None]:
# Every column except the target (ignored if the test frame has no 'delay').
# drop() keeps a deterministic column order; indexing with a set is rejected
# by recent pandas versions.
x_test = test_data.drop(columns=['delay'], errors='ignore')
x_test.shape

In [None]:
y_data = train_data['delay']
y_data.shape

In [None]:
eval_range = (math.ceil(data.shape[0]*(0.25)))+1
x_train,x_eval,y_train,y_eval = x_data[eval_range:].reset_index(drop=True),x_data[0:eval_range].reset_index(drop=True),y_data[eval_range:].reset_index(drop=True),y_data[0:eval_range].reset_index(drop=True)       

In [None]:
eval_range = (math.ceil(x_eval.shape[0]*(0.4)))+1
x_eval1,x_eval2,y_eval1,y_eval2 = x_eval[eval_range:].reset_index(drop=True),x_eval[0:eval_range].reset_index(drop=True),y_eval[eval_range:].reset_index(drop=True),y_eval[0:eval_range].reset_index(drop=True)

In [None]:
x_eval1.columns

In [None]:
x_train_info = x_train[['clear_date','total_open_amount_y','cust_number_y','baseline_create_date','due_in_date']]
x_eval1_info = x_eval1[['clear_date','total_open_amount_y','cust_number_y','baseline_create_date','due_in_date']]
x_eval2_info = x_eval2[['clear_date','total_open_amount_y','cust_number_y','baseline_create_date','due_in_date']]
x_test_info = x_test[['total_open_amount_y','cust_number_y','baseline_create_date','due_in_date']]

x_train = x_train.drop(columns=['clear_date','baseline_create_date','due_in_date'])
x_eval1 = x_eval1.drop(columns=['clear_date','baseline_create_date','due_in_date'])
x_eval2 = x_eval2.drop(columns=['clear_date','baseline_create_date','due_in_date'])
x_test = x_test.drop(columns=['baseline_create_date','due_in_date'])

In [None]:
x_train.shape,y_train.shape

In [None]:
x_eval1.shape,y_eval1.shape

In [None]:
x_eval2.shape,y_eval2.shape

In [None]:
x_test.shape

In [None]:
#y_scaler.inverse_transform(np.array(y_eval).reshape(y_eval.shape[0],1))

In [None]:
#To write the final created dataset to the working directory
x_train.to_csv('x_train.csv')
x_test.to_csv('x_test.csv')
y_train.to_csv('y_train.csv')

x_eval1.to_csv('x_eval1.csv')
x_eval2.to_csv('x_eval2.csv')
y_eval1.to_csv('y_eval1.csv')
y_eval2.to_csv('y_eval2.csv')

### FEATURE SELECTION

In [None]:
gc.collect()

In [None]:
def select_features(x_train,y_train,x_test,x_eval1,x_eval2):
    """Keep the 10 best features according to ``f_regression``.

    The selector is fitted on ``x_train`` / ``y_train`` only and then applied
    to all four frames.

    Returns
    -------
    tuple
        ``(x_train_fs, x_test_fs, x_eval1_fs, x_eval2_fs, fs, x_features,
        col_indices)``: the four reduced frames, the fitted selector, the
        selected column names and their positions in ``x_train``.
    """
    fs = SelectKBest(score_func=f_regression, k=10)
    fs.fit(x_train, y_train)

    col_indices = fs.get_support(indices=True)
    x_features = x_train.columns[col_indices]

    def _reduce(frame):
        # Apply the fitted selector and restore the selected column names.
        return pd.DataFrame(fs.transform(frame), columns=x_features)

    x_train_fs = _reduce(x_train)
    x_test_fs = _reduce(x_test)
    x_eval1_fs = _reduce(x_eval1)
    x_eval2_fs = _reduce(x_eval2)
    return x_train_fs,x_test_fs,x_eval1_fs,x_eval2_fs,fs,x_features,col_indices

In [None]:
#x_train_fs,x_test_fs,x_eval1_fs,x_eval2_fs,fs,x_features,col_indices = select_features(x_train,y_train,x_test,x_eval1,x_eval2)

In [None]:
#for i in range(len(fs.scores_[col_indices])):
#    print("\n",x_features[i],"    ",fs.scores_[col_indices][i])

In [None]:
#selected_features = list(set(train_data.corr()['delay'][train_data.corr()['delay']>0.5].index) - set(['delay']) )
#selected_features

## MODEL TRAINING , PREDICTION AND EVALUATION

In [None]:
x_train.columns

In [None]:
def evaluate_metrics(x,y_true,y_pred,mod):
    """Compute the regression metrics for one set of predictions.

    ``x`` and ``mod`` are accepted but not used.

    Returns
    -------
    tuple
        ``(MAE, MSE, RMSE, R2, median absolute error, explained variance)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return (
        mean_absolute_error(y_true, y_pred),
        mse,
        mse**0.5,
        r2_score(y_true, y_pred),
        median_absolute_error(y_true, y_pred),
        explained_variance_score(y_true, y_pred),
    )

In [None]:
data.corr()['payment_term']

In [None]:
regression_algo = ['1. LINEAR REGRESSION','2. RIDGE REGRESSION','3. LASSO REGRESSION','4. RANDOM FOREST REGRESSION','5. SUPPORT VECTOR REGRESSION(RBF)','6. DECISION TREE REGRESSION','7. XBF REGRESSION'] 

metric_names = ["MEAN ABSOLUTE ERROR","SCORE","MEAN SQUARED ERROR","ROOT MEAN SQUARED ERROR","R2 SCORE","MEDIAN ABSOLUTE ERROR","EXPLAINED VARIANCE SCORE"]

reg_metrics = []

### 1. LINEAR REGRESSION

In [None]:
gc.collect()

In [None]:
model1 = LinearRegression()
parameters = {'fit_intercept':[True,False],'n_jobs':[1,2,3],'normalize':[True,False]}
grid = GridSearchCV(estimator = model1,param_grid= parameters,scoring = 'neg_mean_squared_error',verbose = 1)     
grid_results = grid.fit(x_train,y_train)
print('BEST SCORE:  ',grid_results.best_score_)
print('BEST PARAMS:  ',grid_results.best_params_)

In [None]:
model1 = LinearRegression(fit_intercept = grid_results.best_params_['fit_intercept'],n_jobs = grid_results.best_params_['n_jobs'])
model1.fit(x_train,y_train)
y_pred1 = model1.predict(x_eval1)
y_pred1 = y_pred1.reshape(y_pred1.shape[0])
y_eval1 = np.array(y_eval1)
y_predicted = y_scaler.inverse_transform(y_pred1.reshape(y_pred1.shape[0],1)).reshape(y_pred1.shape[0])
y_true = y_scaler.inverse_transform(y_eval1.reshape(y_eval1.shape[0],1)).reshape(y_eval1.shape[0])
mean_abs_error,mean_sq_error,root_mean_sq_error,r2_scr,median_abs_score,explained_variance = evaluate_metrics(x_eval1,y_eval1,y_pred1,model1)
scr = model1.score(x_eval1,y_eval1)
reg_metrics.append([mean_abs_error,scr,mean_sq_error,root_mean_sq_error,r2_scr,median_abs_score,explained_variance])

print("METRICS  :\n\nMean Absolute Error :  ",mean_abs_error,"\nScore :   ",scr,"\nMean Squared Error :  ",mean_sq_error,"\nRoot Mean Squared Error :  ",root_mean_sq_error,"\nR2 Square :  ",r2_scr,"\nMedian Absolute Score :  ",median_abs_score,"\nExplained Variance Score :  ",explained_variance)    

In [None]:
reg_metrics

In [None]:
def plot_regression_line(x_test,y_test,y_pred):
    """Plot the predictions ``y_pred`` against every column of ``x_test``.

    One figure is shown per feature column. ``y_test`` is currently unused
    (the scatter of true values is disabled).
    """
    for col in x_test.columns:
        #plt.scatter(x_test[col],y_test,color='red')
        # Fixed: plot the predictions passed in, not the global y_pred1.
        plt.plot(x_test[col],y_pred,color='yellow',linewidth = 0.5)
        # NOTE(review): this x-window is extremely narrow around 0 - confirm it
        # is intended.
        plt.xlim(-0.00000000000001,0.000000000000001)
        plt.xlabel(col)
        plt.ylabel('delay')
        plt.title('{} V/S {} Regression Line\n'.format(col,'delay'))
        plt.show()

In [None]:
#plot_regression_line(x_eval1,y_eval1,y_pred1)

### 2. REGULARIZATION AND EVALUATION : RIDGE REGRESSION

In [None]:
ridge = Ridge()
parameters = {'alpha':[x for x in [0.0005,0.0001,0.00021,0.0006,0.1,0.001,0.005,0.008,0.1,0.5,1,0.1,0.09,0.08,0.06,0.05,0.03,0.01,0.02,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]]}  

ridge_reg = GridSearchCV(ridge,param_grid = parameters,verbose=1)
ridge_reg = ridge_reg.fit(x_train,y_train)

alpha = ridge_reg.best_params_['alpha']

print("\n\nBest Alpha:  ",ridge_reg.best_params_,"\nScore:  ",ridge_reg.best_score_)

ridge_mod = Ridge(alpha=alpha)
ridge_mod=ridge_mod.fit(x_train,y_train)
y_pred1 = ridge_mod.predict(x_eval1)
y_pred2 = ridge_mod.predict(x_eval2)


mean_abs_error,mean_sq_error,root_mean_sq_error,r2_scr,median_abs_score,explained_variance = evaluate_metrics(x_eval1,y_eval1,y_pred1,ridge_mod)
scr = ridge_mod.score(x_eval1,y_eval1)
print("\n\n\nMETRICS  :\n\nMean Absolute Error :  ",mean_abs_error,"\nScore :   ",scr,"\nMean Squared Error :  ",mean_sq_error,"\nRoot Mean Squared Error :  ",root_mean_sq_error,"\nR2 Square :  ",r2_scr,"\nMedian Absolute Score :  ",median_abs_score,"\nExplained Variance Score :  ",explained_variance)



In [None]:
reg_metrics.append([mean_abs_error,scr,mean_sq_error,root_mean_sq_error,r2_scr,median_abs_score,explained_variance])
reg_metrics

In [None]:
# Hyperparameter tuning using the evaluation set

ridge_reg = GridSearchCV(ridge,param_grid = parameters,verbose=1)
ridge_reg = ridge_reg.fit(x_eval1,y_eval1)
alpha = ridge_reg.best_params_['alpha']

print("\n\nBest Alpha:  ",ridge_reg.best_params_,"\nScore:  ",ridge_reg.best_score_)


In [None]:
# For 2nd Evaluation set

mean_abs_error,mean_sq_error,root_mean_sq_error,r2_scr,median_abs_score,explained_variance = evaluate_metrics(x_eval2,y_eval2,y_pred2,ridge_mod)
scr = ridge_mod.score(x_eval2,y_eval2)
print("\n\n\nMETRICS  :\n\nMean Absolute Error :  ",mean_abs_error,"\nScore :   ",scr,"\nMean Squared Error :  ",mean_sq_error,"\nRoot Mean Squared Error :  ",root_mean_sq_error,"\nR2 Square :  ",r2_scr,"\nMedian Absolute Score :  ",median_abs_score,"\nExplained Variance Score :  ",explained_variance)    

### 3. REGULARIZATION AND EVALUATION : LASSO REGRESSION

In [None]:
# Hyper Parameter tuning using the train data

parameters = {'alpha':[0.0005,0.0001,0.00021,0.0006,0.1,0.001,0.005,0.008,0.5,1]}
lasso_search = GridSearchCV(Lasso(),param_grid=parameters,verbose=1)
lasso_search.fit(x_train,y_train)

print("\n\n\nBest Alpha : ",lasso_search.best_params_,"\nBest Score : ",lasso_search.best_score_)

# Refit with the alpha the search selected (as the Ridge cell does) instead of
# a hard-coded value that can silently drift from the search result.
lasso_reg = Lasso(alpha=lasso_search.best_params_['alpha'])
lasso_reg = lasso_reg.fit(x_train,y_train)
y_pred1 = np.array(lasso_reg.predict(x_eval1))

# Metrics on the 1st evaluation set; both arrays are passed as column vectors.
y_eval1_col = np.asarray(y_eval1).reshape(-1,1)
mean_abs_error,mean_sq_error,root_mean_sq_error,r2_scr,median_abs_score,explained_variance = evaluate_metrics(x_eval1,y_eval1_col,y_pred1.reshape(y_pred1.shape[0],1),lasso_reg)
scr = lasso_reg.score(x_eval1,y_eval1)
reg_metrics.append([mean_abs_error,scr,mean_sq_error,root_mean_sq_error,r2_scr,median_abs_score,explained_variance])

print("METRICS  :\n\nMean Absolute Error :  ",mean_abs_error,"\nScore :   ",scr,"\nMean Squared Error :  ",mean_sq_error,"\nRoot Mean Squared Error :  ",root_mean_sq_error,"\nR2 Square :  ",r2_scr,"\nMedian Absolute Score :  ",median_abs_score,"\nExplained Variance Score :  ",explained_variance)    

In [None]:
# Hyper Parameter tuning using the 1st evaluation Data

parameters = {'alpha':[x for x in [0.0005,0.0002,0.0003,0.0004,0.009,0.0001,0.00021,0.0006,0.1,0.001,0.005,0.008,0.1,0.5,1]]}  
lasso_reg = GridSearchCV(lasso_reg,param_grid=parameters)
lasso_reg.fit(x_eval1,y_eval1)

print("\n\n\nBest Alpha : ",lasso_reg.best_params_,"\nBest Score : ",lasso_reg.best_score_)

lasso_reg = Lasso(alpha=0.0001)
lasso_reg = lasso_reg.fit(x_train,y_train)    

In [None]:
## We still get the best value of alpha as 0.0001, so there is not much difference


# Evaluating on the 2nd Evaluation Data

# Predictions of the fitted Lasso model on the 2nd evaluation set.
y_pred2 = np.array(lasso_reg.predict(x_eval2))

# Both arrays are passed as column vectors; the model argument is the Lasso
# model being evaluated (it was ridge_mod before).
y_eval2_col = np.array(y_eval2).reshape(-1,1)
mean_abs_error,mean_sq_error,root_mean_sq_error,r2_scr,median_abs_score,explained_variance = evaluate_metrics(x_eval2,y_eval2_col,y_pred2.reshape(y_pred2.shape[0],1),lasso_reg)
scr = lasso_reg.score(x_eval2,y_eval2)
#reg_metrics.append([mean_abs_error,scr,mean_sq_error,root_mean_sq_error,r2_scr,median_abs_score,explained_variance])

# Fixed label: these metrics are for the 2nd evaluation set, not the 1st.
print("\n\n2nd Evaluation Data\n\nMETRICS  :\n\nMean Absolute Error :  ",mean_abs_error,"\nScore :   ",scr,"\nMean Squared Error :  ",mean_sq_error,"\nRoot Mean Squared Error :  ",root_mean_sq_error,"\nR2 Square :  ",r2_scr,"\nMedian Absolute Score :  ",median_abs_score,"\nExplained Variance Score :  ",explained_variance)    

 ### 4. RANDOM FOREST REGRESSOR

In [None]:
# Force a full garbage-collection pass before the random-forest cell below.
gc.collect()

In [None]:
%%time

# Performing Grid-Search
# (the search itself is disabled; the hyper-parameters are hard-coded below)
#gsc = GridSearchCV(estimator = RandomForestRegressor(),param_grid={'max_depth':[9,10],'n_estimators':(500,1000)},cv=5,scoring = 'neg_mean_squared_error',verbose=1,n_jobs=-1)
#grid_result = gsc.fit(x_train,y_train)
#best_params = grid_result.best_params_
#print("\n\nBest Params : \n",best_params)

# random_state is an integer seed and verbose an integer level: spell them as
# 0 and 1 instead of relying on False == 0 and True == 1.
rfr = RandomForestRegressor(max_depth=10, n_estimators=500, random_state=0, verbose=1)

rfr.fit(x_train, y_train)

# Predict and score on the first evaluation split.
y_pred1 = rfr.predict(x_eval1)

scr = rfr.score(x_eval1, y_eval1)

mean_abs_error,mean_sq_error,root_mean_sq_error,r2_scr,median_abs_score,explained_variance = evaluate_metrics(x_eval1,y_eval1,y_pred1,rfr)
# One row per algorithm is accumulated in reg_metrics.
reg_metrics.append([mean_abs_error,scr,mean_sq_error,root_mean_sq_error,r2_scr,median_abs_score,explained_variance])

print("METRICS  :\n\nMean Absolute Error :  ",mean_abs_error,"\nScore :   ",scr,"\nMean Squared Error :  ",mean_sq_error,"\nRoot Mean Squared Error :  ",root_mean_sq_error,"\nR2 Square :  ",r2_scr,"\nMedian Absolute Score :  ",median_abs_score,"\nExplained Variance Score :  ",explained_variance)

In [None]:
# Random forest on the 2nd evaluation split: predict, score, then print the
# metrics (nothing is appended to reg_metrics here).
y_pred2 = rfr.predict(x_eval2)
scr = rfr.score(x_eval2, y_eval2)

# evaluate_metrics receives the targets as a single-column 2-D array.
y_eval2_arr = np.array(y_eval2)
(
    mean_abs_error,
    mean_sq_error,
    root_mean_sq_error,
    r2_scr,
    median_abs_score,
    explained_variance,
) = evaluate_metrics(
    x_eval2,
    y_eval2_arr.reshape(y_eval2_arr.shape[0], 1),
    y_pred2,
    rfr,
)
print(
    "METRICS  :\n\nMean Absolute Error :  ", mean_abs_error,
    "\nScore :   ", scr,
    "\nMean Squared Error :  ", mean_sq_error,
    "\nRoot Mean Squared Error :  ", root_mean_sq_error,
    "\nR2 Square :  ", r2_scr,
    "\nMedian Absolute Score :  ", median_abs_score,
    "\nExplained Variance Score :  ", explained_variance,
)

In [None]:
# Display the metric rows accumulated so far (one list per evaluated model).
reg_metrics

### 5. SUPPORT VECTOR MACHINES: REGRESSION

### 5.1. Using 'linear' kernel

In [None]:
# C: regularization parameter --> the strength of the regularization is inversely proportional to C
# 'epsilon' --> It specifies the epsilon-tube within which no penalty is associated in the training loss function with points predicted within a distance epsilon from the actual value.

In [None]:
%%time

# Linear-kernel SVR experiment. Every statement below is commented out, so the
# cell does nothing when run and adds no row to reg_metrics.
# NOTE(review): the disabled line `y_pred1 = svr1.fit(x_eval1,y_eval1)` refits
# the model on the evaluation data and stores the estimator, not predictions;
# `svr1.predict(x_eval1)` was presumably intended. Fix before re-enabling.
#gsc1 = GridSearchCV(estimator=SVR(kernel='linear'),param_grid={'C':[0.1,1,10,100,1000],'epsilon':[10,5,1,0.1,0.5,0.01,0.001,0.0001,0.00001]},cv=5,scoring='neg_mean_squared_error',verbose=0,n_jobs=-1)      
#grid_result = gsc1.fit(x_train,y_train)
#best_params = grid_result.best_params_
#svr1 = SVR(kernel='linear',C=10,epsilon=0.01,cache_size=200,verbose=True,max_iter = -1)        
#svr1 = svr1.fit(x_train,y_train)
#y_pred1 = svr1.fit(x_eval1,y_eval1)
#mean_abs_error,mean_sq_error,root_mean_sq_error,r2_scr,median_abs_score,explained_variance = evaluate_metrics(x_eval1,y_eval1,y_pred1,svr1)
#scr = svr1.score(x_eval1,y_eval1)
#reg_metrics.append([mean_abs_error,scr,mean_sq_error,root_mean_sq_error,r2_scr,median_abs_score,explained_variance])
#print("METRICS  :\n\nMean Absolute Error :  ",mean_abs_error,"\nScore :   ",scr,"\nMean Squared Error :  ",mean_sq_error,"\nRoot Mean Squared Error :  ",root_mean_sq_error,"\nR2 Square :  ",r2_scr,"\nMedian Absolute Score :  ",median_abs_score,"\nExplained Variance Score :  ",explained_variance)   

### 5.2. Using 'polynomial' kernel

In [None]:
%%time

# Polynomial-kernel SVR experiment. Every statement below is commented out, so
# the cell does nothing when run and adds no row to reg_metrics.
# NOTE(review), to fix before re-enabling:
#   - svr1 is constructed but never fitted in this cell before predict is called;
#   - `svr1.predict(x_eval1,y_eval1)` passes the targets as a second argument;
#   - the reg_metrics.append row omits `scr`, so it has six values where the
#     active cells append seven.
#gsc1 = GridSearchCV(estimator=SVR(kernel='poly'),param_grid={'C':[0.1,1,10,100,1000],'epsilon':[10,5,1,0.1,0.5,0.01,0.01,0.005,0.001,0.0001,0.00001],'degree':[2,3,4,5],'coef0':[0.1,0.01,0.001,0.0001]},cv=5,scoring='neg_mean_squared_error',verbose=0,n_jobs=-1)      
#grid_result = gsc1.fit(x_train,y_train)
#best_params = grid_result.best_params_
#svr1 = SVR(kernel='poly',C=1,epsilon=0.01,coef0=0.001,degree = 2,shrinking = True, tol=0.001 ,cache_size=200,verbose=False,max_iter = -1)        
#y_pred1 = svr1.predict(x_eval1,y_eval1)
#mean_abs_error,mean_sq_error,root_mean_sq_error,r2_scr,median_abs_score,explained_variance = evaluate_metrics(x_eval1,y_eval1,y_pred1,svr1)
#scr = svr1.score(x_eval1,y_eval1)
#reg_metrics.append([mean_abs_error,mean_sq_error,root_mean_sq_error,r2_scr,median_abs_score,explained_variance])

#print("METRICS  :\n\nMean Absolute Error :  ",mean_abs_error,"\nScore :   ",scr,"\nMean Squared Error :  ",mean_sq_error,"\nRoot Mean Squared Error :  ",root_mean_sq_error,"\nR2 Square :  ",r2_scr,"\nMedian Absolute Score :  ",median_abs_score,"\nExplained Variance Score :  ",explained_variance)   

### 5.3. Using 'RBF' kernel

In [None]:
%%time

# RBF-kernel SVR experiment. Every statement below is commented out, so the
# cell does nothing when run and adds no row to reg_metrics.
# NOTE(review): the disabled reg_metrics.append row omits `scr`, so it has six
# values where the active cells append seven. Fix before re-enabling.
#gsc1 = GridSearchCV(estimator=SVR(kernel='rbf'),param_grid={'C':[0.1,1,10,100,1000],'epsilon':[10,5,1,0.1,0.5,0.01,0.01,0.005,0.001,0.0001,0.0005],'gamma':[5,3,1,0.1,0.01,0.05,0.001,0.005,0.0001]},cv=5,scoring='neg_mean_squared_error',verbose=0,n_jobs=-1)      
#grid_result = gsc1.fit(x_train,y_train)
#best_params = grid_result.best_params_
#svr1 = SVR(kernel='rbf',C=10,epsilon=0.01,gamma=0.01,coef0=0.1,shrinking = True, tol=0.001 ,cache_size=200,verbose=True,max_iter = -1)        
#svr1 = svr1.fit(x_train,y_train)
#y_pred1 = svr1.predict(x_eval1)
#mean_abs_error,mean_sq_error,root_mean_sq_error,r2_scr,median_abs_score,explained_variance = evaluate_metrics(x_eval1,y_eval1,y_pred1,svr1)
#scr = svr1.score(x_eval1,y_eval1)
#reg_metrics.append([mean_abs_error,mean_sq_error,root_mean_sq_error,r2_scr,median_abs_score,explained_variance])
#print("METRICS  :\n\nMean Absolute Error :  ",mean_abs_error,"\nScore :   ",scr,"\nMean Squared Error :  ",mean_sq_error,"\nRoot Mean Squared Error :  ",root_mean_sq_error,"\nR2 Square :  ",r2_scr,"\nMedian Absolute Score :  ",median_abs_score,"\nExplained Variance Score :  ",explained_variance)   

### 6. DECISION TREE REGRESSOR

In [None]:
# Decision tree with default hyper-parameters: train on the training split,
# evaluate on the first evaluation split and record the metrics row.
dtr = DecisionTreeRegressor()
dtr.fit(x_train, y_train)
y_pred1 = dtr.predict(x_eval1)
(
    mean_abs_error,
    mean_sq_error,
    root_mean_sq_error,
    r2_scr,
    median_abs_score,
    explained_variance,
) = evaluate_metrics(x_eval1, y_eval1, y_pred1, dtr)
scr = dtr.score(x_eval1, y_eval1)
reg_metrics.append([
    mean_abs_error,
    scr,
    mean_sq_error,
    root_mean_sq_error,
    r2_scr,
    median_abs_score,
    explained_variance,
])
print(
    "METRICS  :\n\nMean Absolute Error :  ", mean_abs_error,
    "\nScore :   ", scr,
    "\nMean Squared Error :  ", mean_sq_error,
    "\nRoot Mean Squared Error :  ", root_mean_sq_error,
    "\nR2 Square :  ", r2_scr,
    "\nMedian Absolute Score :  ", median_abs_score,
    "\nExplained Variance Score :  ", explained_variance,
)

### 7. XGB REGRESSOR

In [None]:
# Column names of x_train without 'cust_payment_terms' and 'business_code'.
# Built in x_train's own column order (de-duplicated): converting a set
# difference to a list would give an arbitrary, run-dependent order.
excluded_cols = {'cust_payment_terms', 'business_code'}
cols = [col for col in dict.fromkeys(x_train.columns) if col not in excluded_cols]

In [None]:
# XGBoost regressor with default hyper-parameters: train on the training
# split, evaluate on the first evaluation split and record the metrics row.
xgb_model = xgb.XGBRegressor()
xgb_model.fit(x_train,y_train)
y_pred1 = xgb_model.predict(x_eval1)
# Evaluate and score the XGBoost model itself. The decision tree `dtr` was used
# here before, which recorded the tree's score in the XGBoost row.
mean_abs_error,mean_sq_error,root_mean_sq_error,r2_scr,median_abs_score,explained_variance = evaluate_metrics(x_eval1,y_eval1,y_pred1,xgb_model)
scr = xgb_model.score(x_eval1,y_eval1)
reg_metrics.append([mean_abs_error,scr,mean_sq_error,root_mean_sq_error,r2_scr,median_abs_score,explained_variance])
print("METRICS  :\n\nMean Absolute Error :  ",mean_abs_error,"\nScore :   ",scr,"\nMean Squared Error :  ",mean_sq_error,"\nRoot Mean Squared Error :  ",root_mean_sq_error,"\nR2 Square :  ",r2_scr,"\nMedian Absolute Score :  ",median_abs_score,"\nExplained Variance Score :  ",explained_variance)

### Comparison between METRICS

In [None]:
# Row labels for the comparison table, in the order the models appended their
# rows to reg_metrics (the SVR cells are disabled and contribute no row).
regression_algo = [
    '1. LINEAR REGRESSION',
    '2. RIDGE REGRESSION',
    '3. LASSO REGRESSION',
    '4. RANDOM FOREST REGRESSION',
    '5. DECISION TREE REGRESSION',
    '6. XGB REGRESSION',  # was misspelled "XBF"
]

# Column labels, matching the order of the values in each reg_metrics row.
metric_names = [
    "MEAN ABSOLUTE ERROR",
    "SCORE",
    "MEAN SQUARED ERROR",
    "ROOT MEAN SQUARED ERROR",
    "R2 SCORE",
    "MEDIAN ABSOLUTE ERROR",
    "EXPLAINED VARIANCE SCORE",
]


In [None]:
# Comparison table: one row per algorithm label, one column per metric name.
comp_metrics = pd.DataFrame(
    data=reg_metrics,
    index=regression_algo,
    columns=metric_names,
)

In [None]:
# Display the metric comparison table.
comp_metrics

In [None]:
# Display x_test_info, the frame the report columns below are read from.
x_test_info

 ### COMPUTATION OF OUR ESTIMATED PAYMENT DATE ( We are using the '' model --- Better Accuracy as compared to other)

In [None]:
# Empty report frame; only the column layout is fixed at this point.
output_columns = [
    'INVOICE CREATE DATE',
    'AMOUNT(IN USD)',
    'INVOICE DUE DATE',
    'INVOICE PAYMENT TERM',
    'INVOICE PAYMENT DATE',
    'DELAY',
    'BUCKET ID',
]
output = pd.DataFrame(columns=output_columns)

In [None]:
# Copy the two date columns from x_test_info into the report frame.
for report_col, info_col in (
    ('INVOICE CREATE DATE', 'baseline_create_date'),
    ('INVOICE DUE DATE', 'due_in_date'),
):
    output[report_col] = x_test_info[info_col]

# The amount is exponentiated on the way in.
# NOTE(review): presumably this undoes an earlier log transform; confirm.
output['AMOUNT(IN USD)'] = x_test_info['total_open_amount_y'].apply(np.exp)

In [None]:
# Predict on the test features, map the predictions back through y_scaler and
# round them up to whole numbers.
# NOTE(review): dropna() may shorten x_test; confirm the prediction count
# still matches the rows of `output` before assigning it there.
x_test = x_test.dropna()
scaled_pred = np.array(model1.predict(x_test))
n_pred = scaled_pred.shape[0]

# y_scaler is given a single-column 2-D array; the result is flattened back to 1-D.
y_pred = y_scaler.inverse_transform(scaled_pred.reshape(n_pred, 1)).reshape(n_pred)
y_pred = np.ceil(y_pred)

In [None]:
# Store the predicted term, then add it (as whole days) to the create date to
# obtain the predicted payment date.
output['INVOICE PAYMENT TERM'] = y_pred
payment_offset = pd.to_timedelta(np.ceil(output['INVOICE PAYMENT TERM']), unit='D')
output['INVOICE PAYMENT DATE'] = output['INVOICE CREATE DATE'] + payment_offset

In [None]:
# Display the predicted payment term column.
output['INVOICE PAYMENT TERM'] 

In [None]:
# Delay in whole days = predicted payment date - due date.
# The numeric 'INVOICE PAYMENT TERM' was used as the left operand before; a
# number minus a date is not a timedelta, so `.dt.days` could not work.
# Assumes both date columns are datetime64 — TODO confirm.
output['DELAY'] = (output['INVOICE PAYMENT DATE'] - output['INVOICE DUE DATE']).dt.days

In [None]:
# Display num_cols, which drives the column selection in the next cell.
num_cols

In [None]:
# Select the num_cols columns from x_test, with 'cust_number' replaced by
# 'cust_number_x'. A list is used because pandas 2.x rejects a set as an
# indexer, and a set would also make the column order arbitrary.
# NOTE(review): the order follows num_cols — confirm it matches the order the
# scaler was fitted with.
num_feature_cols = list(dict.fromkeys(
    'cust_number_x' if col == 'cust_number' else col for col in num_cols
))
if 'cust_number_x' not in num_feature_cols:
    num_feature_cols.append('cust_number_x')
x_test_num = x_test[num_feature_cols]

In [None]:
# Show (rows, columns) of the selected frame.
x_test_num.shape

In [None]:
# Map the selected columns back through `scaler` (presumably a fitted
# scikit-learn scaler — confirm). The result is 2-D, which pd.Series cannot
# hold, so rebuild a DataFrame with the original row and column labels.
x_test_num = pd.DataFrame(
    scaler.inverse_transform(np.array(x_test_num)),
    columns=x_test_num.columns,
    index=x_test_num.index,
)

In [None]:
# NOTE(review): pandas provides `pd.to_datetime`; there is no `pd.Datetime`, so
# this line presumably raises AttributeError as written. Also confirm that
# x_test really has a 'metric_names' column — elsewhere in this notebook
# `metric_names` is a Python list of metric labels, not a column.
output['INVOICE CREATE DATE'] = pd.Datetime(x_test['metric_names'])