In [None]:
#IMPORT LIBRARY
import numpy as np 
import pandas as pd 
from datetime import datetime
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.tools as tls
from plotly.offline import iplot, init_notebook_mode
import seaborn as sns

from sklearn.preprocessing import StandardScaler
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import keras 
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

import os
# Print the full path of every file available under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# Configure seaborn styling, then read the accounts-receivable CSV into `data`.
sns.set_theme(color_codes=True, style="ticks")
data = pd.read_csv(
    '/kaggle/input/finance-factoring-ibm-late-payment-histories/WA_Fn-UseC_-Accounts-Receivable.csv'
)

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
data.info()

In [None]:
# Peek at the first three rows of the raw frame.
data.head(3)

In [None]:
# Summary statistics of the numeric columns.
data.describe()

In [None]:
# String to Date: parse each date column into pandas datetime values.
for date_col in ('InvoiceDate', 'DueDate', 'PaperlessDate'):
    data[date_col] = pd.to_datetime(data[date_col])

In [None]:
# Payment term of each invoice (due date minus invoice date).
# The printed counts show how many invoices have each term; the author
# notes the standard term is 30 days.
Vade = data['DueDate'].sub(data['InvoiceDate'])
print(Vade.value_counts())

In [None]:
# Separation and Normalization: split each date into month and day-of-month
# features, scaled by their maximum possible value (12 and 31).
for date_col in ('PaperlessDate', 'InvoiceDate'):
    date_parts = data[date_col].dt
    data[date_col + 'Month'] = date_parts.month / 12
    data[date_col + 'Day'] = date_parts.day / 31

In [None]:
def numeric_to_categoric(x):
    """Map a numeric invoice amount to the label of its 25-unit band.

    Bands are closed on the right: the first band whose upper bound is
    greater than or equal to ``x`` wins. Anything that matches no band
    (values above 100, and values that fail every comparison such as NaN)
    is labelled "Greater 100".

    NOTE: this name is redefined by later cells of this file.
    """
    bands = (
        (25, "Less and Equal 25"),
        (50, "Between (25-50]"),
        (75, "Between (50-75]"),
        (100, "Between (75-100]"),
    )
    for upper_bound, label in bands:
        if x <= upper_bound:
            return label
    return "Greater 100"
# Replace the numeric invoice amount with its band label, element by element.
data['InvoiceAmount'] = data['InvoiceAmount'].map(numeric_to_categoric)

In [None]:
# Horizontal bar chart of how many invoices fall in each amount band.
data['InvoiceAmount'].value_counts().plot.barh(title='InvoiceAmount')

In [None]:
# Keep only the invoices whose DaysLate value is not zero.
data_1 = data.loc[data['DaysLate'].ne(0)]

In [None]:
# Box plot of DaysLate per invoice-amount band, split by dispute status.
sns.catplot(
    data=data_1,
    x="DaysLate",
    y="InvoiceAmount",
    hue="Disputed",
    kind="box",
    dodge=True,
)

In [None]:
def numeric_to_categoric(x):
    """Map a numeric days-late value to the label of its 2-day band.

    Exactly zero gets its own "0 days" label. Otherwise the first band
    whose upper bound is greater than or equal to ``x`` wins. Anything that
    matches no band (values above 10, and values that fail every comparison
    such as NaN) is labelled "(10-) days".

    NOTE: this name is also defined by other cells of this file; the most
    recently executed definition is the one in effect.
    """
    if x == 0:
        return "0 days"
    bands = (
        (2, "(0-2] days"),
        (4, "(2-4] days"),
        (6, "(4-6] days"),
        (8, "(6-8] days"),
        (10, "(8-10] days"),
    )
    for upper_bound, label in bands:
        if x <= upper_bound:
            return label
    return "(10-) days"
# Replace the numeric DaysLate value with its band label, element by element.
data['DaysLate'] = data['DaysLate'].map(numeric_to_categoric)

In [None]:
# Horizontal bar chart of how many invoices fall in each days-late band.
data['DaysLate'].value_counts().plot.barh(title='DaysLate')

In [None]:
# 2x2 grid of DaysLate count plots, one panel per (column, value) subset.
fig, ax = plt.subplots(2, 2, figsize=(25,15))

panels = [
    (ax[0][0], 'PaperlessBill', 'Paper'),
    (ax[0][1], 'PaperlessBill', 'Electronic'),
    (ax[1][0], 'Disputed', 'Yes'),
    (ax[1][1], 'Disputed', 'No'),
]
for panel_ax, column, value in panels:
    subset = data.loc[data[column] == value]
    sns.countplot(x="DaysLate", ax=panel_ax, data=subset)
    panel_ax.set_title(f'{column} is {value}', fontsize=14)

plt.show()

In [None]:
#Binary Encoding
ce_binary = ce.BinaryEncoder(cols=['countryCode'])
binary_data = ce_binary.fit_transform(data)

In [None]:
# One-hot encode the remaining categorical columns; drop_first omits the
# first level of each column from the generated dummies.
categorical_cols = ['Disputed', 'PaperlessBill', 'InvoiceAmount']
dummy_data = pd.get_dummies(binary_data, columns=categorical_cols, drop_first=True)

In [None]:
# Value distribution of the countryCode_0 bit column from the binary encoder.
# NOTE(review): "can be deleted" presumably refers to this column carrying no
# information; it is dropped in the following step. Confirm against the output.
dummy_data['countryCode_0'].value_counts()

In [None]:
# Drop Columns: remove the columns that are not used as model inputs.
unused_columns = [
    'SettledDate',
    'DueDate',
    'InvoiceDate',
    'DaysToSettle',
    'invoiceNumber',
    'PaperlessDate',
    'countryCode_0',
    'customerID',
]
data = dummy_data.drop(columns=unused_columns)

In [None]:
# Train Test Split: DaysLate is the target; every remaining column is a feature.
y = data[['DaysLate']]
data = data.drop(columns=['DaysLate'])
X_train, X_test, y_train, y_test = train_test_split(
    data, y, random_state=42, test_size=0.33
)

In [None]:
# Class distribution of the training target.
y_train['DaysLate'].value_counts().plot.barh(title='DaysLate')

In [None]:
# Class distribution of the test target.
y_test['DaysLate'].value_counts().plot.barh(title='DaysLate')

In [None]:
# Peek at the first three rows of the training feature matrix.
X_train.head(3)

In [None]:
# Logistic Regression Model: fit on the training split, score on the test split.
train_labels = y_train.to_numpy().ravel()  # flatten the single-column target to 1-D
logr = LogisticRegression(random_state=0).fit(X_train, train_labels)

y_pred = logr.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print('Accuracy:', acc)

# Confusion matrix on the test split.
plot_confusion_matrix(logr, X_test, y_test, xticks_rotation='vertical')
plt.show()

In [None]:
# KNN Model: single nearest neighbour with the Minkowski metric.
knn = KNeighborsClassifier(metric='minkowski', n_neighbors=1)
train_labels = y_train.to_numpy().ravel()  # flatten the single-column target to 1-D
knn.fit(X_train, train_labels)

y_pred = knn.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print('Accuracy:', acc)

# Confusion matrix on the test split.
plot_confusion_matrix(knn, X_test, y_test, xticks_rotation='vertical')
plt.show()

In [None]:
# SVC Model: support vector classifier with a polynomial kernel.
train_labels = y_train.to_numpy().ravel()  # flatten the single-column target to 1-D
svc = SVC(kernel='poly').fit(X_train, train_labels)

y_pred = svc.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print('Accuracy:', acc)

# Confusion matrix on the test split.
plot_confusion_matrix(svc, X_test, y_test, xticks_rotation='vertical')
plt.show()

In [None]:
# Naive Bayes Model: Gaussian naive Bayes with default settings.
train_labels = y_train.to_numpy().ravel()  # flatten the single-column target to 1-D
gnb = GaussianNB().fit(X_train, train_labels)

y_pred = gnb.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print('Accuracy:', acc)

# Confusion matrix on the test split.
plot_confusion_matrix(gnb, X_test, y_test, xticks_rotation='vertical')
plt.show()

In [None]:
# Decision Tree Model: entropy-based splits.
# random_state is fixed so the tree (and therefore the reported accuracy and
# confusion matrix) is the same on every run; without it the estimator's
# internal randomness can produce different trees on each execution.
dtc = DecisionTreeClassifier(criterion = 'entropy', random_state=0)
dtc.fit(X_train,y_train.values.ravel())

y_pred = dtc.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print('Accuracy:',acc)

# Confusion matrix on the test split.
plot_confusion_matrix(dtc, X_test, y_test,xticks_rotation='vertical')
plt.show()

In [None]:
# Random Forest Model: 10 entropy-based trees.
# random_state is fixed so bootstrap sampling and feature selection are
# repeatable; without it the accuracy and confusion matrix change between runs.
rfc = RandomForestClassifier(n_estimators=10, criterion = 'entropy', random_state=0)
rfc.fit(X_train,y_train.values.ravel())

y_pred = rfc.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print('Accuracy:',acc)

# Confusion matrix on the test split.
plot_confusion_matrix(rfc, X_test, y_test,xticks_rotation='vertical')
plt.show()

In [None]:
def numeric_to_categoric(x):
    """Map a days-late band label to its ordinal class code.

    Despite the name, this goes from the categorical label to an integer:
    the labels below map to 0..5 in order, and any other value maps to 6.

    NOTE: this name is also defined by earlier cells of this file; this
    definition replaces them once executed.
    """
    ordered_labels = (
        "0 days",
        "(0-2] days",
        "(2-4] days",
        "(4-6] days",
        "(6-8] days",
        "(8-10] days",
    )
    for code, label in enumerate(ordered_labels):
        if x == label:
            return code
    return len(ordered_labels)
def _one_hot_days_late(target):
    """Return `target` with DaysLate replaced by seven one-hot columns.

    The DaysLate labels are first converted to ordinal codes with
    numeric_to_categoric, then declared as a categorical with all seven codes
    (0..6). pd.get_dummies emits one column per declared category, so the
    result always has DaysLate_0 .. DaysLate_6 in that order, even when a
    class does not occur in `target`. Encoding the train and test splits
    independently without this can yield different column sets.
    """
    codes = target['DaysLate'].apply(numeric_to_categoric)
    encoded = target.assign(DaysLate=pd.Categorical(codes, categories=list(range(7))))
    return pd.get_dummies(encoded, columns=['DaysLate'])


# assign() builds a new frame instead of writing into the split outputs.
y_train = _one_hot_days_late(y_train)
y_test = _one_hot_days_late(y_test)

In [None]:
# Feed-forward network for the one-hot encoded DaysLate target.

# Keras needs homogeneous numeric arrays; the encoded frames can mix float,
# integer and boolean columns, so convert everything to float32 up front.
X_train_nn = X_train.to_numpy(dtype='float32')
X_test_nn = X_test.to_numpy(dtype='float32')
y_train_nn = y_train.to_numpy(dtype='float32')
# Align the test target to the training columns so both share one layout.
y_test_nn = y_test.reindex(columns=y_train.columns, fill_value=0).to_numpy(dtype='float32')

# Derive layer sizes from the data instead of hard-coding them.
n_features = X_train_nn.shape[1]
n_classes = y_train_nn.shape[1]

classifier = Sequential()

classifier.add(Dense(64,kernel_initializer = 'uniform',activation='relu',input_shape = (n_features,)))

classifier.add(Dropout(0.4))

classifier.add(Dense(32,kernel_initializer = 'uniform',activation='relu'))

classifier.add(Dense(16,kernel_initializer = 'uniform',activation='relu'))

# Softmax gives one probability distribution over the mutually exclusive
# classes, which is what categorical_crossentropy expects.
classifier.add(Dense(n_classes,kernel_initializer = 'uniform',activation='softmax'))

classifier.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# validation_data is only monitored (it does not influence training); it adds
# val_loss / val_accuracy to the history so train and test curves can be compared.
history = classifier.fit(X_train_nn,y_train_nn,epochs=100,validation_data=(X_test_nn,y_test_nn))

y_prob = classifier.predict(X_test_nn)

# One-hot boolean prediction: True at the most probable class of each row.
# (A fixed 0.5 threshold can leave a row with no predicted class at all.)
y_pred = (y_prob == y_prob.max(axis=1, keepdims=True))

# Report test accuracy the same way as the other models.
acc = (y_prob.argmax(axis=1) == y_test_nn.argmax(axis=1)).mean()
print('Accuracy:',acc)


In [None]:
# Training curves, one figure per metric. The 'test' curve is drawn only when
# the history contains validation values, so the legend always matches the
# lines that are actually plotted.
for metric in ('accuracy', 'loss'):
    fig, metric_ax = plt.subplots()
    metric_ax.plot(history.history[metric], label='train')
    val_key = 'val_' + metric
    if val_key in history.history:
        metric_ax.plot(history.history[val_key], label='test')
    metric_ax.set_title('model ' + metric)
    metric_ax.set_ylabel(metric)
    metric_ax.set_xlabel('epoch')
    metric_ax.legend(loc='upper left')
    plt.show()