In [334]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

### Reading the data

In [306]:
# Location of the open-items CSV; change this to wherever your copy of the file lives.
path = "/kaggle/input/open-items/Dataset_Open_Items_EN.csv"
# The file is read with ';' as the field separator.
df = pd.read_csv(path, sep=";")

In [307]:
df.columns  # column labels of the loaded dataset

In [308]:
df.head(10)  # preview the first ten rows

In [309]:
df.shape  # (number of rows, number of columns)

In [310]:
df.info()  # column summary; as observed by the author, only one column (the target) is object, the others are float

### Total number of null values

In [311]:
df.isna().sum().sum()  # missing values summed over every column of the dataset

### Summary of whole dataset

In [312]:
# One row per column of df: its dtype, extreme values, missing-value count and
# the values found in the first two rows.
Summary = df.dtypes.to_frame(name="Dtype").assign(
    max=df.max(),
    min=df.min(),
    Null=df.isnull().sum(),  # number of missing values per column
    First=df.iloc[0],        # values of the first row
    Second=df.iloc[1],       # values of the second row
)
Summary

The summary shows that the features are neither normalized nor scaled: their maximum values vary widely, so the data needs to be scaled. The dataset has no null values.

### Feature Engineering

In [313]:
# Row-wise summary features (mean, median, std, skew), left disabled.
# Presumably switched off because the notes in the XGBoost cell report a worse
# validation log loss with feature engineering (0.596) than without it (0.5799).
#df["mean"] = df.mean(axis = 1)
#df["med"] = df.median(axis = 1)
#df["std"] = df.std(axis = 1)
#df["skew"] = df.skew(axis = 1)

### Creating the test data

In [314]:
# Positional hold-out split: the first N_TRAIN_ROWS rows stay in `df` for
# training/validation, every remaining row becomes the test set. Both slices use
# the same boundary so that no row falls between the two sets.
N_TRAIN_ROWS = 39000
test_df = df.iloc[N_TRAIN_ROWS:]
df = df.iloc[:N_TRAIN_ROWS]

### Getting the target data

In [315]:
# Pull the label column out of both frames first ...
y = df["Payment_delay"]
y_test = test_df["Payment_delay"]

# ... then keep only the feature columns.
df = df.drop(columns=["Payment_delay"])
test_df = test_df.drop(columns=["Payment_delay"])

Since the target data is categorical with 4 different categories in it, this is a classification problem, and we can use the F1 score, the confusion matrix or the log loss as metrics to see how well the model is doing on the data.
Since XGBoost performs well on most problems, we can start with that and then also use the combination of logistic regression and XGBoost to see how the result changes and whether there is an improvement.

In [316]:
# Bar chart of how many training rows fall into each payment-delay class.
bar_colors = ["pink", "green", "yellow", "red", "black"]
y.value_counts().plot.bar(color=bar_colors)

The plot shows that the target is not evenly distributed: most debtors pay on time, and it is very unlikely that a debtor takes more than 90 days to pay the bills.

### Getting the min and max value in whole dataset

In [317]:
# Smallest and largest value found anywhere in the feature frame, one per line.
print(df.min().min(), df.max().max(), sep="\n")

### Getting the correlation of the data

In [318]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise Pearson correlation between the feature columns. Strongly correlated
# features are redundant and are candidates for removal.
cor = df.corr(method="pearson")
# Optional heatmap of the correlation matrix (disabled):
#f, ax = plt.subplots(figsize=(20, 20))
#sns.heatmap(cor, vmax=.8, square=True, annot= True);

The correlation matrix shows whether the data columns are correlated. Highly correlated columns carry mostly redundant information, so some of them can be removed. Here, the features ['N_items_-reminded', 'number_receipts_PoL'] are highly correlated.

### Drop the highly correlated features

In [319]:
# Features reported as highly correlated (and therefore redundant) in this
# notebook's correlation analysis. The list is spelled out here so the cell also
# runs on a fresh kernel instead of relying on a leftover `to_drop` variable.
# NOTE(review): names taken from the notebook's narrative; confirm they match the
# selection that was originally intended.
to_drop = ['N_items_-reminded', 'number_receipts_PoL']
df = df.drop(columns=to_drop)
test_df = test_df.drop(columns=to_drop)

### Transforming the target data

In [320]:
print(y.unique())  # distinct target labels before they are encoded

In [321]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
# Encode the string class labels as integers. The encoder is fitted on the
# training labels only and then reused for the test labels, so both sets share one
# label -> integer mapping and `le` keeps describing the labels the models are
# trained on when it is used later for `inverse_transform`.
le = LabelEncoder()
y = le.fit_transform(y)
# transform (not fit_transform): refitting on the test labels would rebuild the
# mapping from whichever classes happen to occur in the test split.
y_test = le.transform(y_test)
print(y[0:100])  # the label '1-30' is encoded as 1
# le.classes_ holds the original labels, ordered by their encoded integer.

### Splitting the data in training and validation set

In [322]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Stratified 80/20 split of the training data into train and validation parts.
X_train, X_val, y_train, y_val = train_test_split(
    df,
    y,
    test_size=0.2,
    stratify=y,
    random_state=123,
)

In [323]:
# Number of rows in the training and validation splits, one per line.
print(len(X_train), len(X_val), sep="\n")

### Scaling the data

In [324]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply that same fitted
# transformation to the training, validation and test features.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
test_df = scaler.transform(test_df)

In [325]:
from sklearn.metrics import confusion_matrix, log_loss, f1_score

### Logistic Regression Model

In [326]:

# L2-regularised logistic regression as the baseline classifier.
lr = LogisticRegression(penalty='l2', C=0.01, random_state=123).fit(X_train, y_train)

# Validation log loss computed from the predicted class probabilities.
# Logged runs: 0.6949971643614415 --> correlated feature drop 0.6950228858317289,
# 0.693083752672263, 0.682 with less data
y_pred_l = lr.predict_proba(X_val)
print(log_loss(y_val, y_pred_l))

# Per-class F1 scores and the confusion matrix from the hard predictions.
y_pred = lr.predict(X_val)
f1score = f1_score(y_val, y_pred, average=None)
mat = confusion_matrix(y_val, y_pred)
print(f1score, mat)

### Random Forest Model

In [327]:

from sklearn.ensemble import RandomForestClassifier
# Random forest with 150 trees, max_features=7 and a fixed seed.
rc = RandomForestClassifier(n_estimators=150, max_features=7, random_state=0).fit(X_train, y_train)

# Validation log loss computed from the predicted class probabilities.
# Logged runs: 0.6846230417333001 --> correlated feature drop --> 0.6747265541757361,
# 0.62 with the removal of last 1000 data
y_pred_l = rc.predict_proba(X_val)
loss = log_loss(y_val, y_pred_l)
print(loss)

# Per-class F1 scores and the confusion matrix from the hard predictions.
y_pred = rc.predict(X_val)
f1score = f1_score(y_val, y_pred, average=None)
mat = confusion_matrix(y_val, y_pred)
print(f1score, mat)

### XgBoost Model

In [328]:
from xgboost import XGBClassifier
#rc = RandomForestClassifier(max_features= 7, random_state= 0, n_estimators = 150)
# Gradient-boosted trees: 150 boosting rounds, learning rate 0.1, fixed seed.
xg = XGBClassifier(learning_rate=0.1, n_estimators=150, random_state=0)
xg.fit(X_train, y_train)

# Validation log loss computed from the predicted class probabilities.
# Logged runs: 0.5901140046698274 --> correlated feature drop 0.5913514065110358
# --> pretty bad results with less learning rate, 0.596 with feature engineering,
# without feature engineering 0.5799, which is the best score so far
y_pred_l = xg.predict_proba(X_val)
loss = log_loss(y_val, y_pred_l)
print(loss)

# Per-class F1 scores and the confusion matrix from the hard predictions.
y_pred = xg.predict(X_val)
f1score = f1_score(y_val, y_pred, average=None)
mat = confusion_matrix(y_val, y_pred)
print(f1score, mat)

The smaller the log loss, the better the results, so this shows that XGBoost performed better than the other models, with a best validation log loss of 0.5799.

### Taking the best classifier to use it on test data

In [329]:
xg.classes_
# Class labels known to the fitted XGBoost classifier, in their integer-encoded form

In [330]:
cols = le.inverse_transform(xg.classes_)
cols
# Original label names for the classifier's encoded classes, recovered through the
# label encoder; used afterwards as column headers for the predicted probabilities

In [331]:
res = xg.predict_proba(test_df)
# Class-probability predictions for the scaled test data, using XGBoost, the
# classifier with the lowest validation log loss on this dataset

In [332]:
# Test-set class probabilities as a table with one column per original class label.
res_df = pd.DataFrame(
    data=xg.predict_proba(test_df),
    columns=cols,
)

In [333]:
# Boolean mask: True where a class's predicted probability is strictly above 0.5.
res_df = res_df.gt(0.5)
res_df