Problem Statement

The dataset used for this competition is synthetic, but it is based on a real dataset and was generated using a CTGAN. The original dataset deals with calculating the loss associated with loan defaults. Although the features are anonymized, they have properties relating to real-world features.

Import

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:

import os
# Print the full path of every file found under /kaggle/input.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

Read

In [None]:
# Load the training CSV; the trailing bare name displays the frame.
train_csv = "/kaggle/input/tabular-playground-series-aug-2021/train.csv"
train = pd.read_csv(train_csv)
train

In [None]:
# Load the test CSV; the trailing bare name displays the frame.
test_csv = "/kaggle/input/tabular-playground-series-aug-2021/test.csv"
test = pd.read_csv(test_csv)
test

In [None]:
# Load the sample submission file; it is filled in with predictions later on.
submission_csv = "/kaggle/input/tabular-playground-series-aug-2021/sample_submission.csv"
submission = pd.read_csv(submission_csv)
submission

Analyse target

In [None]:
# Distribution plot of the 'loss' column; the trailing semicolon hides the repr.
sns.displot(train.loss);

In [None]:
# Summary statistics of the 'loss' column.
train.loss.describe()

In [None]:
# Separate the label from the features: pop returns the 'loss' Series and
# removes that column from `train` in place.
target = train.pop('loss')
train

Combine

In [None]:
# Stack the train rows on top of the test rows so later preprocessing is
# applied to both at once.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat produces the same row-wise result and, like append, keeps the
# original index labels of each part.
combi = pd.concat([train, test])
combi

In [None]:
# Remove the 'id' column from the combined frame.
combi = combi.drop(columns=['id'])
combi

Check for null values

In [None]:
# Total number of missing values over all columns of the combined frame.
combi.isna().sum().sum()

Normalise

In [None]:
# Min-max scale each column using its minimum and maximum over the combined
# (train + test) frame.
col_min = combi.min()
col_span = combi.max() - col_min
combi = (combi - col_min) / col_span
combi.shape

Define X and y

In [None]:
# Number of training rows. `combi` was built with the train rows first, so
# this is also the position at which the test rows begin.
length = len(train)

# Series.ravel() is deprecated since pandas 2.2; to_numpy() returns the same
# 1-D array of target values.
y = target.to_numpy()
# .iloc makes the split explicitly positional: the combined frame keeps the
# index labels of both parts, so labels are not unique.
X = combi.iloc[:length]
X_test = combi.iloc[length:]
y.shape, X.shape, X_test.shape

Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled rows for validation (shuffled, fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    shuffle=True,
    random_state=1,
)
X_train.shape, X_val.shape, y_train.shape, y_val.shape, X_test.shape

CCA

In [None]:
from sklearn.cross_decomposition import CCA

# CCA cannot extract more components than
# min(n_samples, n_features, n_targets). Recent scikit-learn releases raise
# a ValueError when n_components exceeds that bound (older ones only warned
# and clamped it). A 1-D y_train counts as a single target, so the requested
# 10 components are capped to what the data supports.
n_targets = 1 if y_train.ndim == 1 else y_train.shape[1]
n_components = min(10, X_train.shape[0], X_train.shape[1], n_targets)

cca = CCA(n_components=n_components, max_iter=20000).fit(X_train, y_train)

# Project all three splits with the transform fitted on the training split.
# NOTE: the X_* names are rebound here, so re-running this cell without first
# re-running the split cell would fit CCA on already-transformed data.
X_train = cca.transform(X_train)
X_val = cca.transform(X_val)
X_test = cca.transform(X_test)

X_train.shape, X_val.shape, X_test.shape

Select Model

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split, then report its score on
# that same split together with the fitted coefficients and intercept.
model = LinearRegression()
model.fit(X_train, y_train)

train_score = model.score(X_train, y_train)
print("Score: ", train_score)
print("")
print("Coefgficient: ", model.coef_)
print("")
print("Intercept: ", model.intercept_)

Predict on validation set

In [None]:
# Predict the held-out validation rows and print the model's score on them,
# followed by the fitted coefficients and intercept.
y_pred = model.predict(X_val)
val_score = model.score(X_val, y_val)
print(val_score)
print("")
print("Coefgficient: ", model.coef_)
print("")
print("Intercept: ", model.intercept_)

Evaluate

In [None]:
from sklearn import metrics

# Validation error metrics; the MSE is computed once and reused for the RMSE.
mae = metrics.mean_absolute_error(y_val, y_pred)
mse = metrics.mean_squared_error(y_val, y_pred)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

Compare

In [None]:
# Side-by-side table of validation targets and the model's predictions.
compare = pd.DataFrame(dict(actual=y_val, predicted=y_pred))
print(compare)

Graph

In [None]:
# Scatter of predictions against actual validation values on log-log axes,
# with a diagonal reference line spanning the overall min and max.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(y_val, y_pred, c='crimson')
ax.set_yscale('log')
ax.set_xscale('log')

upper = max(max(y_pred), max(y_val))
lower = min(min(y_pred), min(y_val))
diagonal = [upper, lower]
ax.plot(diagonal, diagonal, 'b-')
ax.set_xlabel('Actual Values', fontsize=15)
ax.set_ylabel('Predictions', fontsize=15)
ax.axis('equal')
plt.show()


Predict on test set

In [None]:
# Predict the test rows and floor negative predictions at zero.
prediction = np.clip(model.predict(X_test), 0, None)
prediction.shape

Prepare submission

In [None]:
# Overwrite the 'loss' column of the sample submission with the predictions.
submission['loss'] = prediction
submission

In [None]:
# Write the submission without the index column, then read the file back
# and display it to check what was saved.
out_path = 'submission.csv'
submission.to_csv(out_path, index=False)
submission = pd.read_csv(out_path)
submission
