In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
import random
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

# Read the competition train/test tables from the Kaggle input directory.
test = pd.read_csv('../input/tabular-playground-series-aug-2021/test.csv')
train = pd.read_csv('../input/tabular-playground-series-aug-2021/train.csv')

# pop() returns the "id" column and removes it from `train` in one step, so the
# identifier is kept aside in `train_id` and no longer appears among the columns.
train_id = train.pop("id")

# Show the first row as a quick check of the remaining columns.
train.head(1)

Start with a random forest as the baseline model.
The full training set is large and takes too long to fit, so the experiments below are run on random sub-samples of the training data.
(Fitting on all rows is slow here, so I extract a subset of the data and proceed with that.)

In [None]:
#RandomForestRegressor
# Baseline: fit a random forest on three random sub-samples (1000 / 2000 / 3000
# rows) of `train` and compare train vs. hold-out error for each.

# Seed the sampler so the drawn rows, and therefore the scores and plots, are
# reproducible across runs (the LinearRegression and KNeighborsRegressor cells
# seed the same way).
random.seed(0)

# random.randrange(n) draws from 0..n-1.  random.randint(0, n) also returns n,
# which is one past the last valid position for .iloc and raises IndexError
# whenever it is drawn.
sample_1 = [random.randrange(len(train)) for i in range(1000)]
sample_2 = [random.randrange(len(train)) for i in range(2000)]
sample_3 = [random.randrange(len(train)) for i in range(3000)]

sample = [sample_1, sample_2, sample_3]

fig = plt.figure(figsize = [14,6])
for i in range(len(sample)):
    # Every column but the last is used as a feature; the last column is the target.
    X, y = train.iloc[sample[i],:-1], train.iloc[sample[i],-1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

    forest = RandomForestRegressor(random_state = 1, n_jobs = -1)
    forest.fit(X_train, y_train)
    y_train_pred = forest.predict(X_train)
    y_test_pred = forest.predict(X_test)

    print("rmse train:{:.2f} / test:{:.2f}".format(np.sqrt(mean_squared_error(y_train, y_train_pred)), np.sqrt(mean_squared_error(y_test, y_test_pred))))
    print("r2_score train:{:.2f} / test:{:.2f}".format(r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))

    # One panel per sample size: actual target (x axis) vs. prediction (y axis).
    fig.add_subplot(1, 3, i+1)
    plt.scatter(y_train, y_train_pred, label = "train")
    plt.scatter(y_test, y_test_pred, label = "test")
    plt.xlabel("raw data")
    plt.ylabel("predict data")
    plt.xlim(0,40)
    plt.ylim(0,40)
    plt.grid()
    # The scatter labels only become visible once a legend is drawn.
    plt.legend()

Large loss values are not predicted at all.
(Is the score the same as what you would get by predicting the average value for every row?)

Submission score when training on all data: 8.02

Since the underlying task is predicting loan defaults, I would also like to predict the cases with large losses.
Let's try other algorithms.


In [None]:
#LinearRegression
# Fit a standardised linear regression on three random sub-samples (1000 / 2000 /
# 3000 rows) of `train` and compare train vs. hold-out error for each.
random.seed(0)

# random.randrange(n) draws from 0..n-1.  random.randint(0, n) also returns n,
# which is one past the last valid position for .iloc and raises IndexError
# whenever it is drawn.
sample_1 = [random.randrange(len(train)) for i in range(1000)]
sample_2 = [random.randrange(len(train)) for i in range(2000)]
sample_3 = [random.randrange(len(train)) for i in range(3000)]

sample = [sample_1, sample_2, sample_3]

# The same pipeline object is re-fitted on every sample; fit() re-estimates both
# the scaler and the regression each time.
pipe_lr = make_pipeline(StandardScaler(),
                        LinearRegression())

fig = plt.figure(figsize = [14,6])
for i in range(len(sample)):
    # Every column but the last is used as a feature; the last column is the target.
    X, y = train.iloc[sample[i],:-1], train.iloc[sample[i],-1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

    pipe_lr.fit(X_train, y_train)
    y_train_pred = pipe_lr.predict(X_train)
    y_test_pred = pipe_lr.predict(X_test)

    print("rmse train:{:.2f} / test:{:.2f}".format(np.sqrt(mean_squared_error(y_train, y_train_pred)), np.sqrt(mean_squared_error(y_test, y_test_pred))))
    print("r2_score train:{:.2f} / test:{:.2f}".format(r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))

    # One panel per sample size: actual target (x axis) vs. prediction (y axis).
    fig.add_subplot(1, 3, i+1)
    plt.scatter(y_train, y_train_pred, label = "train")
    plt.scatter(y_test, y_test_pred, label = "test")
    plt.xlabel("raw data")
    plt.ylabel("predict data")
    plt.xlim(0,40)
    plt.ylim(0,40)
    plt.grid()
    # The scatter labels only become visible once a legend is drawn.
    plt.legend()

The trend is the same as with RandomForestRegressor: large loss values cannot be predicted.

Submission score when training on all data: 7.93

In [None]:
#KneighborsRegressor
# Fit a standardised k-nearest-neighbours regressor for several n_neighbors
# values on three random sub-samples (1000 / 2000 / 3000 rows) of `train`.
from sklearn.neighbors import KNeighborsRegressor

random.seed(0)

# random.randrange(n) draws from 0..n-1.  random.randint(0, n) also returns n,
# which is one past the last valid position for .iloc and raises IndexError
# whenever it is drawn.
sample_1 = [random.randrange(len(train)) for i in range(1000)]
sample_2 = [random.randrange(len(train)) for i in range(2000)]
sample_3 = [random.randrange(len(train)) for i in range(3000)]

sample = [sample_1, sample_2, sample_3]

n = [3,7,20, 50]
k = 0
# One shared figure: a row per n_neighbors value, a column per sample size.
# Creating a fresh figure inside the outer loop while `k` keeps counting left
# every figure with a single populated row in an otherwise empty grid.
fig = plt.figure(figsize = [24,18])
for j in range(len(n)):
    for i in range(len(sample)):
        # Every column but the last is used as a feature; the last column is the target.
        X, y = train.iloc[sample[i],:-1], train.iloc[sample[i],-1]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
        pipe_lr = make_pipeline(StandardScaler(),
                                KNeighborsRegressor(n_neighbors = n[j]))
        pipe_lr.fit(X_train, y_train)
        y_train_pred = pipe_lr.predict(X_train)
        y_test_pred = pipe_lr.predict(X_test)
        
        print("neighbor:{:.2f} ".format(n[j]))
        print("rmse train:{:.2f} / test:{:.2f}".format(np.sqrt(mean_squared_error(y_train, y_train_pred)), np.sqrt(mean_squared_error(y_test, y_test_pred))))
        print("r2_score train:{:.2f} / test:{:.2f}".format(r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))
        print("-"*50)

        # `k` walks the grid row by row, so panel k+1 is (row j, column i).
        fig.add_subplot(len(n), len(sample), k+1)
        plt.scatter(y_train, y_train_pred, label = "train")
        plt.scatter(y_test, y_test_pred, label = "test")
        plt.xlabel("raw data")
        plt.ylabel("predict data")
        plt.xlim(0,40)
        plt.ylim(0,40)
        plt.grid()
        # Identify each panel, since all settings now share one figure.
        plt.title("n_neighbors={} / sample size={}".format(n[j], len(sample[i])))
        # The scatter labels only become visible once a legend is drawn.
        plt.legend()
        k+=1

The trend is the same as with RandomForestRegressor: large loss values cannot be predicted.

Submission score when training on all data: (not recorded)


In [None]:
# Distribution of the "loss" column: density-normalised histogram with a KDE
# overlay.  histplot(kde=True, stat="density") is the replacement seaborn
# documents for the deprecated distplot (requires seaborn >= 0.11).
sns.histplot(train["loss"], kde=True, stat="density");

Are high losses unpredictable because the training data is imbalanced? Try removing half of the rows whose loss is 0.
For now, test this idea with RandomForestRegressor.

In [None]:
#RandomForestRegressor delete the value of 0 loss at 1/2
# Split the rows by whether "loss" is exactly 0.
tmp_0 = train[train["loss"] ==0]
tmp_1 = train[train["loss"] !=0]

# Keep only the first half (in row order) of the zero-loss rows.
n_len = len(tmp_0) // 2
tmp_0 = tmp_0.iloc[0:n_len]

train_del = pd.concat([tmp_0, tmp_1], axis = 0)

#RandomForestRegressor
# Seed the sampler so the drawn rows, and therefore the scores and plots, are
# reproducible across runs.
random.seed(0)

# random.randrange(n) draws from 0..n-1.  random.randint(0, n) also returns n,
# which is one past the last valid position for .iloc and raises IndexError
# whenever it is drawn.
sample_1 = [random.randrange(len(train_del)) for i in range(1000)]
sample_2 = [random.randrange(len(train_del)) for i in range(2000)]
sample_3 = [random.randrange(len(train_del)) for i in range(3000)]

sample = [sample_1, sample_2, sample_3]

fig = plt.figure(figsize = [14,6])
for i in range(len(sample)):
    # Every column but the last is used as a feature; the last column is the target.
    X, y = train_del.iloc[sample[i],:-1], train_del.iloc[sample[i],-1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

    forest = RandomForestRegressor(random_state = 1, n_jobs = -1)
    forest.fit(X_train, y_train)
    y_train_pred = forest.predict(X_train)
    y_test_pred = forest.predict(X_test)

    print("rmse train:{:.2f} / test:{:.2f}".format(np.sqrt(mean_squared_error(y_train, y_train_pred)), np.sqrt(mean_squared_error(y_test, y_test_pred))))
    print("r2_score train:{:.2f} / test:{:.2f}".format(r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))

    # One panel per sample size: actual target (x axis) vs. prediction (y axis).
    fig.add_subplot(1, 3, i+1)
    plt.scatter(y_train, y_train_pred, label = "train")
    plt.scatter(y_test, y_test_pred, label = "test")
    plt.xlabel("raw data")
    plt.ylabel("predict data")
    plt.xlim(0,40)
    plt.ylim(0,40)
    plt.grid()
    # The scatter labels only become visible once a legend is drawn.
    plt.legend()

I removed half of the rows with a loss of 0, but it did not help.

Submission score when training on all data: 8.11

In [None]:
#RandomForestRegressor keep only the rows with loss >= 10 (rows with loss < 10 are dropped)
tmp_10 = train[train["loss"] >=10]

train_del = tmp_10

#RandomForestRegressor
random.seed(0)

# random.randrange(n) draws from 0..n-1.  random.randint(0, n) also returns n,
# which is one past the last valid position for .iloc and raises IndexError
# whenever it is drawn.
sample_1 = [random.randrange(len(train_del)) for i in range(1000)]
sample_2 = [random.randrange(len(train_del)) for i in range(2000)]
sample_3 = [random.randrange(len(train_del)) for i in range(3000)]

sample = [sample_1, sample_2, sample_3]

fig = plt.figure(figsize = [14,6])
for i in range(len(sample)):
    # Every column but the last is used as a feature; the last column is the target.
    X, y = train_del.iloc[sample[i],:-1], train_del.iloc[sample[i],-1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

    forest = RandomForestRegressor(random_state = 1, n_jobs = -1)
    forest.fit(X_train, y_train)
    y_train_pred = forest.predict(X_train)
    y_test_pred = forest.predict(X_test)

    print("rmse train:{:.2f} / test:{:.2f}".format(np.sqrt(mean_squared_error(y_train, y_train_pred)), np.sqrt(mean_squared_error(y_test, y_test_pred))))
    print("r2_score train:{:.2f} / test:{:.2f}".format(r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))

    # One panel per sample size: actual target (x axis) vs. prediction (y axis).
    fig.add_subplot(1, 3, i+1)
    plt.scatter(y_train, y_train_pred, label = "train")
    plt.scatter(y_test, y_test_pred, label = "test")
    plt.xlabel("raw data")
    plt.ylabel("predict data")
    plt.xlim(0,40)
    plt.ylim(0,40)
    plt.grid()
    # The scatter labels only become visible once a legend is drawn.
    plt.legend()

Since rows with a loss below 10 have been removed, there are naturally no predicted values below 10.
Submission score when training on all data: 13.1
Interestingly, small loss values could be predicted before the data was removed. As before, large loss values cannot be predicted.

Looking at the leaderboard, the best score is about 7.8. Should I focus on predicting the smaller losses accurately instead?