In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import *
from sklearn.neighbors import *
from sklearn.cluster import *
from sklearn.linear_model import *
from sklearn.preprocessing import *
from sklearn.pipeline import make_pipeline

import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
# Global configuration: RNG seed and the name of the target column.
seed = 999
label = 'Pawpularity'

# Seed NumPy's legacy global random generator.
np.random.seed(seed)

In [None]:
# Load the feature table from the Kaggle input directory.
df = pd.read_csv(
    '../input/swin-transformer-features-for-pawpularity-contest/swin_vectors.csv'
)

### Data Splitting

In [None]:
# Hold out 25% of the rows; shuffle=False keeps the file order
# (random_state is passed but has no effect without shuffling).
df_train, df_test = train_test_split(
    df, test_size=0.25, random_state=seed, shuffle=False
)

# Features: every column except the identifier and the target.
x_train = df_train.drop(columns=['Id', label])
x_test = df_test.drop(columns=['Id', label])

# Targets on their original scale.
y_train = df_train[label]
y_test = df_test[label]

# Ridge

In [None]:
# Regularisation strength passed to Ridge(alpha=...) in the CV cell.
alpha = 1.0

In [None]:
def rmse(y_true,y_pred):
    """Root-mean-squared error after scaling both inputs by 100.

    Both arguments are multiplied by 100 before differencing, so inputs
    given on a 0-1 scale yield an error on a 0-100 scale.
    """
    residual = y_true*100 - y_pred*100
    mean_squared = np.mean(np.square(residual))
    return np.sqrt(mean_squared)

In [None]:
# 5-fold stratified cross-validation of a StandardScaler + Ridge pipeline.
# Targets are divided by 100 for fitting; rmse() scales back to 0-100.
models=[]

scores_train=[]
scores_val=[]

# shuffle=True is needed for random_state to have any effect; recent
# scikit-learn versions raise ValueError when random_state is set while
# shuffle is left at its default of False.
skf=StratifiedKFold(n_splits=5,shuffle=True,random_state=seed)
for train_idx,val_idx in skf.split(x_train,y_train):
    # split() yields positional indices, so rows must be selected with
    # .iloc; .loc only matches while the index is a 0-based RangeIndex.
    x_fit,y_fit=x_train.iloc[train_idx],y_train.iloc[train_idx]/100.
    x_val,y_val=x_train.iloc[val_idx],y_train.iloc[val_idx]/100.

    lr=make_pipeline(StandardScaler(),Ridge(alpha=alpha))
    lr.fit(x_fit,y_fit)

    # Keep every fold model; they are averaged as an ensemble later.
    models.append(lr)

    scores_train.append(rmse(y_fit,lr.predict(x_fit)))
    scores_val.append(rmse(y_val,lr.predict(x_val)))
    

In [None]:
# Report mean and standard deviation of the per-fold RMSE values.
print(
    f'train score : {np.mean(scores_train)} +-{np.std(scores_train)}'
)
print(
    f'val score : {np.mean(scores_val)} +-{np.std(scores_val)}'
)

In [None]:

# Ensemble: average the predictions of all fold models, element-wise.
preds_train = np.mean(
    [fold_model.predict(x_train) for fold_model in models], axis=0
)
preds_test = np.mean(
    [fold_model.predict(x_test) for fold_model in models], axis=0
)

In [None]:
# Score the averaged predictions; targets are rescaled to match the
# 0-1 scale the models were fitted on.
score_train = rmse(y_train / 100., preds_train)
score_test = rmse(y_test / 100., preds_test)

print(f'train score:{score_train}')
print(f'test score:{score_test}')

In [None]:
def plot_hist(preds_train,preds_test,score_train,score_test):
    """Overlay ground-truth and predicted label histograms, train vs. test.

    Parameters
    ----------
    preds_train, preds_test : array-like
        Predictions; they are multiplied by 100 before plotting.
    score_train, score_test : numpy scalar
        Scores shown in the panel titles. They must provide ``.astype``.

    Reads the module-level ``df_train``, ``df_test`` and ``label`` for the
    ground-truth histograms. Shows the figure; returns nothing.
    """
    fig,ax=plt.subplots(ncols=2,figsize=(20,5))

    # Left panel: training set. The original set this title twice and the
    # surviving one was mislabelled 'Testing Set'.
    ax[0].set_title(f'Training Set & Prediction  (rmse:{score_train.astype("float16")})')
    df_train[label].hist(ax=ax[0],lw=1,ec="black", fc="red", alpha=0.5,label='GroundTruth')
    ax[0].hist(preds_train*100,lw=1,ec="yellow", fc="green", alpha=0.5,label='Prediction')
    ax[0].set_xlabel(label)
    ax[0].set_ylim([0,4000])
    ax[0].legend()

    # Right panel: held-out test set.
    ax[1].set_title(f'Testing Set & Prediction  (rmse:{score_test.astype("float16")})')
    df_test[label].hist(ax=ax[1],lw=1,ec="black", fc="red", alpha=0.5,label='GroundTruth')
    ax[1].hist(preds_test*100,lw=1,ec="yellow", fc="green", alpha=0.5,label='Prediction')
    ax[1].legend()
    ax[1].set_xlabel(label)
    ax[1].set_ylim([0,1500])
    plt.show()

In [None]:
# Visualise the Ridge ensemble's predictions against the ground truth.
plot_hist(
    preds_train=preds_train,
    preds_test=preds_test,
    score_train=score_train,
    score_test=score_test,
)

# Test polynomial

In [None]:
import tensorflow as tf
import sys
sys.path.append('../input/polynomial/high-order-layers-master')
import high_order_layers.PolynomialLayers as poly
from tensorflow.keras import backend as K

In [None]:
# Seed TensorFlow's global random generator with the notebook-wide seed.
tf.random.set_seed(seed)

## Compare linear regression and polynomial regression

In [None]:
# Toy 1-D regression target: y = 0.5 * cos(factor * (x - offset)).
offset = -0.1
factor = 1.5 * 3.14159

# Evaluation grid: 100 evenly spaced points starting at -1.0, step 1/50.
xTest = np.arange(100) / 50 - 1.0
yTest = 0.5 * np.cos(factor * (xTest - offset))

# Training inputs: 1000 uniform samples from [-1, 1).
xTrain = tf.random.uniform(
    [1000],
    minval=-1.0,
    maxval=1,
    dtype=tf.float32,
)
yTrain = 0.5 * tf.math.cos(factor * (xTrain - offset))

In [None]:
# Baseline: plain linear regression on the raw input.
lr1 = LinearRegression()
# Comparison: linear regression on degree-7 polynomial features.
lr2 = make_pipeline(
    PolynomialFeatures(7),
    LinearRegression(),
)

In [None]:
# Fit both regressors; np.newaxis adds the feature axis to the 1-D input.
lr1.fit(xTrain[:, np.newaxis], yTrain)
lr2.fit(xTrain[:, np.newaxis], yTrain)

In [None]:
# Predict on the evaluation grid with each regressor.
y1 = lr1.predict(xTest[:, np.newaxis])
y2 = lr2.predict(xTest[:, np.newaxis])

In [None]:
# Side-by-side comparison: predictions first, then the true curve on top.
fig, ax = plt.subplots(ncols=2, figsize=(10, 5))

# Left: linear regression.
ax[0].scatter(xTest, y1)
ax[0].scatter(xTest, yTest)
ax[0].set_title('linear regression')

# Right: polynomial regression.
ax[1].scatter(xTest, y2)
ax[1].scatter(xTest, yTest)
ax[1].set_title('polynomial regression')

plt.show()

## TF model

In [None]:
def get_model(x,y,val=False,epochs=5,loss='mse',batch_size=32,order=poly.b3D):
    """Build and fit a single-layer polynomial Keras model.

    The model is a Sequential containing one ``poly.Polynomial(1, order)``
    layer, compiled with the Adam optimizer; ``loss`` is used both as the
    loss and as the only metric. When ``val`` is true, 25% of the data is
    used as a validation split. Returns the fitted model.
    """
    net = tf.keras.Sequential([poly.Polynomial(1, order)])
    net.compile(optimizer='adam', loss=loss, metrics=[loss])

    fit_options = dict(batch_size=batch_size, epochs=epochs)
    if val:
        fit_options['validation_split'] = 0.25
    net.fit(x, y, **fit_options)
    return net

* For simple data

In [None]:
# Fit the polynomial layer on the toy data, one sample per batch.
nn = get_model(
    xTrain[:, np.newaxis],
    yTrain,
    epochs=10,
    batch_size=1,
)

In [None]:
# Predict on the evaluation grid with the fitted polynomial model.
y = nn.predict(xTest[:, np.newaxis])

In [None]:
# Model predictions first, then the true curve drawn on top.
plt.scatter(xTest, y)
plt.scatter(xTest, yTest)

### Functions

In [None]:
def plot_losses(hist,type_):
    """Plot training and validation RMSE curves for a list of Keras histories.

    Parameters
    ----------
    hist : list of dict
        One ``History.history`` dict per model; each must contain the keys
        ``'rmse'`` and ``'val_rmse'``.
    type_ : str
        Text shown as the (right-aligned) plot title.

    Curves are labelled by 1-based position so that the first entry reads
    "1-order", matching the ``order-{i+1}`` numbering printed by
    ``plot_hist_nn`` (the original labels started at 0).
    """
    plt.figure(figsize=(10,5))
    for order_no,h in enumerate(hist,start=1):
        plt.plot(h['rmse'],label=f'{order_no}-order train')

        plt.plot(h['val_rmse'],label=f'{order_no}-order val')

    plt.title(type_,loc='right')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, 1.05), ncol=3, fancybox=True, shadow=True)
    plt.xlabel('iteration')
    plt.ylabel('rmse')
    # Fixed y-range; curves outside 5-25 are clipped from view.
    plt.ylim([5,25])
    plt.show()

In [None]:
def cross_entropy(y_true,y_pred):
    """Mean sigmoid cross-entropy between targets and raw logits.

    ``y_pred`` is treated as logits (no sigmoid applied beforehand).
    Original author's note: "Continuous Bernoulli distribution".
    """
    per_element = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=y_true, logits=y_pred
    )
    return tf.reduce_mean(per_element)

def rmse(y_true, y_pred):
    """RMSE on the x100 scale between targets and sigmoid-activated logits.

    ``y_pred`` holds raw logits; a sigmoid is applied before comparison and
    both sides are multiplied by 100. Returns a scalar tensor.

    Both operands are flattened to 1-D before subtracting. Without that, a
    target of shape (N,) and a prediction of shape (N, 1) broadcast to an
    (N, N) matrix of differences and the result is not the RMSE.

    NOTE: this redefines the NumPy ``rmse`` declared earlier in the notebook.
    NOTE(review): when used as a Keras metric the logged value is presumably
    averaged over batches, which differs from the RMSE over the whole
    dataset - confirm before comparing against offline scores.
    """
    probs = tf.reshape(tf.nn.sigmoid(y_pred), [-1])
    # Match dtype as well, so float64 targets can be compared with float32 output.
    targets = tf.reshape(tf.cast(y_true, probs.dtype), [-1])
    return K.sqrt(K.mean(K.square(targets*100 - probs*100)))

In [None]:
def get_model_complex(x,y,val=False,epochs=5,loss='mse',batch_size=32,order=poly.b2D):
    """Build and fit a Dense(32, relu) -> Polynomial(1, order) Keras model.

    The model is always trained with ``cross_entropy`` as the loss; the
    ``loss`` argument is only used as the tracked metric. Adam is used with
    a learning rate of 1e-4 and fitting is silent (``verbose=0``). When
    ``val`` is true, 25% of the data is used as a validation split.

    Returns
    -------
    (model, history) : the fitted model and the object returned by ``fit``.
    """
    layers = [
        tf.keras.layers.Dense(32, activation='relu'),
        poly.Polynomial(1, order),
    ]
    net = tf.keras.Sequential(layers)

    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
    net.compile(optimizer=optimizer, loss=cross_entropy, metrics=[loss])

    fit_options = dict(batch_size=batch_size, epochs=epochs, verbose=0)
    if val:
        fit_options['validation_split'] = 0.25
    history = net.fit(x, y, **fit_options)
    return net, history

In [None]:
def train(type_):
    """Fit one model per entry of ``order_dict[type_]``.

    Each model is built by ``get_model_complex`` on the module-level
    ``x_train`` and ``y_train/100.`` with a validation split, ``rmse`` as
    the metric, batch size 8 and 10 epochs.

    Returns
    -------
    (nns, hist) : the list of fitted models and the list of their
    ``history.history`` dicts, in the same order as the input entries.
    """
    nns, hist = [], []

    for basis in tqdm(order_dict[type_]):
        fitted, run = get_model_complex(
            x_train,
            y_train/100.,
            val=True,
            loss=rmse,
            batch_size=8,
            epochs=10,
            order=basis,
        )
        nns.append(fitted)
        hist.append(run.history)

    return nns, hist

In [None]:
def plot_hist_nn(NN):
    """Score every model in ``NN`` on train and test data and plot histograms.

    For each model (numbered from 1) the function prints its number and its
    training score, then calls ``plot_hist`` with the sigmoid of the raw
    predictions and both scores. Uses the module-level ``x_train``,
    ``x_test``, ``y_train`` and ``y_test``.
    """
    for position, net in enumerate(NN, start=1):
        print(f'order-{position}')

        # Raw model outputs; rmse() applies the sigmoid itself.
        logits_train = net.predict(x_train)
        score_train = rmse(np.array(y_train)/100., logits_train).numpy()

        print(score_train)

        logits_test = net.predict(x_test)
        score_test = rmse(np.array(y_test)/100., logits_test).numpy()

        probs_train = tf.nn.sigmoid(logits_train).numpy()
        probs_test = tf.nn.sigmoid(logits_test).numpy()
        plot_hist(probs_train, probs_test, score_train, score_test)

## Test polynomial order

In [None]:

# Candidate bases for the polynomial layer, grouped by family; train()
# looks a family up by key and fits one model per entry, in list order.
order_dict = {
    'orders_simple': [poly.b1, poly.b2, poly.b3, poly.b4, poly.b5],
    'orders_continuous': [poly.b1C, poly.b2C, poly.b3C, poly.b4C, poly.b5C],
    'orders_dis_continuous': [poly.b1D, poly.b2D, poly.b3D, poly.b4D, poly.b5D],
}

## Simple polynomial

In [None]:
# Select the simple-polynomial family in order_dict.
type_ = 'orders_simple'

In [None]:
# Fit one model per entry of the selected family.
simple_nn, simple_hist = train(type_)

In [None]:
# Training/validation curves for the simple family.
plot_losses(hist=simple_hist, type_=type_)

In [None]:
# Prediction histograms for every simple-family model.
plot_hist_nn(NN=simple_nn)

## Piecewise continuous polynomial

In [None]:
# Select the piecewise-continuous family in order_dict.
type_ = 'orders_continuous'

In [None]:
# Fit one model per entry of the selected family.
con_nn, con_hist = train(type_)

In [None]:
# Training/validation curves for the piecewise-continuous family.
plot_losses(hist=con_hist, type_=type_)

In [None]:
# Prediction histograms for every piecewise-continuous model.
plot_hist_nn(NN=con_nn)

## Piecewise discontinuous polynomial

In [None]:
# Select the piecewise-discontinuous family in order_dict.
type_ = 'orders_dis_continuous'

In [None]:
# Fit one model per entry of the selected family.
discon_nn, discon_hist = train(type_)

In [None]:
# Training/validation curves for the piecewise-discontinuous family.
plot_losses(hist=discon_hist, type_=type_)

In [None]:
# Prediction histograms for every piecewise-discontinuous model.
plot_hist_nn(NN=discon_nn)

## Conclusion: next, I want to add sample weights for the data in the 80~100 range

Note: the way I compute RMSE here has a bug, so the values reported above are not the true RMSE score.