<a href="https://colab.research.google.com/github/rsemihkoca/HW1/blob/main/Trendyol_%C3%B6dev.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Importing Data**

---




In [None]:
import plotly.io as pio
pio.renderers.default = "colab"
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.datasets import make_regression
import plotly.express as px
import seaborn as sns

In [None]:
# Fetch the California housing dataset through scikit-learn.
cal_housing = fetch_california_housing()

# Full feature table, one column per entry of `feature_names`.
X = pd.DataFrame(data=cal_housing.data, columns=cal_housing.feature_names)
y = cal_housing.target

# Two-column working frame: the MedInc feature next to the regression target.
Data = pd.DataFrame({"MedInc": X["MedInc"], "Price": y})

In [None]:
# Rebind X and y to plain NumPy vectors taken from the two working-frame columns.
X, y = (Data[column].to_numpy() for column in ("MedInc", "Price"))



---



# **ScatterPlot of Data**

---



In [None]:
# Scatter the target column (Price) against the single feature (MedInc)
# to inspect their relationship before fitting anything.
fig = px.scatter(Data, x="MedInc", y="Price")
fig.show()

---


# **Loss Function**

---



$$
 \hat{y} = \beta_0 + \beta_1 x = \beta^T X $$

 $$
 L(\beta_0,\beta_1) = \left\{ \begin{array}{cl}
(y_i - \hat{y}_i)^2 & : \ (y_i - \hat{y}_i)^2 \le  \theta \\
\theta & : \ otherwise
\end{array} \right .  $$

###  The loss function described in the case is piecewise. Written out as above, it resembles the Huber loss, except that it returns the same constant value for every error beyond the threshold. This causes two problems:
### *> It has no derivative at the threshold value.*
### *> Beyond the threshold value the gradient is equal to 0. Because the beta values are initialized randomly, this adds a heavy **computational burden**: whenever the errors lie beyond the threshold, the betas **won't update**.*

---









# **Defining New Loss Function and Convexity Check**

---


$$
\Large
L(\beta_0,\beta_1,{\theta}) =\ {\theta}\cdot\left(1-\mathrm{e}^{-\frac{\left(y-\beta_0 - \beta_1 x\right)^2}{{\theta}}}\right) 
$$

###  The new loss is built from an exponentially decaying term and scaled by $\theta$, so that the loss for errors beyond the threshold converges to $\theta$.


In [None]:
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

def loss_func(y, x, b_0, b_1, theta):
    """Return the mean bounded exponential loss of the line ``b_0 + b_1 * x``.

    Every sample contributes ``theta * (1 - exp(-r**2 / theta))`` where
    ``r = y - (b_0 + b_1 * x)`` is that sample's residual, and the
    contributions are averaged.

    The residuals are squared *before* averaging. Squaring the mean residual
    instead would let positive and negative errors cancel, reporting a loss
    of zero for a fit that is wrong on every sample.

    Args:
        y: Observed targets (scalar or array-like).
        x: Feature values (scalar or array-like, broadcastable against ``y``).
        b_0: Intercept of the line.
        b_1: Slope of the line.
        theta: Saturation level; each per-sample loss lies in ``[0, theta)``
            for positive ``theta``.

    Returns:
        The average per-sample loss as a NumPy float.
    """
    residual = np.asarray(y) - (b_0 + b_1 * np.asarray(x))
    per_sample = theta * (1 - np.exp(-np.square(residual) / theta))
    return per_sample.mean()
    
# Plot the loss against the raw error to inspect its shape.
theta = 5

# Error grid and the matching loss values, evaluated in one vectorized
# expression instead of appending element by element.
actual_loss = np.linspace(-7, 7, 1000)
custom_loss = theta * (1 - np.exp(-np.square(actual_loss) / theta))

l = pd.DataFrame(dict(actual_loss=actual_loss, custom_loss=custom_loss))
fig = px.scatter(l, x="actual_loss", y="custom_loss", width=1000, height=600, color=custom_loss)

# Both axes share the same frame, grid and zero-line styling.
_axis_style = dict(
    showline=True, linewidth=3, linecolor='black', mirror=True,
    showgrid=True, gridwidth=1.1, gridcolor='gray',
    zeroline=True, zerolinewidth=1.4, zerolinecolor='black',
)
fig.update_xaxes(**_axis_style)
fig.update_yaxes(**_axis_style)
fig.update_layout(
    plot_bgcolor="white",
    yaxis=dict(range=[-4, 12], tickfont=dict(size=20)),
    xaxis=dict(range=[-10, 10], tickfont=dict(size=20)),
)
config = {'scrollZoom': True, 'displaylogo': False}
fig.show(config=config)


###  As can be seen from the output above, the function has a single minimum at zero error and is convex around it (for $|y-\hat{y}| < \sqrt{\theta/2}$). For errors beyond the threshold, the loss flattens out and converges to $\theta$.

---




# **Gradient Descent Update Calculation ($\nabla $)**

---



$
\Large
L(\beta_0,\beta_1,{\theta}) =\ {\theta}\cdot\left(1-\mathrm{e}^{-\frac{\left(y-\beta_0 - \beta_1 x\right)^2}{{\theta}}}\right) 
$

$$ \Large \frac{\partial L}{\partial \beta_0} = -2\left(y-\beta_0-\beta_1 x\right)\mathrm{e}^{-\frac{\left(y-\beta_0-\beta_1 x\right)^2}{{\theta}}} \hspace{1cm} \text{and}
\hspace{1cm} \frac{\partial L}{\partial \beta_1} = -2x\cdot\left(y-\beta_0-\beta_1 x\right)\mathrm{e}^{-\frac{\left(y-\beta_0-\beta_1 x\right)^2}{{\theta}}} $$ 

### Model organized by update functions:

In [None]:
def custom_loss_model(x, y, theta=5, alpha=1.2*10**-2, max_iter=1000000,
                      tol=0.00000001, seed=None) -> np.ndarray:
    """Fit ``y ~ beta[0] + beta[1] * x`` by gradient descent on the custom loss.

    Starts from a random ``beta`` in ``[0, 1)`` and repeats gradient steps
    until one step moves ``beta`` by less than ``tol`` (Euclidean norm) or
    ``max_iter`` steps have been taken. Progress is printed every 1000
    iterations.

    Args:
        x: Array-like of feature values.
        y: Array-like of targets, matching ``x``.
        theta: Saturation level used in the exponential factor of the update.
        alpha: Learning rate.
        max_iter: Maximum number of gradient steps.
        tol: Early-stopping threshold on the size of a single update.
        seed: Optional seed for the initial ``beta``. ``None`` keeps the
            previous behaviour of drawing from NumPy's global random state.

    Returns:
        Array of shape ``(2,)`` holding the intercept and the slope.

    Raises:
        ValueError: If ``x`` is empty (the batch means would be NaN and the
            stopping test could never succeed).
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.size == 0:
        raise ValueError("x and y must not be empty")

    if seed is None:
        beta = np.random.random(2)
    else:
        beta = np.random.default_rng(seed).random(2)

    for i in range(max_iter):
        residual = y - (beta[0] + beta[1] * x)

        # Update Functions:
        # NOTE(review): the exponential factor is evaluated at the batch-mean
        # residual, not per sample as in the derivative written in the
        # accompanying markdown. Kept as-is so recorded results stay
        # reproducible; confirm whether per-sample weighting was intended.
        mean_residual = residual.mean()
        damping = np.exp(-mean_residual ** 2 / theta)
        g_b0 = -2 * mean_residual * damping
        g_b1 = -2 * (x * residual).mean() * damping

        beta_prev = np.copy(beta)

        beta[0] = beta[0] - alpha * g_b0
        beta[1] = beta[1] - alpha * g_b1

        if i % 1000 == 0:
            print(f"iteration: ({i}) beta: {beta}, gradient: {g_b0} {g_b1}")

        # Stop once a step no longer moves the parameters noticeably.
        if np.linalg.norm(beta - beta_prev) < tol:
            print(f"I do early stoping at iteration {i}")
            break

    return beta

In [None]:
# Mount Google Drive into the Colab runtime under /content/drive.
# NOTE(review): nothing else visible in this notebook reads from Drive —
# confirm whether this cell is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Fit the unregularized custom-loss model on X (MedInc) and y (Price).
# The trailing bare name makes the notebook echo the fitted coefficients.
beta=custom_loss_model(X, y)
beta

### Minimized beta values:
### **$ \beta_0$: 0.45085577**
### **$ \beta_1$: 0.41793849**

---




# **L2 Regularized Version ($\nabla,\lambda$)**

---

$
\Large
L(\beta_0,\beta_1,{\theta},\lambda) =\ {\theta}\cdot\left(1-\mathrm{e}^{-\frac{\left(y-\beta_0 - \beta_1 x\right)^2}{{\theta}}}\right) + \lambda\left(\beta_0^2+\beta_1^2\right) 
$

$ \Large \frac{\partial L}{\partial \beta_0} = -2\left(y-\beta_0-\beta_1 x\right)\mathrm{e}^{-\frac{\left(y-\beta_0-\beta_1 x\right)^2}{{\theta}}}+ 2 \lambda\beta_0$  $\Large \frac{\partial L}{\partial \beta_1} = -2x\cdot\left(y-\beta_0-\beta_1 x\right)\mathrm{e}^{-\frac{\left(y-\beta_0-\beta_1 x\right)^2}{{\theta}}}+ 2\lambda\beta_1 $ 

In [None]:
def l2r_custom_loss_model(x, y, lam=0.001, alpha=1.2*10**-2, theta=6,
                          max_iter=10000, tol=0.000001, seed=None) -> np.ndarray:
    """Fit ``y ~ beta[0] + beta[1] * x`` with the custom loss plus an L2 penalty.

    Same gradient-descent loop as the unregularized model, with
    ``2 * lam * beta[k]`` added to each gradient component. Starts from a
    random ``beta`` in ``[0, 1)`` and stops when one step moves ``beta`` by
    less than ``tol`` (Euclidean norm) or after ``max_iter`` steps. Progress
    is printed every 1000 iterations.

    Args:
        x: Array-like of feature values.
        y: Array-like of targets, matching ``x``.
        lam: L2 regularization strength applied to both coefficients.
        alpha: Learning rate.
        theta: Saturation level used in the exponential factor of the update.
        max_iter: Maximum number of gradient steps.
        tol: Early-stopping threshold on the size of a single update.
        seed: Optional seed for the initial ``beta``. ``None`` keeps the
            previous behaviour of drawing from NumPy's global random state.

    Returns:
        Array of shape ``(2,)`` holding the intercept and the slope.

    Raises:
        ValueError: If ``x`` is empty (the batch means would be NaN).
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.size == 0:
        raise ValueError("x and y must not be empty")

    if seed is None:
        beta = np.random.random(2)
    else:
        beta = np.random.default_rng(seed).random(2)

    for i in range(max_iter):
        residual = y - (beta[0] + beta[1] * x)

        # NOTE(review): the exponential factor is evaluated at the batch-mean
        # residual, not per sample as in the derivative written in the
        # accompanying markdown. Kept as-is so recorded results stay
        # reproducible; confirm whether per-sample weighting was intended.
        mean_residual = residual.mean()
        damping = np.exp(-mean_residual ** 2 / theta)
        g_b0 = -2 * mean_residual * damping + 2 * lam * beta[0]
        g_b1 = -2 * (x * residual).mean() * damping + 2 * lam * beta[1]

        beta_prev = np.copy(beta)

        beta[0] = beta[0] - alpha * g_b0
        beta[1] = beta[1] - alpha * g_b1

        if i % 1000 == 0:
            print(f"iteration: ({i}) beta: {beta}, gradient: {g_b0} {g_b1}")

        # Stop once a step no longer moves the parameters noticeably.
        if np.linalg.norm(beta - beta_prev) < tol:
            print(f"I do early stoping at iteration {i}")
            break

    return beta

In [None]:
# Fit the L2-regularized custom-loss model with lam=0.001 and alpha=0.0017
# (third and fourth positional arguments). The bare name echoes the result.
beta_you_do=l2r_custom_loss_model(X, y, 0.001, 0.0017)
beta_you_do

iteration: (0) beta: [0.73205179 0.30567685], gradient: -0.33272991947803393 -2.1254460007480542
iteration: (1000) beta: [0.60453951 0.38559199], gradient: 0.05820051342101118 -0.012239716790823382
iteration: (2000) beta: [0.53134657 0.40098465], gradient: 0.03081731351302142 -0.006480959130434283
iteration: (3000) beta: [0.49259299 0.40913463], gradient: 0.016316123950722173 -0.0034313220194174356
iteration: (4000) beta: [0.47207536 0.41344954], gradient: 0.008638268089837284 -0.0018166495116849945
iteration: (5000) beta: [0.46121273 0.41573397], gradient: 0.004573334820893285 -0.0009617838031226327
iteration: (6000) beta: [0.45546177 0.41694342], gradient: 0.0024212434288600794 -0.0005091935720200252
iteration: (7000) beta: [0.45241706 0.41758373], gradient: 0.0012818692006833687 -0.0002695803100354869
iteration: (8000) beta: [0.45080511 0.41792272], gradient: 0.0006786548191218473 -0.00014272281135937473
I do early stoping at iteration 8259


array([0.4505297 , 0.41798064])

### Minimized beta values for L2 regularized model:
### **$ \beta_0$: 0.45052928**
### **$ \beta_1$: 0.41798073**

---


In [None]:
def model2(x, y, lam, alpha=0.0001) -> np.ndarray:
    """Fit ``y ~ beta[0] + beta[1] * x`` by gradient descent on squared error plus L2.

    Each gradient component sums over all samples and adds
    ``2 * lam * beta[k]``. The loop runs for at most 1000 steps from a random
    ``beta`` in ``[0, 1)`` and stops early once a step moves ``beta`` by less
    than 1e-6 (Euclidean norm). The coefficients *before* the update are
    printed every 100 iterations.

    Args:
        x: Array of feature values.
        y: Array of targets, matching ``x``.
        lam: L2 regularization strength.
        alpha: Learning rate.

    Returns:
        Array of shape ``(2,)`` holding the intercept and the slope.
    """
    print("starting sgd")
    beta = np.random.random(2)

    for i in range(1000):
        residual = y - (beta[0] + beta[1] * x)

        # Full-batch gradient of the summed squared error with the L2 term.
        g_b0 = -2 * residual.sum() + 2 * lam * beta[0]
        g_b1 = -2 * (x * residual).sum() + 2 * lam * beta[1]

        if i % 100 == 0:
            print(f"({i}) beta: {beta}, gradient: {g_b0} {g_b1}")

        previous = beta.copy()
        beta -= alpha * np.array([g_b0, g_b1])

        step_size = np.linalg.norm(beta - previous)
        if step_size < 0.000001:
            print(f"I do early stoping at iteration {i}")
            break

    return beta

In [None]:
# Fit the squared-error reference model with lam=0.001 and alpha=0.000001
# (third and fourth positional arguments). The bare name echoes the result.
beta_we_do=model2(X, y, 0.001, 0.000001)
beta_we_do

starting sgd
(0) beta: [0.22991891 0.88084683], gradient: 64843.82279935203 319955.7891192779
(100) beta: [0.30982797 0.44759699], gradient: -1082.7534344453609 227.70576432239915
(200) beta: [0.38560512 0.43166086], gradient: -500.96724535513005 105.35466882404704
(300) beta: [0.42066562 0.42428755], gradient: -231.7870097980958 48.745389806240716
(400) beta: [0.43688737 0.42087608], gradient: -107.24297528285543 22.553466817241333
(500) beta: [0.44439284 0.41929766], gradient: -49.619069496339456 10.43501482903797
(600) beta: [0.44786546 0.41856736], gradient: -22.957700037587905 4.8280619279028585
(700) beta: [0.44947216 0.41822947], gradient: -10.622045039648722 2.2338427267984446
(800) beta: [0.45021555 0.41807313], gradient: -4.914596873370354 1.0335520551188129
(900) beta: [0.45055951 0.4180008 ], gradient: -2.273880626334731 0.4782028017814714


array([0.45071864, 0.41796733])

### Minimized beta values for L2 regularized model:
### **$ \beta_0$: 0.45098132**
### **$ \beta_1$: 0.41791209**

---


# **WeDo vs YouDo**

---

In [64]:
import plotly.express as px
import plotly.graph_objects as go

# Overlay both fitted lines on the raw scatter so they can be compared visually.
fig = px.scatter(Data, x="MedInc", y="Price")

y_pred_we_do = beta_we_do[0] + beta_we_do[1] * X
y_pred_you_do = beta_you_do[0] + beta_you_do[1] * X

# Rows stay in the original order of X: `l` is compared against `y`
# afterwards, so it must not be re-sorted here.
l = pd.DataFrame(dict(x=X, y_pred_we_do=y_pred_we_do, y_pred_you_do=y_pred_you_do))

fig1 = px.line(l, x="x", y="y_pred_we_do")
fig1.update_traces(line=dict(color='green', width=6), showlegend=True, name="WeDo")

fig2 = px.line(l, x="x", y="y_pred_you_do")
fig2.update_traces(line=dict(color='red', width=4), showlegend=True, name="YouDo")

# Axis titles use the nested `title=dict(text=..., font=...)` form; the flat
# `titlefont` attribute is deprecated and rejected by recent Plotly releases.
layout = go.Layout(
    title=dict(
        text='<b>Click on legends to see the lines seperately</b>',
        x=0.5,
        y=0.95,
        font=dict(
            family="Arial",
            size=20,
            color='#000000'
        )
    ),
    yaxis=dict(
        title=dict(text='Price', font=dict(size=25)),
        tickfont=dict(size=20),
    ),
    xaxis=dict(
        title=dict(text='MedInc', font=dict(size=25)),
        tickfont=dict(size=20),
    ),
    showlegend=True,
    legend=dict(
        orientation="h",
        traceorder="normal",
        font=dict(
            family="sans-serif",
            size=20,
            color="black"
        ),
        yanchor="bottom",
        y=0.2,
        xanchor="right",
        x=0.9,
    ),
)

# Combine the scatter trace with the two line traces in a single figure.
fig3 = go.Figure(data=fig.data + fig1.data + fig2.data, layout=layout)
fig3.show()


In [50]:
from sklearn.metrics import mean_squared_error

# Report the mean squared error of each model's predictions against the observed prices.
print(f"MSE of WeDo: {mean_squared_error(y, l.y_pred_we_do)},MSE of YouDo: {mean_squared_error(y, l.y_pred_you_do)}")

MSE of WeDo: 0.7011311539446824,MSE of YouDo: 0.7011311832474091


### **Both models give the same output** for almost the same lambda values. However, the number of iterations and the learning rate of the YouDo model are higher than those of the WeDo model. The MSE values of the two models are almost identical.