# **Introduction**

#### This notebook walks through the main steps of building a simple linear regression model in a succinct, straightforward, well-explained, and easy-to-understand way

# **Importing libraries**

#### Importing all required libraries to run the notebook

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import random

# **Loading the dataset**
#### Loading a dataset suitable for use with a linear regression model (*Salary_Data.csv*)

In [None]:
# Read the salary CSV into a DataFrame. The bare `dataset.head()` expression is
# kept as the last line so the notebook renders the first rows as the cell output.
dataset = pd.read_csv(
    filepath_or_buffer='/kaggle/input/salary-data-simple-linear-regression/Salary_Data.csv'
)
dataset.head()

# **Splitting the dataset into training and test sets**
#### Splitting the dataset into two sets: one to fit the model (training set), and one to test the model (test set)

In [None]:
# Select feature and target with a list of column labels so both stay DataFrames
# (not Series); later cells index them by column name.
input_data = dataset.loc[:, ["YearsExperience"]]
output_data = dataset.loc[:, ["Salary"]]

# Hold out 30% of the rows as the test set; the fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    input_data, output_data, test_size=0.3, random_state=5
)

# **Fitting the model**
#### Fitting the linear regression model parameters with the training set

In [None]:
# Create an ordinary least squares estimator and fit it on the training split only.
# The fit call stays a bare last expression so the cell still displays the estimator.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

# **Setting the model fitting results**
#### Extracting the fitted linear regression model parameters (slope and y-intercept) and computing the model performance evaluation metrics on the test set: $R^2$, RMSE (Root Mean Square Error) and RSE (Residual Standard Error)

In [None]:
# Fitted line parameters. coef_ is indexed [0][0] and intercept_ [0] because the
# model was fitted with a single-column DataFrame as target.
model_slope = model.coef_[0][0]
print("Linear regression model slope: {}".format(round(model_slope, 2)))

model_y_intercept = model.intercept_[0]
print("Linear regression model y-intercept: {}".format(round(model_y_intercept, 2)))

print("Linear regression model performance evaluation:")

# Coefficient of determination, evaluated on the held-out test set.
model_r2 = model.score(X_test, y_test)
print(" - R\u00b2: {}".format(round(model_r2, 2)))

# Predictions for the test inputs, computed explicitly from y = slope * x + intercept.
y_predicted_from_X_test = model_slope * X_test["YearsExperience"].values + model_y_intercept

# RMSE = sqrt(MSE). The square root is taken explicitly instead of passing
# `squared=False`: that keyword was deprecated in scikit-learn 1.4 and removed in
# 1.6, whereas this form works on every scikit-learn version.
model_rmse = np.sqrt(
    mean_squared_error(
        y_true=y_test["Salary"].values,
        y_pred=y_predicted_from_X_test
    )
)
print(" - RMSE: ${}".format(round(model_rmse, 2)))

# RSE = sqrt(RSS / (n - (p + 1))), where n is the number of test samples and
# p + 1 is the number of estimated parameters (coefficients plus intercept).
model_rse = np.sqrt(
    np.sum(
        (y_test["Salary"].values - y_predicted_from_X_test)**2
    ) / (len(y_test["Salary"]) - (len(model.coef_[0]) + 1))
)
print(" - RSE: ${}".format(round(model_rse, 2)))

# **Plotting the model fitting results**
#### Plotting training data, test data, linear regression line, linear regression equation, and model performance evaluation

In [None]:
figure = go.Figure()

# Points of the fitted line y = slope * x + intercept, drawn over x = 1..11.
linear_function_x_values = np.arange(1, 12)
linear_function_y_values = model_slope * linear_function_x_values + model_y_intercept

# Trace 1: the regression line.
figure.add_trace(
    go.Scatter(
        x=linear_function_x_values,
        y=linear_function_y_values,
        mode='lines',
        line=dict(
            color='orange',
            width=3
        ),
        name='Linear regression line'
    )
)

# Trace 2: the samples the model was fitted on.
figure.add_trace(
    go.Scatter(
        x=X_train["YearsExperience"],
        y=y_train["Salary"],
        mode='markers',
        marker=dict(
            color='limegreen',
            size=8,
            line_width=1
        ),
        name='Training data'
    )
)

# Trace 3: the held-out samples used for evaluation.
figure.add_trace(
    go.Scatter(
        x=X_test["YearsExperience"],
        y=y_test["Salary"],
        mode='markers',
        marker=dict(
            color='red',
            size=8,
            line_width=1
        ),
        name='Test data'
    )
)

# Metrics box in the top-left corner. `&#36;` is the HTML entity for "$".
annotation_title = "Model performance evaluation"
annotation_content = "<b>R\u00b2:</b> {}".format(round(model_r2, 2)) \
    + "   <b>RMSE:</b> &#36;{}".format(round(model_rmse, 2)) \
    + "   <b>RSE:</b> &#36;{}".format(round(model_rse, 2))

# Both CSS declarations live inside ONE quoted `style` attribute. The previous
# markup (style='font-size: ...'; 'font-family: ...';) closed the attribute after
# the first declaration, so the font-family was never applied.
figure.add_annotation(
    x=0.02,
    y=0.975,
    xref="paper",
    yref="paper",
    text=
        "<span style='font-size: 13px; font-family: Helvetica, Calibri'><b>{}</b></span>".format(
            annotation_title
        )
        + "<br><br>"
        + "<span style='font-size: 9.5px; font-family: Helvetica, Calibri'>{}</span>".format(
            annotation_content
        ),
    font=dict(size=7),
    bordercolor="black",
    borderwidth=1.5,
    borderpad=7.5,
    bgcolor="white",
    showarrow=False,
)

# Equation label placed in paper coordinates and rotated via textangle.
linear_regression_equation = "y = {}x + {}".format(
    round(model_slope, 2),
    round(model_y_intercept, 2)
)

figure.add_annotation(
    x=0.62,
    y=0.52,
    xref="paper",
    yref="paper",
    text="<span style='font-family: Helvetica, Calibri'><b>{}</b></span>".format(
        linear_regression_equation
    ),
    font=dict(
        color="orange",
        size=12
    ),
    textangle=320,
    showarrow=False,
)

# Figure size, centred title and axis titles.
figure.update_layout(
    width=650,
    height=500,
    title=dict(
        text="<b>Linear regression model fitting results</b>",
        font=dict(size=25),
        y=0.92,
        x=0.5,
        xanchor='center',
        yanchor='top'
    ),
    xaxis=dict(
        title=dict(
            text="Years of experience",
            font=dict(size=15)
        )
    ),
    yaxis=dict(
        title=dict(
            text="Salary ($)",
            font=dict(size=15)
        )
    ),
)

figure.show()

# **Testing the model**
#### Testing the linear regression model by predicting salary using random samples from the test set as model inputs, and calculating the percentage error between predicted and actual salaries

In [None]:
def model_test(years_of_experience, actual_salary=None):
    """Predict the salary for one experience value and print the percentage error.

    Prints the input, the actual salary, the model's predicted salary (rounded to
    one decimal) and the signed percentage error of the prediction.

    Args:
        years_of_experience: Value of the "YearsExperience" feature fed to the model.
        actual_salary: Observed salary to compare against. When omitted, it is
            looked up as the first row of `dataset` whose "YearsExperience" equals
            `years_of_experience`; pass it explicitly when several rows may share
            the same experience value.

    Raises:
        IndexError: If `actual_salary` is omitted and no row of `dataset` matches.
    """
    print("Years of experience: {}".format(years_of_experience))

    if actual_salary is None:
        # Fallback lookup by value; ambiguous if the experience value is duplicated.
        actual_salary = dataset.loc[
            dataset["YearsExperience"] == years_of_experience, "Salary"
        ].values[0]
    print("Actual salary: ${}".format(actual_salary))

    # Predict from a one-row DataFrame carrying the training column name, so the
    # input matches what the model was fitted on (a bare NumPy array triggers
    # scikit-learn's "X does not have valid feature names" warning).
    model_input = pd.DataFrame({"YearsExperience": [years_of_experience]})
    predicted_salary = round(model.predict(model_input)[0][0], 1)
    print("Predicted salary: ${}".format(predicted_salary))

    percentage_error = round(100*(predicted_salary - actual_salary)/actual_salary, 2)
    print("Percentage error between predicted and actual salaries: {}%\n".format(percentage_error))

# Sample test ROWS (by index label) rather than experience values, so each sample
# is compared against its own salary even if an experience value is repeated.
# A private seeded generator keeps the selection reproducible across runs without
# touching the global `random` state.
sampled_test_row_labels = random.Random(5).sample(
    population=list(X_test.index),
    k=3
)

for test_row_label in sampled_test_row_labels:
    model_test(
        years_of_experience=X_test.loc[test_row_label, "YearsExperience"],
        actual_salary=y_test.loc[test_row_label, "Salary"]
    )