In [23]:
                                                             # Phase - 1 (Creating the Model)

import pandas as pd                        # pandas is a library used for data manipulation (like handling databases).We name it as 'pd' for easy use.

In [24]:
# Load the salary data from the CSV file (path is relative to the working directory) into a pandas DataFrame.
dataset = pd.read_csv("Salary_Data.csv")   # pd is the pandas alias; read_csv parses the CSV file into a DataFrame

In [25]:
dataset                                    # A bare expression on the last line of a cell makes Jupyter display its value (here, the DataFrame)

Unnamed: 0,YearsExperience,Salary
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0
5,2.9,56642.0
6,3.0,60150.0
7,3.2,54445.0
8,3.2,64445.0
9,3.7,57189.0


In [26]:
# Input (feature) for the model: years of experience.
# Indexing with a list of column names (the double square brackets) returns a
# DataFrame rather than a Series, so the selection stays a 2-D table.

independent = dataset[["YearsExperience"]]

In [27]:
# Output (target) the model learns to predict: salary.
# The double square brackets keep the selection as a DataFrame rather than a Series.

dependent = dataset[["Salary"]]

In [28]:
# sklearn is a library used for ML, and train_test_split is one of the functions in sklearn

from sklearn.model_selection import train_test_split

In [29]:
# Split the inputs and outputs into a training set and a test set.
# test_size=0.30 holds out 30% of the rows for testing; the other 70% are used for training.
# random_state=0 fixes the shuffle so the same split is produced on every run.
# train_test_split returns the pieces in this order:
#   1. X_train - input training set
#   2. X_test  - input test set
#   3. Y_train - output training set
#   4. Y_test  - output test set

X_train, X_test, Y_train, Y_test = train_test_split(
    independent,
    dependent,
    test_size=0.30,
    random_state=0,
)

                                                            #Learning Phase

In [30]:
# Model creation (learning phase). Simple Linear Regression is provided by
# scikit-learn as the LinearRegression class in the linear_model module.

from sklearn.linear_model import LinearRegression

# Instantiate the estimator with its default settings (no arguments passed).
# The resulting object exposes the class's methods, such as fit and predict.
regressor = LinearRegression()

# fit trains the model on the training inputs and outputs.
# Once it has run, the weight (coef_) and bias (intercept_) have been estimated.
regressor.fit(X=X_train, y=Y_train)

In [31]:
# After the model is created, we inspect what it learned. In Simple Linear Regression (SLR), the equation is: y = wX + b
# (where w = weight and b = bias). Now we check these values.

# coef_ (coefficient) – an attribute set on the model by fit; it holds the learned weight w.

weight = regressor.coef_
weight

array([[9360.26128619]])

In [32]:
# intercept_ (intercept) – an attribute set on the model by fit; it holds the learned bias b.

bias = regressor.intercept_
bias

array([26777.3913412])

In [33]:
# With the model trained, we now evaluate it in the testing phase.
# predict takes the test-set inputs and returns the model's predicted outputs, which we store in a new variable.

Y_pred = regressor.predict(X_test)

In [34]:
# r2_score is scikit-learn's implementation of the R-squared (coefficient of determination) metric.
# It measures how well the predicted outputs match the actual test outputs.

from sklearn.metrics import r2_score

r_score = r2_score(y_true=Y_test, y_pred=Y_pred)

In [35]:
# A value close to 1 means the predictions track the actual values closely (a good model).
# A value near 0 means the model does no better than always predicting the mean; negative values are worse still.

r_score

0.9740993407213511

                                                            # How to save the model

In [48]:
import pickle   # pickle is Python's standard-library module for serializing objects; here it saves and loads the trained model.

# File name under which the trained SLR model will be stored.
# The .sav extension is only a naming convention here; the file contents are pickle data.
filename = "final_model_SLR.sav"


In [49]:
# Save the trained model to disk with pickle.dump.
# 'wb' opens the file in write-binary mode, which pickle requires.
# The with-block closes the file as soon as the dump finishes, so the data is
# flushed to disk and the file handle is not left open.
with open(filename, 'wb') as model_file:
    pickle.dump(regressor, model_file)

In [51]:
# Load the trained model back into a variable so we can pass inputs and check the output.
# 'rb' opens the file in read-binary mode, which pickle requires.
# The with-block closes the file once the model has been read.
# NOTE: pickle.load can execute arbitrary code, so only load model files you created or trust.

with open("final_model_SLR.sav", 'rb') as model_file:
    load_model = pickle.load(model_file)

# Predict the salary for 13 years of experience and store the result.
# The input is a one-row DataFrame using the same column name the model was trained on,
# which avoids scikit-learn's "X does not have valid feature names" warning.
result = load_model.predict(pd.DataFrame({"YearsExperience": [13]}))



In [50]:
result    # Displaying the stored prediction for the given input.

array([[148460.78806172]])