In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Importing dataset for processing
# The path is written as a raw string: it contains backslashes (\T, \D, \L, \S)
# that a normal string literal would try to parse as escape sequences, which
# raises a SyntaxWarning on recent Python versions.
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to the notebook so the analysis can be re-run elsewhere.

dataset = pd.read_csv(r'D:\Trainer\DSc ML\Datasets\Linear Reg\Salary_Data.csv')

In [4]:
# Preview the first five rows of the loaded data
dataset.head(n=5)

Unnamed: 0,YearsExperience,Salary
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0


In [7]:
# Count missing entries per column (the output shows 0 for both -> no missing values)
dataset.isna().sum()

YearsExperience    0
Salary             0
dtype: int64

In [9]:
# Info: column names, non-null counts and dtypes of the dataset

dataset.info()

# This is a Simple Linear Regression problem, as there is only one independent variable (IDV)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   YearsExperience  30 non-null     float64
 1   Salary           30 non-null     float64
dtypes: float64(2)
memory usage: 544.0 bytes


### Split the data into X and Y

In [10]:
# x - independent variable (YearsExperience)
# y - dependent variable (Salary)

first_column = 0
x = dataset.iloc[:, first_column]
x

0      1.1
1      1.3
2      1.5
3      2.0
4      2.2
5      2.9
6      3.0
7      3.2
8      3.2
9      3.7
10     3.9
11     4.0
12     4.0
13     4.1
14     4.5
15     4.9
16     5.1
17     5.3
18     5.9
19     6.0
20     6.8
21     7.1
22     7.9
23     8.2
24     8.7
25     9.0
26     9.5
27     9.6
28    10.3
29    10.5
Name: YearsExperience, dtype: float64

In [65]:
# Option 1: select the column by position -> 1-D array; x_train and x_test
# would then have to be reshaped later
# x = dataset.iloc[:, 0].to_numpy()
# x

# Option 2 (used here): keep every column except the last -> 2-D array
x = dataset.iloc[:, :-1].to_numpy()
x

array([[ 1.1],
       [ 1.3],
       [ 1.5],
       [ 2. ],
       [ 2.2],
       [ 2.9],
       [ 3. ],
       [ 3.2],
       [ 3.2],
       [ 3.7],
       [ 3.9],
       [ 4. ],
       [ 4. ],
       [ 4.1],
       [ 4.5],
       [ 4.9],
       [ 5.1],
       [ 5.3],
       [ 5.9],
       [ 6. ],
       [ 6.8],
       [ 7.1],
       [ 7.9],
       [ 8.2],
       [ 8.7],
       [ 9. ],
       [ 9.5],
       [ 9.6],
       [10.3],
       [10.5]])

In [66]:
# The regression model is built from numeric arrays, so extract the target as an array.
# Option 1 (used here): 1-D array; reshape y_train and y_test later if needed
y = dataset.iloc[:, 1].to_numpy()
y

# Option 2: slice from the last column onwards to get a 2-D array directly
# y = dataset.iloc[:, -1:].to_numpy()
# y

array([ 39343.,  46205.,  37731.,  43525.,  39891.,  56642.,  60150.,
        54445.,  64445.,  57189.,  63218.,  55794.,  56957.,  57081.,
        61111.,  67938.,  66029.,  83088.,  81363.,  93940.,  91738.,
        98273., 101302., 113812., 109431., 105582., 116969., 112635.,
       122391., 121872.])

### Split the dataset into training and test dataset
#### Range: 
#### Training - (70% - 80% )
#### Test: (20% - 30%)

In [46]:
# Splitting by 75% & 25% data respectively for training and test
# sklearn (scikit-learn) is an ML library

from sklearn.model_selection import train_test_split

# Training is for building a model
# Test is for predicting DV value and model validation

In [67]:
# x_train, x_test, y_train, y_test = train_test_split(IDV, DV, test_size = <fraction decided to split>, random_state = 0)
# random_state: fixes the seed of the shuffle so the same split is produced on every run
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 0)

In [68]:
# Report the shape of each split, in the order: x_train, x_test, y_train, y_test
for split in (x_train, x_test, y_train, y_test):
    print(split.shape)

(22, 1)
(8, 1)
(22,)
(8,)


## Linear Regression Model

In [69]:
from sklearn.linear_model import LinearRegression
# Unfitted estimator with default settings; it is trained later with fit()
regressor = LinearRegression()

In [32]:
# Reshape x_train and y_train into arrays with a single column and an inferred number of rows
# x_train = x_train.reshape(-1, 1)
# y_train = y_train.reshape(-1, 1)

In [70]:
# Inspect the training features (2-D array with a single column)
x_train

array([[ 5.3],
       [ 7.9],
       [ 2.9],
       [ 5.1],
       [ 3.2],
       [ 4.5],
       [ 8.2],
       [ 6.8],
       [ 1.3],
       [10.5],
       [ 3. ],
       [ 2.2],
       [ 5.9],
       [ 6. ],
       [ 3.7],
       [ 3.2],
       [ 9. ],
       [ 2. ],
       [ 1.1],
       [ 7.1],
       [ 4.9],
       [ 4. ]])

In [71]:
# Inspect the training targets (1-D array of salaries)
y_train

array([ 83088., 101302.,  56642.,  66029.,  64445.,  61111., 113812.,
        91738.,  46205., 121872.,  60150.,  39891.,  81363.,  93940.,
        57189.,  54445., 105582.,  43525.,  39343.,  98273.,  67938.,
        56957.])

In [72]:
# Fit the linear model on the training split (features x_train, targets y_train)
regressor.fit(x_train, y_train)

LinearRegression()

In [73]:
# Predict salaries for the held-out test features
y_pred = regressor.predict(x_test)

In [74]:
# Predicted salaries for the test split
y_pred

array([ 41056.25705466, 123597.70938378,  65443.50433372,  63567.56223533,
       116093.94099022, 108590.17259667, 117031.91203942,  64505.53328452])

In [75]:
# Actual salaries for the test split, for comparison with y_pred
y_test

array([ 37731., 122391.,  57081.,  63218., 116969., 109431., 112635.,
        55794.])

In [82]:
 regressor.coef_  # fitted slope: estimated change in salary per additional year of experience

array([9379.71049195])

In [77]:
regressor.intercept_  # fitted intercept: the model's predicted salary at 0 years of experience

26986.691316737255

In [78]:
# salary of a person having 17 years of experience?
# The result gets its own name: assigning it to `y` would overwrite the target
# array built earlier, so re-running the train/test split cell afterwards would
# fail with inconsistent sample counts.
years_of_experience = 17

# predict() expects a 2-D input (n_samples, n_features): one sample, one feature.
# This evaluates coef_ * 17 + intercept_ using the fitted model.
predicted_salary = regressor.predict([[years_of_experience]])

print('Salary:', predicted_salary)

Salary: [186441.76967982]


### Model evaluation: R-squared (R²)

In [83]:
from sklearn.metrics import r2_score

In [84]:
# Coefficient of determination (R^2) of the predictions against the actual test salaries
r2_score(y_test, y_pred)

0.9779208335417601

#### R² on the test set is ≈ 0.978, i.e. the model explains about 97.8% of the variance in salary.