<img src=https://lstms-brainybits.s3.ap-south-1.amazonaws.com/green+logo.png width="300" height="200" style="float: left; margin-right: 8px;">

We believe in empowering individuals with the
knowledge and skills they need to become lifelong
learners. Our self-learning website offers a diverse
range of high-quality, interactive courses.
**All rights reserved to BrainyBits. Unauthorized reproduction or distribution of this content is prohibited.** 

# Simple Linear Regression for Predicting Salary

### 1) Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every warning for the rest of the session. NOTE(review): this also
# hides deprecation warnings from pandas/seaborn, so API changes go unnoticed.
warnings.filterwarnings("ignore") 

### 2) Reading dataset

In [None]:
# Load the salary dataset into a DataFrame. The path is relative, so
# Salary_Data.csv must be in the notebook's working directory.
df = pd.read_csv("Salary_Data.csv")

### 3) Exploratory Data Analysis

In [None]:
# Preview the first rows of the dataset (head() shows 5 rows by default).
df.head()

In [None]:
# Preview the last rows of the dataset (tail() shows 5 rows by default).
df.tail()

In [None]:
df.sample(10)   # Random sample of 10 rows; no random_state is set, so the rows differ on every run

In [None]:
df.shape        # (number of rows, number of columns) of the dataset

In [None]:
# Column names, data types and non-null counts of the dataset

df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

In [None]:
# Same summary statistics, transposed so that each column of the dataset becomes a row
df.describe().T

In [None]:
# Check YearsExperience for outliers with a box plot.

fig, axs = plt.subplots(1, figsize=(5, 5))
# Pass the series as `x=` and name the target axes explicitly. A positional
# first argument is read as `x` by seaborn < 0.12 but as `data` by
# seaborn >= 0.12, so the plot orientation would depend on the installed version.
plt1 = sns.boxplot(x=df['YearsExperience'], ax=axs)
plt.tight_layout()

In [None]:
# Count the missing (null/NaN) values in each column.

df.isnull().sum()

In [None]:
# Count the cells equal to zero in each column; the comparison yields a
# boolean frame and sum() counts the True entries per column.

(df == 0).sum()   

In [None]:
# Frequency of each distinct salary value, most frequent first
df.Salary.value_counts()

In [None]:
# Number of distinct salary values in the data.
# (value_counts().sum() adds the per-value frequencies back together, which
# gives the number of non-null rows rather than the number of unique values.)
df.Salary.nunique()

In [None]:
# Correlation of input and output columns

corrmat = df.corr()                  # pairwise correlation of all the columns
top_corr_features = corrmat.index    # labels of the columns that take part in the correlation
plt.figure(figsize=(5, 5))

# Heatmap of the correlation matrix computed above (no need to recompute it).
# annot=True writes the coefficient inside each cell; cmap selects the colour map.
g = sns.heatmap(corrmat, annot=True, cmap="RdYlGn")

In [None]:
df.corr()   # Pairwise correlation between the columns (Pearson is the pandas default)

In [None]:
# Grid of pairwise plots for the columns of df.
# NOTE(review): seaborn is already imported in the first cell; this re-import
# only matters if this cell is run on its own.
import seaborn as sns
sns.pairplot(df)

In [None]:
# Visualise the relationship between experience and salary with a scatter plot.

# Feature and target series; `y` is also used by the later OLS cell.
X = df['YearsExperience']
y = df['Salary']

plt.figure(figsize=(6, 6))
plt.scatter(X, y, label="label_name")

# Title and axis labels
plt.title('Scatter Plot')
plt.xlabel('YearsExperience')
plt.ylabel('Salary')

plt.show()

### 4) Model Building

## Simple Linear Regression Model using Classic methods (manual calculation)

In [None]:
# Side-by-side view of the feature (X) and the target (Y) used in the manual
# calculation. The column labels carry no trailing whitespace, so they can be
# looked up exactly as they read.
df1 = pd.DataFrame({
    'YearsExperience as X': df['YearsExperience'],
    'Salary as Y': df['Salary'],
})
df1.head(10)

In [None]:
# Calculating the mean of X and Y (used by the slope/intercept formula below).
# The full-precision means are kept; rounding is applied only for display.

mean_x = df['YearsExperience'].mean()
mean_y = df['Salary'].mean()

print(round(mean_x, 2), round(mean_y, 2))

In [None]:
X = df['YearsExperience'].values
Y = df['Salary'].values
n = len(X)


# Least-squares estimates for the line y = m*x + c:
#   m = sum((x - mean_x) * (y - mean_y)) / sum((x - mean_x)**2)
#   c = mean_y - m * mean_x
# mean_x and mean_y come from the previous cell.

x_dev = X - mean_x
y_dev = Y - mean_y
numer = (x_dev * y_dev).sum()
denom = (x_dev ** 2).sum()

# The slope is undefined when X has no spread (empty or constant column).
if denom == 0:
    raise ValueError("Cannot fit a line: YearsExperience has no variance")

# m and c are computed once, from the complete sums, rather than on every
# pass of an accumulation loop.
m = numer / denom
c = mean_y - (m * mean_x)

In [None]:
# Predict the salary for 1.3 years of experience with the manually fitted line.
# The result gets its own name so that `Y`, the array of actual salaries,
# is not overwritten by a single number.
y_manual_pred = m * 1.3 + c

print(m)
print(c)
print(y_manual_pred)

## Simple Linear Regression Model using OLS method

In [None]:
# OLS = ordinary least squares.
# NOTE(review): X and y are whatever earlier cells left in the session
# (X is last assigned in the manual-calculation cell, y in the scatter-plot
# cell) -- run those cells first.

import statsmodels.api as sm
X2 = sm.add_constant(X)          # prepend a constant column so the model fits an intercept
est = sm.OLS(y,X2)               # model definition: endog (target) first, exog (features) second
est2 = est.fit()                 # fitted results object
est2.summary()                   # regression summary table

## Simple Linear Regression model using sklearn

In [None]:
import sklearn.linear_model as skl_lm
from sklearn.linear_model import LinearRegression

# scikit-learn expects a 2-D feature matrix (n_samples, n_features), so the
# single feature column is selected as a one-column frame and converted.
X = df[['YearsExperience']].to_numpy()
y = df['Salary']

regr = LinearRegression()
regr.fit(X, y)

In [None]:
# Intercept of the fitted line (the constant term, comparable to `c` from the manual calculation)
regr.intercept_

In [None]:
# Slope of the fitted line, one coefficient per feature (comparable to `m` from the manual calculation)
regr.coef_

In [None]:
# Coefficient of determination (R²) of the model, evaluated on the same data it was fitted on
regr.score(X, y)

In [None]:
# R² of the model expressed as a percentage. For a regressor, score() returns
# the coefficient of determination, not a classification "accuracy".

(regr.score(X,y) ) * 100             

## Observation

### We can see that the values from the classical method and the sklearn model are the same.

In [None]:
# Predict the salary for 1.1 years of experience. predict() takes a 2-D
# array-like (samples x features), hence the nested list for a single sample.

regr.predict([[1.1]])

In [None]:
# Predict the salary for every row with one batched predict() call instead of
# two calls per row. The predictions are reshaped to one row per sample so
# that list1 still holds one single-element array per observation.
predictions = regr.predict(df[['YearsExperience']].to_numpy())

list1 = []
for row in predictions.reshape(-1, 1):
    print(row)
    list1.append(row)

In [None]:
from itertools import chain

# list1 holds one single-element array per row; chain them into one flat
# sequence and convert the numpy scalars to plain Python floats.
flatten_list = list(chain.from_iterable(list1))
results = [float(value) for value in flatten_list]
results

In [None]:
# Compare actual and predicted salaries; the difference is shown to 2 decimals.
salary_gap = df['Salary'] - results
df1 = pd.DataFrame({
    'Actual': df['Salary'],
    'Predicted': results,
    'Difference': salary_gap.round(2),
})
df1.head(10)


### 5) Model Evaluation with Regression Metrics

#### 1) Sum of residuals

In [None]:
# Sum of residuals (actual - predicted); for a least-squares fit with an
# intercept this should be approximately zero.
# Only the final total is rounded: rounding each residual first would add up
# the per-row rounding errors.

round((df['Salary'] - results).sum(), 2)

#### 2) Sum of squared residual errors

In [None]:
# Sum of squared residual errors.
# SRS holds the squared residual of every row. The residuals are squared at
# full precision, because rounding them first would distort the total.
SRS = (df['Salary'] - results) ** 2
SRS.sum()

#### 3) R2 Value

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Predicted salary for every row of the feature matrix, scored against the
# actual salaries. R² = 1 - (residual sum of squares / total sum of squares).
salary_pred = regr.predict(X)
r2_score(df['Salary'], salary_pred)

#### 4) Regression plot

In [None]:
# Overlay two first-order regression plots on the same axes:
#   red   -> actual salaries
#   green -> salaries predicted by the model
years = df['YearsExperience']
for target, colour in ((df['Salary'], 'r'), (results, 'g')):
    sns.regplot(x=years, y=target, order=1, ci=None, scatter_kws={'color': colour, 's': 9})

# Fixed viewing window: x from 2 to 10, y from 30000 upwards.
plt.xlim(2, 10)
plt.ylim(bottom=30000)

plt.show()