##### Import Required Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

##### Load the Data and check the records

In [None]:
# Read the salary CSV into a DataFrame and preview its first rows
data = pd.read_csv('../input/salary-data-simple-linear-regression/Salary_Data.csv')
data.head()

##### Check what your data has

In [None]:
# Column names, non-null counts and dtypes
data.info()

##### Perform statistical Analysis on Numerical Data

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column
data.describe()

##### Check for Null Values

In [None]:
# Number of missing values in each column
data.isnull().sum()

##### Check for Outliers

In [None]:
# Box plot of Salary to spot outliers. The column is passed as `x=` explicitly:
# seaborn warns about (0.11) or reinterprets as `data` (0.12+) a bare
# positional vector, so the keyword keeps the plot consistent across versions.
sns.boxplot(x=data['Salary'])

##### As we see no outliers, check the distribution of the data.
##### One way to check whether your data roughly follows a Gaussian distribution is to look at its skewness.
##### If the skewness lies between -0.5 and +0.5, the data is approximately symmetric, and we treat it as close to normally distributed (skewness alone does not prove normality).

In [None]:
# Skewness of YearsExperience, then its kernel density estimate
print(data['YearsExperience'].skew())
data['YearsExperience'].plot.kde()

In [None]:
# Skewness of Salary, then its kernel density estimate
print(data['Salary'].skew())
data['Salary'].plot.kde()

In [None]:
# Check for Correlation
# Pairwise correlation between the columns (pandas defaults to Pearson)
data.corr()

In [None]:
# Keep the correlation matrix for later plotting and render it as a
# colour-coded table
corr = data.corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
import statsmodels.api as sm

In [None]:
# Heat-map of the correlation matrix, labelled with the column names
sm.graphics.plot_corr(corr, xnames=corr.columns.tolist())
plt.show()

##### Plotting the relationship

In [None]:
# Scatter plot of Salary against YearsExperience. x and y are keyword-only
# in seaborn >= 0.12, so passing the two columns positionally raises TypeError
# there; naming the columns and passing the frame works on old and new versions.
sns.scatterplot(x='YearsExperience', y='Salary', data=data)

In [None]:
data.describe()

In [None]:
coeff_of_var = data.std()/data.mean()
coeff_of_var

#### OLS (Ordinary Least Squares)

In [None]:
# Predictor (years of experience) and response (salary)
x, y = data['YearsExperience'], data['Salary']

##### The general equation is y=b1x+b0, Where b1 = slope and b0 = Intercept

In [None]:
b1 = np.sum((x-x.mean())*(y-y.mean()))/np.sum((x-x.mean())**2)
b1

In [None]:
b0 = y.mean()-b1*x.mean()
b0

In [None]:
ypred = b1*x+b0

##### Residuals are the per-observation errors: the actual value minus the predicted value (y - ypred)

In [None]:
residue = y-ypred

##### Sum of Squared Error

In [None]:
sse = np.sum((y-ypred)**2)
sse

##### Mean Squared Error

In [None]:
mse = np.mean((y-ypred)**2)
mse

##### Root Mean Squared Error

In [None]:
rmse = np.sqrt(mse)
rmse

##### Total Sum of Squares (SST): total variation of y around its mean

In [None]:
sst = np.sum((y.mean()-y)**2)
sst

In [None]:
ssr = np.sum((y.mean()-ypred)**2)
ssr

In [None]:
sse = np.sum((y-ypred)**2)
sse

##### Variance explained by our model

In [None]:
ssr/sst

##### Plotting the fitted regression line together with the mean line (horizontal, red), which is the worst-fit baseline and serves as the Null Hypothesis H0

In [None]:
# Observed points as stars
plt.plot(x, y, '*')
# Fitted regression line
plt.plot(x, b1 * x + b0)
# Horizontal red line at the mean of y (the baseline model)
plt.axhline(y=y.mean(), color='r')

In [None]:
x_c = sm.add_constant(x)

In [None]:
ols_model = sm.OLS(y,x_c).fit()

In [None]:
ols_model.summary()

In [None]:
# New experience values to score; the 'const' column of ones comes first so
# the frame has the same column layout as the design matrix used for fitting
data_to_predict = pd.DataFrame(
    {
        'const': 1,
        'YearsExperience': [3.8, 2, 4.5, 9, 15],
    }
)
predicted_results = ols_model.predict(exog=data_to_predict)
predicted_results