In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

In [None]:
# Location of the salary dataset, relative to the notebook's working directory.
SALARY_CSV_PATH = '../input/salary-data-simple-linear-regression/Salary_Data.csv'

# Load the CSV into a DataFrame; every later cell reads from `salary_data`.
salary_data = pd.read_csv(SALARY_CSV_PATH)

In [None]:
# Preview the first rows to sanity-check that the file was read as expected.
salary_data.head()

In [None]:
# Print a per-column summary (non-null counts and dtypes) to spot missing
# values or columns that were parsed with an unexpected type.
salary_data.info()

In [None]:
# Descriptive statistics for the columns (count, mean, std, min, quartiles, max).
salary_data.describe()

In [None]:
# (number of rows, number of columns) of the loaded DataFrame.
salary_data.shape

In [None]:
# Pairwise correlation between the DataFrame's columns (pandas default method).
salary_data.corr()

In [None]:
# Feature and target, each selected as a single column (1-D Series).
X = salary_data.loc[:, 'YearsExperience']
y = salary_data.loc[:, 'Salary']

# 70% of the rows go to the training split and the remainder to the test
# split; the fixed random_state makes the shuffle reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=100,
)

In [None]:
# Scatter of the target against the feature for the whole dataset.
plt.scatter(x=X, y=y)

In [None]:
# Scatter plus seaborn's fitted regression line; the columns are referenced
# by name and resolved against `salary_data` (the same columns X and y hold).
sns.regplot(x='YearsExperience', y='Salary', data=salary_data)

In [None]:
# Correlation matrix of the columns, drawn as a heatmap with each cell's
# value written inside it.
corr_matrix = salary_data.corr()
sns.heatmap(corr_matrix, annot=True)

### Model Creation

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Shapes of the feature splits before reshaping, one per line (train, then test).
print(X_train.shape, X_test.shape, sep='\n')

In [None]:
# Turn each 1-D feature vector into a single-column 2-D array by appending a
# new axis; the estimator fitted below is given these 2-D arrays.
X_train_lm = X_train.values[:, np.newaxis]
X_test_lm = X_test.values[:, np.newaxis]

In [None]:
# Shapes after reshaping, one per line (train, then test).
print(X_train_lm.shape, X_test_lm.shape, sep='\n')

In [None]:
# Create an unfitted linear regression estimator with its default settings.
lm = LinearRegression()

In [None]:
# Fit the regression on the training split only; the test split stays unseen.
lm.fit(X=X_train_lm, y=y_train)

In [None]:
# Fitted slope (coef_) on the first line, intercept on the second.
print(lm.coef_, lm.intercept_, sep='\n')

In [None]:
# Predictions from the fitted model for the training split and the test
# split, in that order.
y_train_pred, y_test_pred = (
    lm.predict(features) for features in (X_train_lm, X_test_lm)
)

In [None]:
# Training split: observed points, with the model's predictions drawn as a
# red line on top.
plt.scatter(x=X_train, y=y_train)
plt.plot(X_train_lm, y_train_pred, color='r')

In [None]:
# Test split: observed points, with the model's predictions drawn as a
# blue line on top.
plt.scatter(x=X_test, y=y_test)
plt.plot(X_test_lm, y_test_pred, color='b')

##### R-squared calculation

In [None]:
from sklearn.metrics import r2_score

In [None]:
# R-squared on the training split first, then on the test split.
for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred)):
    print(r2_score(actual, predicted))

In [None]:
# Slope and intercept of the fitted line. coef_ is an array (one entry per
# feature), so each result below prints as an array rather than a bare number.
m = lm.coef_
c = lm.intercept_

# Evaluate the line equation c + m * x by hand for 5, 1, 2.5 and 0 years of
# experience; at 0 years the result is just the intercept.
for years in (5, 1, 2.5, 0):
    print(c + m * years)

In [None]:
# Predict for every observation (reshaped to a single-column 2-D array, as
# the model was fitted on 2-D input) so the fitted line spans all the data.
y_pred = lm.predict(X.values.reshape(-1,1))

# The observations are individual measurements, not a curve, so they are drawn
# as points; joining them with line segments would suggest a trend between
# neighbouring rows that the data does not contain. The predictions of a
# linear model all lie on one straight line, so they are drawn as a line.
plt.scatter(X, y, label="Actual salary")
plt.plot(X, y_pred, color='r', label="Predicted salary (linear fit)")

# Descriptive labels so the figure can be read on its own.
plt.xlabel('Years of experience')
plt.ylabel('Salary')
plt.title('Actual vs. predicted salary')
plt.legend()
plt.show()

##### Model Creation using statsmodels (OLS)

In [None]:
import statsmodels.api as sm

# Add an explicit constant column so the OLS fit includes an intercept term.
X_sm = sm.add_constant(X)

# Build and fit in one chain so `olm` only ever refers to the fitted results.
# NOTE(review): this fit uses the full X and y, not the X_train / y_train
# split that the scikit-learn model `lm` was fitted on -- confirm intended.
olm = sm.OLS(endog=y, exog=X_sm).fit()

# Regression summary table (coefficients, R-squared, diagnostics).
olm.summary()