# Very Simple Regression

## 1) IMPORT LIBRARIES AND DATASETS

In [None]:
# Import Pkgs
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Colab-only: open the browser upload dialog so the dataset file can be copied
# into the runtime (salary.csv is read in the next cell).
from google.colab import files
files.upload()

In [None]:
# Load 'salary.csv' into a pandas DataFrame (relative path, so the file must be
# in the current working directory)
salary_df = pd.read_csv('salary.csv')

In [None]:
# Display the whole DataFrame
salary_df

In [None]:
# Show the first rows (head() defaults to 5)
salary_df.head()

In [None]:
# Show the last 8 rows
salary_df.tail(8)

In [None]:
# Check the minimum salary
salary_df['Salary'].min()

## 2) EDA and Visualizations

In [None]:
# Count the null (missing) values in each column
salary_df.isnull().sum()

In [None]:
# Print the DataFrame summary: column names, non-null counts and dtypes
salary_df.info()

In [None]:
# Statistical summary (count, mean, std, min, quartiles, max) of the numeric columns
salary_df.describe()

In [None]:
# number of years of experience corresponding to employees with maximim salary
max = salary_df[salary_df['Salary'] == salary_df['Salary'].max()]

In [None]:
max

In [None]:
# number of years of experience corresponding to employees with minimum salary
min = salary_df[salary_df['Salary'] == salary_df['Salary'].min()]

In [None]:
min

In [None]:
# Histograms (30 bins each) of the DataFrame's columns to show how the data is distributed
salary_df.hist(bins = 30, figsize = (20,10), color = 'r')

In [None]:
# Pairplot: pairwise scatter plots of the columns (variable relationships),
# with each column's own distribution on the diagonal
sns.pairplot(salary_df)

In [None]:
# Correlation matrix of the DataFrame columns, drawn as a heatmap with the
# coefficient written in each cell (annot = True)
# NOTE(review): on pandas >= 2.0, corr() raises if a non-numeric column is
# present -- pass numeric_only=True if the CSV ever gains one; confirm.
corr_matrix = salary_df.corr()
sns.heatmap(corr_matrix, annot = True)
plt.show()

In [None]:
# Seaborn regression plot: scatter of Salary against YearsExperience with a
# fitted straight line drawn over it
sns.regplot(x='YearsExperience',y='Salary',data=salary_df)

## 3) TRAINING AND TESTING DATA

In [None]:
# Feature (X) and target (y). The double brackets keep each one as a
# single-column DataFrame (2-D) instead of a Series.
X = salary_df[['YearsExperience']]
y = salary_df[['Salary']]

In [None]:
# Display the feature DataFrame
X

In [None]:
# Display the target DataFrame
y

In [None]:
# (rows, columns) of the feature DataFrame
X.shape

In [None]:
# (rows, columns) of the target DataFrame
y.shape

In [None]:
# Convert the single-column DataFrames to NumPy arrays
X = np.array(X) 
y = np.array(y)

In [None]:
# Shape of X after the conversion to a NumPy array
X.shape

In [None]:
# Split the data into train and test sets: 25% of the rows are held out for
# testing, and random_state fixes the shuffle so the split is reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=72)

In [None]:
# Shape of the training features (about 75% of the rows)
X_train.shape

In [None]:
# Shape of the test features (about 25% of the rows)
X_test.shape

In [None]:
# Print the full feature array in its original (pre-split) row order
print(X)

In [None]:
# We can see that the data has been shuffled by "train_test_split"
X_train

## 4) Train a Linear Regression Model

In [None]:
# Fit a linear regression model on the training split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

lr = LinearRegression(fit_intercept = True) # fit_intercept=True also estimates the intercept "b" in y = b + m*x
lr.fit(X_train, y_train)

In [None]:
# Evaluate on the held-out test set. Mean squared error is an error measure
# (lower is better, in squared salary units), not an accuracy score.
y_pred = lr.predict(X_test)
mse = mean_squared_error(y_test,y_pred)
mse

In [None]:
# lr.coef_ holds the slope (m) and lr.intercept_ the intercept (b) of the fitted line
print('Linear Model Coefficient (m): ', lr.coef_)
print('Linear Model Coefficient (b): ', lr.intercept_)

## 5) EVALUATE TRAINED MODEL PERFORMANCE

In [None]:
# Predicted salaries for X_test (computed in the MSE cell above)
y_pred

In [None]:
# Training data as a scatter plot, with the model's fitted line drawn over it
fig, ax = plt.subplots(figsize=(16, 8))
ax.scatter(X_train, y_train, color='gray')
ax.plot(X_train, lr.predict(X_train), color='red')
ax.set_xlabel('Number of Years of Experience')
ax.set_ylabel('Salary')
ax.set_title('Salary vs. Years of Experience')

## 6) Making Predictions
 - Use the trained model to obtain the salary corresponding to employees who have 5 years of experience

In [None]:
# Predict the salary for 5 years of experience. The input is a nested list
# because predict() expects a 2-D array of shape (n_samples, n_features).
new_value = [[5.0]]
new_prediction = lr.predict(new_value)
new_prediction

## 7) Save the Model

In [None]:
import joblib

# Serialize the trained model to disk. The `with` block guarantees the file
# handle is closed even if joblib.dump raises part-way through.
with open("linear_regression_salary.pkl", "wb") as model_file:
    joblib.dump(lr, model_file)