In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import linear_model
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

## Importing and Understanding the Data

In [None]:
# Location of the salary dataset inside the Kaggle input directory.
path = "/kaggle/input/salary/Salary.csv"
# Read the CSV into the DataFrame used by the rest of the notebook.
df = pd.read_csv(filepath_or_buffer=path)

In [None]:
# Preview the first rows of the freshly loaded dataset.
df.head()

In [None]:
# List the column labels of the dataset.
df.columns

In [None]:
# Switch to short snake_case column names; rebinding `df` keeps the cell
# safe to re-run (labels that are already renamed are simply left alone).
column_names = {"YearsExperience": "work_ex", "Salary": "salary"}
df = df.rename(columns=column_names)

In [None]:
# Preview the first rows again to check the renamed columns.
df.head()

In [None]:
# (number of rows, number of columns) of the dataset.
df.shape

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

## Visualizing with Plots

In [None]:
import plotly.express as px

# Histogram of the salary column in 10 bins; the x axis is relabelled "Salary".
salary_axis_labels = {"x": "Salary"}
px.histogram(x=df["salary"], nbins=10, labels=salary_axis_labels)

The histogram shows how the data points are distributed across salary intervals.

In [None]:
# Scatter plot of salary against years of work experience.
px.scatter(data_frame=df, x="work_ex", y="salary")

The scatter plot shows a clear positive slope, which suggests that linear regression is a good fit for this dataset.

In [None]:
# Scatter of salary against work experience with a fitted regression line.
# Columns are referenced by name and resolved through `data=df`.
plt.figure(figsize=(10, 6))
sns.regplot(data=df, x="work_ex", y="salary")

Fitting the regression line on the plot.

In [None]:
# Draw one box plot per numeric column, each on its own y-axis.
# Plotting every column on a single shared axis (sns.boxplot(data=df)) squashes
# any column whose values are much smaller than the others, which hides its
# box and makes outliers in that column impossible to see.
numeric_cols = df.select_dtypes(include="number").columns
# squeeze=False always returns a 2-D array of axes, even for a single column.
fig, axes = plt.subplots(1, len(numeric_cols), figsize=(10, 6), squeeze=False)
for ax, col in zip(axes[0], numeric_cols):
    sns.boxplot(y=df[col], ax=ax)
    ax.set_title(col)
fig.tight_layout()

The box plots show no outliers in the given data.

## Creating a regression object

In [None]:
# Ordinary least squares with one feature (work experience) and salary as the
# target. The double brackets keep the feature as a 2-D DataFrame.
features = df[["work_ex"]]
target = df["salary"]
reg = linear_model.LinearRegression()
reg.fit(features, target)

In [None]:
# coef_ is an array with one slope per feature; this model has one feature.
coeff = reg.coef_
intercept = reg.intercept_
# Print the fitted line using the scalar slope coeff[0]; formatting the whole
# array would put its "[...]" repr in the middle of the equation.
print(f"salary = {coeff[0]} * work_ex + {intercept}")

In [None]:
# Predict the salary for an employee with 1 year of work experience.
# The model was fitted on a DataFrame, so the input is a one-row DataFrame with
# the same column name; a bare nested list makes scikit-learn warn that
# "X does not have valid feature names".
reg.predict(pd.DataFrame({"work_ex": [1]}))

In [None]:
# Predict the salary for an employee with 11.5 years of work experience.
# The model was fitted on a DataFrame, so the input is a one-row DataFrame with
# the same column name; a bare nested list makes scikit-learn warn that
# "X does not have valid feature names".
reg.predict(pd.DataFrame({"work_ex": [11.5]}))

In [None]:
# R^2 of the model on the data it was fitted on.
# score() requires both the features and the true targets; calling it with no
# arguments raises a TypeError and stops a Run All.
reg.score(df[['work_ex']], df.salary)

## Training and Splitting

In [None]:
# Feature as a 2-D column array (reshape(-1, 1) gives one column) and the
# target as a 1-D array.
X = df["work_ex"].to_numpy().reshape(-1, 1)
y = df["salary"].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split
# 70% of the rows go to the training split and the rest to the test split;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=100,
)

The data is divided into 4 sets: X_train, X_test, y_train, y_test, where X_train & y_train are used for training the model.

X_test and y_test are used for testing our model and checking the accuracy of its predictions.

In [None]:
# Re-fit the existing estimator on the training split only. fit() returns the
# estimator itself, so `model` and `reg` refer to the same fitted object.
reg.fit(X_train, y_train)
model = reg

In [None]:
# R^2 of the fitted model evaluated on the training split.
score = reg.score(X_train,y_train)
print(score)

## Prediction for test data

In [None]:
# Predicted salaries for the held-out test features.
pred = reg.predict(X_test)

In [None]:
# Side-by-side table of actual and predicted salaries for the test split.
pred_data = dict(y_test=y_test, y_pred=pred)
pd.DataFrame(pred_data)

In [None]:
from sklearn.metrics import r2_score
# Coefficient of determination (R^2) of the predictions on the test split.
r2 = r2_score(y_true=y_test, y_pred=pred)
print(f"R2 score: {r2}")

The R2 score we got is high, which means that our model is working well.

Check Out my Multiple Linear Regression Notebook
- https://www.kaggle.com/amartyanambiar/beginner-multiple-linear-regression

Do you want to brush up your NumPy basics?
- https://www.kaggle.com/amartyanambiar/numpy-brush-up