**Problem Statement:**
You work at XYZ Company as a Python Data Scientist. The company has collected data 
on salaries versus years of experience and wants you to build a model from it.

**Dataset:** assignment_1.csv

**Tasks to be performed:**
1. Load the dataset using pandas
2. Extract the data from the YearsExperience column into a variable named X
3. Extract the data from the Salary column into a variable named Y
4. Divide the dataset into two parts for training and testing in 66% and 33% proportion
5. Create and train LinearRegression Model on training set
6. Make predictions based on the testing set using the trained model
7. Check the performance by calculating the r2 score of the model

In [1]:
# import libraries
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Read the assignment CSV into a DataFrame; the path is relative to the
# notebook's working directory.
DATA_PATH = 'data/assignment_1.csv'
dataset = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows of the loaded data.
dataset.head()

Unnamed: 0,YearsExperience,Salary
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0


In [4]:
# Summary statistics of the raw (unscaled) columns.
dataset.describe()

Unnamed: 0,YearsExperience,Salary
count,30.0,30.0
mean,5.313333,76003.0
std,2.837888,27414.429785
min,1.1,37731.0
25%,3.2,56720.75
50%,4.7,65237.0
75%,7.7,100544.75
max,10.5,122391.0


In [5]:
# Rescale every column of `dataset` to the [0, 1] range with MinMaxScaler.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split done later in the notebook, so test rows influence the
# scaling parameters -- confirm this is intended.
scaler = MinMaxScaler()
scaler.fit(dataset)
processing_model = scaler  # fit() returns the scaler itself, so this is the same object
scaled_dataset = scaler.transform(dataset)
scaled_dataset

array([[0.        , 0.01904087],
       [0.0212766 , 0.1000945 ],
       [0.04255319, 0.        ],
       [0.09574468, 0.06843846],
       [0.11702128, 0.02551382],
       [0.19148936, 0.22337586],
       [0.20212766, 0.26481219],
       [0.22340426, 0.19742499],
       [0.22340426, 0.31554453],
       [0.27659574, 0.229837  ],
       [0.29787234, 0.30105126],
       [0.30851064, 0.21335932],
       [0.30851064, 0.22709662],
       [0.31914894, 0.2285613 ],
       [0.36170213, 0.27616348],
       [0.40425532, 0.35680369],
       [0.42553191, 0.33425467],
       [0.44680851, 0.53575478],
       [0.5106383 , 0.51537916],
       [0.5212766 , 0.66393811],
       [0.60638298, 0.63792818],
       [0.63829787, 0.7151193 ],
       [0.72340426, 0.75089771],
       [0.75531915, 0.89866525],
       [0.80851064, 0.84691708],
       [0.84042553, 0.80145287],
       [0.89361702, 0.93595559],
       [0.90425532, 0.88476258],
       [0.9787234 , 1.        ],
       [1.        , 0.9938696 ]])

In [6]:
# Wrap the scaled array back into a DataFrame. The labels come from `dataset`
# (the frame the scaler transformed) rather than being hard-coded, so column
# names and the row index always line up with the source data.
final_df = pd.DataFrame(scaled_dataset, columns=dataset.columns, index=dataset.index)
final_df.head()

Unnamed: 0,YearsExperience,Salary
0,0.0,0.019041
1,0.021277,0.100094
2,0.042553,0.0
3,0.095745,0.068438
4,0.117021,0.025514


In [7]:
# Summary statistics of the scaled columns.
final_df.describe()

Unnamed: 0,YearsExperience,Salary
count,30.0,30.0
mean,0.448227,0.452067
std,0.301903,0.323818
min,0.0,0.0
25%,0.223404,0.224306
50%,0.382979,0.3249
75%,0.702128,0.741953
max,1.0,1.0


In [16]:
# Feature (X) and target (Y) columns, then a hold-out split:
# test_size=0.33 keeps roughly one third of the rows for testing, and the
# fixed random_state makes the partition reproducible.
X, Y = final_df['YearsExperience'], final_df['Salary']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=1,
)

# Report the size of each piece of the split.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(20,)
(10,)
(20,)
(10,)


In [17]:
# Reshape each split into a 2-D column array of shape (n_samples, 1).
# np.asarray accepts both the pandas Series produced by the split and an
# already-reshaped ndarray, so this cell can be re-run without error
# (an ndarray has no `.values` attribute).
X_train = np.asarray(X_train).reshape(-1, 1)
X_test = np.asarray(X_test).reshape(-1, 1)
y_train = np.asarray(y_train).reshape(-1, 1)
y_test = np.asarray(y_test).reshape(-1, 1)

In [18]:
# Build the linear regression model and fit it on the training split
# (fit() returns the fitted estimator, so construction and training chain).
regressor = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out test rows.
y_pred = regressor.predict(X_test)

In [19]:
# Evaluate the fit with the coefficient of determination (R^2) on the test split.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print('R2 Score: ', r2)

R2 Score:  0.9240850478446319
