Data Preprocessing

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

# Load the data: every column except the last is a feature, the last column is the target.
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
# X is already 2D; only the target needs reshaping into a single-column 2D array,
# because StandardScaler expects 2D input.
y = dataset.iloc[:, -1].values.reshape(-1, 1)

# Split BEFORE scaling so the mean and standard deviation used for scaling are
# computed from the training rows only (20% of the rows are held out as the test set).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Scale both the features and the target. Each gets its own StandardScaler instance
# because the features and the target have different means and standard deviations.
# Only the training arrays are transformed here; X_test and y_test stay unscaled.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler().fit(X_train)
sc_y = StandardScaler().fit(y_train)
X_train = sc_X.transform(X_train)
y_train = sc_y.transform(y_train)

---

Training the SVR model on the training set

In [2]:
from sklearn.svm import SVR

# Use the RBF (radial basis function) kernel, chosen on the assumption that the
# relationship between the features and the target is non-linear.
regressor = SVR(kernel='rbf')

# y_train is a single-column 2D array (kept 2D for StandardScaler), while SVR.fit
# expects a 1D target of shape (n_samples,). Flatten it explicitly instead of relying
# on scikit-learn's implicit conversion, which emits a DataConversionWarning that the
# notebook's global warnings filter would otherwise hide.
regressor.fit(X_train, y_train.ravel())

SVR()

---

Predicting Test set results

In [4]:
# Scale the test features with the scaler fitted on the training features, then predict.
# The model was trained on a scaled target, so these predictions are in scaled units.
y_pred_scaled = regressor.predict(sc_X.transform(X_test))

# Map the predictions back to the target's original units with sc_y.
# predict() returns a 1D array, but StandardScaler.inverse_transform expects a 2D
# (n_samples, n_features) array and raises ValueError on 1D input in current
# scikit-learn releases. Reshape to a single column first; ravel() then restores
# the 1D shape for downstream use.
y_pred = sc_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

# Show predictions (left column) next to the true test values (right column).
print(np.concatenate((y_pred.reshape(-1, 1), y_test.reshape(-1, 1)), axis=1))

[[434.05242921 431.23      ]
 [457.93810186 460.01      ]
 [461.03113894 461.14      ]
 ...
 [470.60268461 473.26      ]
 [439.41653548 438.        ]
 [460.91757115 463.28      ]]


---

Evaluating Model Performance

In [5]:
from sklearn.metrics import r2_score

# Coefficient of determination (R^2) of the predictions against the held-out test
# targets, both in the target's original (unscaled) units.
r2_score(y_true=y_test, y_pred=y_pred)

0.948078404998626