In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Partial Least Squares Regression (PLS)


PLS regression builds the model on a small number of components derived from the original variables, which avoids multicollinearity problems. The components are constructed so that they capture as much of the covariance with the dependent variable as possible.
We should use Cross-Validation (CV) to determine the optimum number of components.

## Model

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import model_selection
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict

In [None]:
# Load the Hitters data and keep only the rows without missing values.
# dropna() already returns a new frame, so `hit` itself is left untouched.
hit = pd.read_csv("/kaggle/input/hitters/Hitters.csv")
df = hit.dropna()

# One-hot encode the three categorical columns. dtype='float64' keeps the
# indicator columns numeric (recent pandas versions emit bool columns by
# default), so the feature matrix ends up with a single dtype, matching the
# explicit float64 cast applied to the other predictors below.
dms = pd.get_dummies(df[['League', 'Division', 'NewLeague']], dtype='float64')

# Target and predictors: the non-categorical columns plus one indicator per
# categorical column.
# NOTE(review): assumes each categorical column has exactly two levels, so a
# single indicator per column is sufficient -- confirm against the data.
y = df["Salary"]
X_ = df.drop(['Salary', 'League', 'Division', 'NewLeague'], axis=1).astype('float64')
X = pd.concat([X_, dms[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
from sklearn.cross_decomposition import PLSRegression, PLSSVD

In [None]:
# Build a PLS regression with the estimator's default settings, then fit it
# on the training split.
pls_model = PLSRegression()
pls_model.fit(X_train, y_train)

In [None]:
# Display the coefficients of the fitted PLS model.
pls_model.coef_

## Prediction

In [None]:
# Preview the first rows of the training predictors.
X_train.head()

In [None]:
# Show the first ten predictions the fitted model makes on the training split.
train_predictions = pls_model.predict(X_train)
train_predictions[:10]

In [None]:
# Predictions on the training split; the next cells use them for the
# training error metrics.
y_pred = pls_model.predict(X_train)

In [None]:
# Training RMSE: square root of the mean squared error between the observed
# training targets and the current predictions.
train_mse = mean_squared_error(y_train, y_pred)
np.sqrt(train_mse)

In [None]:
# R^2 on the training split (y_pred holds the training predictions at this
# point when the cells are run in order).
r2_score(y_train, y_pred)

In [None]:
# Overwrite y_pred with predictions on the held-out test split.
y_pred = pls_model.predict(X_test)

In [None]:
# Test RMSE: square root of the mean squared error on the held-out split.
test_mse = mean_squared_error(y_test, y_pred)
np.sqrt(test_mse)

## Model Tuning

In [None]:
# 10-fold cross-validation with a fixed shuffle so the folds are reproducible.
cv_10 = model_selection.KFold(n_splits=10, shuffle=True, random_state=1)

# Candidate numbers of components: 1 .. number of predictors. Computed once
# and reused for both the search loop and the x-axis of the plot.
n_components_grid = np.arange(1, X_train.shape[1] + 1)

# Cross-validated RMSE for each candidate. The scorer reports the negative
# MSE per fold, so the mean is negated before taking the square root.
RMSE = []
for i in n_components_grid:
    pls = PLSRegression(n_components=i)
    fold_scores = cross_val_score(pls, X_train, y_train, cv=cv_10,
                                  scoring='neg_mean_squared_error')
    score = np.sqrt(-1 * fold_scores.mean())
    RMSE.append(score)

# Report the candidate with the lowest cross-validated RMSE instead of
# relying only on a visual reading of the plot.
best_n_components = int(n_components_grid[np.argmin(RMSE)])
print(f"Lowest CV RMSE: {min(RMSE):.2f} at n_components = {best_n_components}")

# Visualization of results (explicit figure/axes rather than pyplot state).
fig, ax = plt.subplots()
ax.plot(n_components_grid, np.array(RMSE), '-v', c="r")
ax.set_xlabel('Components')
ax.set_ylabel('RMSE')
ax.set_title('Salary');

The graph shows the cross-validated RMSE of the salary predictions for each number of components. We can say that a small number of components, such as 2, gives a lower RMSE than the other settings.

In [None]:
# Refit the PLS model on the training split using two components.
pls_model = PLSRegression(n_components=2)
pls_model.fit(X_train, y_train)

In [None]:
# Test-split predictions from the model refitted in the previous cell
# (n_components = 2 when the cells are run in order).
y_pred = pls_model.predict(X_test)

In [None]:
# Test RMSE of the refitted model: square root of the mean squared error on
# the held-out split.
tuned_test_mse = mean_squared_error(y_test, y_pred)
np.sqrt(tuned_test_mse)