In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the Hitters data set, relative to the notebook's working directory.
HITTERS_CSV = "../input/hitters/Hitters.csv"

# Load the raw data and preview the first rows.
df = pd.read_csv(HITTERS_CSV)
df.head()

In [None]:
# Count the missing values in each column (isna is the same check as isnull).
df.isna().sum()


It appears that there are null values in the "Salary" variable. Let's fill them with the average value of that variable using the "fillna" function.

In [None]:
# Replace missing values with the mean of their column.
# numeric_only=True restricts the means to the numeric columns: the frame also
# holds the string columns League, Division and NewLeague, and pandas >= 2.0
# raises a TypeError when asked to average those.
# The result is reassigned instead of mutated in place, so the cell states its
# effect on `df` explicitly.
df = df.fillna(df.mean(numeric_only=True))

# Editing The Dataset

### Converting Categorical Variables To Numeric variables

Categorical variables are transformed with the "get_dummies" function in pandas

In [None]:
# Names of the categorical columns that have to be one-hot encoded.
cat_variables = ["League", "Division", "NewLeague"]

# Encode only the categorical part of the frame and preview the result.
categorical_part = df[cat_variables]
dums = pd.get_dummies(categorical_part)
dums.head()

After the categorical variables are converted, one of the new dummy variables created for each original categorical variable is selected and deleted.

This is safe because the value of a deleted dummy variable can always be inferred from the remaining dummy variables of the same original variable.

For example, if all non-deleted variables are 0, it means that the deleted variable must be 1.

In [None]:
# One dummy per original variable is redundant; keep only the informative ones.
redundant_dummies = ["League_N", "Division_W", "NewLeague_N"]
dums = dums.drop(columns=redundant_dummies)

Then, a new data set is created by combining the data set with the newly created variables. The old categorical variables that have been transformed are removed from the data set because they are no longer needed.

In [None]:
# Append the dummy columns to the data and remove the original categorical
# columns, which the dummies now replace.
final_df = pd.concat([df, dums], axis=1).drop(columns=cat_variables)
final_df.head()

### Visualization

In [None]:
import seaborn as sns

In [None]:
# Distribution of the "Salary" variable.
# histplot is the replacement for the deprecated distplot; kde=False keeps the
# plot a plain histogram, as before.
sns.histplot(df.Salary, kde=False);

# PCR

In [None]:
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, scale


Let's choose the dependent and independent variables.

In [None]:
# The target (dependent variable) is Salary; every other column is a predictor.
y = final_df["Salary"]
x = final_df.drop(columns="Salary")


In this data set, we took the variable "Salary" as the dependent variable.

Now we need to divide our data set into a "train" set and a "test" set: the model is trained on the "train" set and then evaluated on the "test" set to measure its error.

For this we will use the "train_test_split" function in the "scikit-learn" module.

In [None]:
from sklearn.model_selection import train_test_split
# Hold out a quarter of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=100,
)

Before applying "PCR" to this data set, the independent variables must be standardized (for example with scikit-learn's "scale" function), because PCA is sensitive to the scale of each variable.

In [None]:
# Standardise the predictors and rotate them onto principal components.
# Both transforms are fitted on the training data only and then re-used,
# unchanged, on the test data. Fitting a second scaler/PCA on the test set (as
# was done before) yields components that do not correspond to the training
# components, so a regression fitted on the training components cannot be
# applied to them meaningfully. It also lets test data influence the
# preprocessing.
scaler = StandardScaler()
pca = PCA()
x_train_scaled = pca.fit_transform(scaler.fit_transform(x_train))
x_test_scaled = pca.transform(scaler.transform(x_test))

In [None]:
# Baseline PCR: ordinary least squares on every principal component.
baseline_regressor = LinearRegression()
pcr_model = baseline_regressor.fit(x_train_scaled, y_train)

# Root-mean-squared error of the baseline on the held-out test set.
y_pred = pcr_model.predict(x_test_scaled)
test_mse = mean_squared_error(y_test, y_pred)
np.sqrt(test_mse)


After setting up our model on all components, we looked at its baseline test error (RMSE). Let's now use cross-validation and a plot to find the best number of components for this model.

In [None]:
from sklearn import model_selection
# 10-fold cross-validation splitter; rows are shuffled with a fixed seed so the
# folds are the same on every run.
cv_10 = model_selection.KFold(
    n_splits=10,
    shuffle=True,
    random_state=1,
)

In [None]:
# Cross-validated RMSE of a linear regression on the first i principal
# components, for every possible i.
lm = LinearRegression()
component_counts = np.arange(1, x_train_scaled.shape[1] + 1)

# Series.ravel() is deprecated in recent pandas; to_numpy() gives the same
# 1-D array of targets.
y_train_values = y_train.to_numpy()

RMSE = []
for i in component_counts:
    neg_mse = model_selection.cross_val_score(lm,
                                              x_train_scaled[:, :i],
                                              y_train_values,
                                              cv=cv_10,
                                              scoring="neg_mean_squared_error")
    # The scorer returns negated MSE values, so flip the sign before the root.
    RMSE.append(np.sqrt(-1 * neg_mse.mean()))

# Plot against the actual number of components (1..p). Plotting the list alone
# would label the points 0..p-1 and shift every reading by one component.
plt.plot(component_counts, RMSE, "-v")
plt.xlabel("Number of principal components")
plt.ylabel("Cross-validated RMSE")
plt.title("PCR: 10-fold CV error by number of components");

The loop above fits a model on the first i principal components, for every possible i, and records the cross-validated error (RMSE) of each. We then set up a new model using the number of components with the lowest error in this graph.

As seen in the graph, the lowest error is obtained at "17", so the model is refit using the first 17 principal components.

In [None]:
# Refit on the first 17 principal components and report the training RMSE.
train_components = x_train_scaled[:, :17]
pcr_model = lm.fit(train_components, y_train)
y_pred = pcr_model.predict(train_components)
train_mse = mean_squared_error(y_train, y_pred)
np.sqrt(train_mse)

In [None]:
# Test RMSE of the PCR model, using the first 17 components of the test data.
test_components = x_test_scaled[:, :17]
y_pred = pcr_model.predict(test_components)
test_mse = mean_squared_error(y_test, y_pred)
np.sqrt(test_mse)

# PLS

In [None]:
from sklearn.cross_decomposition import PLSRegression,PLSSVD 

In [None]:
# Initial PLS model with 6 components, fitted on the training split.
pls_estimator = PLSRegression(n_components=6)
pls_model = pls_estimator.fit(x_train, y_train)

In [None]:
# List the estimator's parameters and their default values.
# This replaces the interactive `?PLSRegression` help lookup, which is
# IPython-only syntax and leftover exploration rather than part of the analysis.
PLSRegression().get_params()

In [None]:
# Training RMSE of the initial PLS model.
y_pred = pls_model.predict(x_train)
train_mse = mean_squared_error(y_train, y_pred)
np.sqrt(train_mse)

In [None]:
# 10-fold splitter for the PLS search (shuffled, fixed seed).
cv_10 = model_selection.KFold(n_splits=10, shuffle=True, random_state=10)

# Candidate values of n_components: 1 up to the number of predictors.
component_range = np.arange(1, x_train.shape[1] + 1)

# Cross-validated RMSE for each candidate. The scorer returns negated MSE
# values, hence the sign flip before taking the root.
RMSE = [
    np.sqrt(
        -1 * model_selection.cross_val_score(
            PLSRegression(n_components=k),
            x_train,
            y_train,
            cv=cv_10,
            scoring="neg_mean_squared_error",
        ).mean()
    )
    for k in component_range
]

plt.plot(component_range, np.array(RMSE), "-v", c="r");

One of the most important parameters in the PLS model is "n_components". Therefore, we try each possible value of this parameter and look for the one that gives the lowest cross-validated error.

It seems that the most suitable value in this graph is "12".

In [None]:
# Final PLS model with the chosen 12 components, evaluated on the test split.
final_pls = PLSRegression(n_components=12)
pls_model = final_pls.fit(x_train, y_train)
y_pred = pls_model.predict(x_test)
test_mse = mean_squared_error(y_test, y_pred)
np.sqrt(test_mse)