In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# PCR Model

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
# Keep the raw table in `hit`; `df` is a separate frame with incomplete rows removed.
hit = pd.read_csv("/kaggle/input/hitters/Hitters.csv")
df = hit.dropna()
df.head()

In [None]:
# Column dtypes and non-null counts of the frame left after dropping incomplete rows.
df.info()

In [None]:
# Summary statistics, transposed so that each row describes one column of df.
df.describe().transpose()

The data set contains statistics for Major League Baseball players. The target is each player's salary, which we model from variables such as errors, assists, years of experience, and batting performance.
We have to convert the categorical variables to dummy-variable format.

In [None]:
# One-hot encode the three categorical columns.
dms = pd.get_dummies(df.loc[:, ['League', 'Division', 'NewLeague']])
dms.head()

In [None]:
# Response variable for the regression models fitted further down.
y = df.loc[:, "Salary"]

In [None]:
# Numeric predictors only: drop the target and the raw categorical columns, then cast to float.
X_ = (
    df
    .drop(columns=["Salary", "League", "Division", "NewLeague"])
    .astype("float64")
)

In [None]:
# Preview the numeric predictors (target and categorical columns already removed).
X_.head()

We separated the dependent variable (`Salary`) and removed the original categorical columns from the feature set.

In [None]:
# Append one indicator column per categorical variable to the numeric predictors.
X = pd.concat(
    objs=[X_, dms.loc[:, ["League_N", "Division_W", "NewLeague_N"]]],
    axis="columns",
)
X.head()

The variables are now ready for modeling.

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
# Hold out a quarter of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Independent copy of the full cleaned frame (not just the training rows).
training = df.copy()

# Report the shape of every piece, in the same order as before.
for label, part in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
    ("training", training),
):
    print(label, part.shape)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale 
# PCA with default settings (no limit on the number of components is specified here).
pca = PCA()

In [None]:
# Standardise the training predictors, then fit the PCA on them and keep the projected data.
X_train_scaled = scale(X_train)
X_reduced_train = pca.fit_transform(X_train_scaled)

In [None]:
# First training row expressed in principal-component coordinates (kept 2-D).
X_reduced_train[:1]

In [None]:
# Cumulative percentage of variance explained by the first five components.
variance_pct = np.round(pca.explained_variance_ratio_, decimals=4) * 100
np.cumsum(variance_pct)[:5]

In [None]:
# Ordinary least squares on all principal components of the training data.
lm = LinearRegression()
lm.fit(X_reduced_train, y_train)
pcr_model = lm  # fit() returns the estimator itself, so both names refer to one model
pcr_model.intercept_

In [None]:
# Fitted regression coefficients; the model was trained on the columns of X_reduced_train.
pcr_model.coef_

## Prediction

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
# In-sample predictions; show the first five.
y_pred = pcr_model.predict(X_reduced_train)
y_pred[:5]

In [None]:
# Training RMSE: square root of the mean squared error on the training rows.
train_mse = mean_squared_error(y_train, y_pred)
np.sqrt(train_mse)

In [None]:
# Mean salary over the cleaned data, as a scale reference for the RMSE values.
df.loc[:, "Salary"].mean()

In [None]:
# Coefficient of determination on the training rows.
r2_score(y_true=y_train, y_pred=y_pred)

In [None]:
# Project the held-out rows with the transformations learned on the TRAINING
# data. Fitting a separate scaler/PCA on X_test would give a different set of
# component axes (other directions, signs and ordering), so the coefficients
# the regression learned on the training components would not apply to them.
from sklearn.preprocessing import StandardScaler

# `scale(X_train)` standardised the training data without keeping its
# mean/std; StandardScaler recomputes those training statistics so they can be
# applied unchanged to the test rows.
train_scaler = StandardScaler().fit(X_train)
X_reduced_test = pca.transform(train_scaler.transform(X_test))

# Test RMSE of the model that uses every principal component.
y_pred = pcr_model.predict(X_reduced_test)
np.sqrt(mean_squared_error(y_test, y_pred))

## Model Tuning

In [None]:
# Refit on the first ten principal components only and report the test RMSE.
first_ten = slice(0, 10)

lm = LinearRegression()
lm.fit(X_reduced_train[:, first_ten], y_train)
pcr_model = lm

y_pred = pcr_model.predict(X_reduced_test[:, first_ten])
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(test_rmse)

In [None]:
from sklearn import model_selection
# Shuffled 10-fold splitter with a fixed seed, plus a fresh linear model to evaluate with it.
cv_10 = model_selection.KFold(
    n_splits=10,
    shuffle=True,
    random_state=1,
)
lm = LinearRegression()

In [None]:
# Empty container for the cross-validated RMSE scores.
RMSE = list()

In [None]:
# Cross-validated RMSE when regressing on the first k principal components,
# for k = 1 .. (number of columns in X_reduced_train).
#
# The list is rebuilt from scratch each time, so re-running this cell cannot
# append a second set of scores to RMSE.
n_components_total = X_reduced_train.shape[1]

# Series.ravel() is deprecated in recent pandas; to_numpy() yields the same 1-D array.
y_train_values = y_train.to_numpy()

RMSE = [
    np.sqrt(
        # The scorer returns negated MSE, so flip the sign before the square root.
        -model_selection.cross_val_score(
            lm,
            X_reduced_train[:, :k],
            y_train_values,
            cv=cv_10,
            scoring='neg_mean_squared_error',
        ).mean()
    )
    for k in range(1, n_components_total + 1)
]

In [None]:
import matplotlib.pyplot as plt
# RMSE[0] is the score for ONE component (the tuning loop starts at 1), so the
# x values must start at 1; plotting against the bare list index would shift
# every point one component to the left of what the axis label says.
component_counts = np.arange(1, len(RMSE) + 1)

plt.figure(figsize=(12,6))
plt.plot(component_counts, RMSE, '-v')
plt.xlabel('Components')
plt.ylabel('RMSE')
plt.title('PCR Model Tuning for Salary Estimation Model');

In [None]:
# Regress on the first six principal components and report the training RMSE.
n_keep = 6

lm = LinearRegression()
lm.fit(X_reduced_train[:, :n_keep], y_train)
pcr_model = lm

y_pred = pcr_model.predict(X_reduced_train[:, :n_keep])
train_mse = mean_squared_error(y_train, y_pred)
print(np.sqrt(train_mse))

In [None]:
# Test RMSE using the first six components of the projected test data.
y_pred = pcr_model.predict(X_reduced_test[:, :6])
test_mse = mean_squared_error(y_test, y_pred)
print(np.sqrt(test_mse))