## Analysis of Hitters Dataset

### 1. Pre-process the data

In [None]:
import pandas as pd

# Read the Hitters CSV into a DataFrame; the path is hard-coded for this environment.
data = pd.read_csv(filepath_or_buffer='/mnt/data/Hitters.csv')
# Preview the first rows (shown as the cell output when run in a notebook).
data.head()

#### Handle missing values and encode categorical columns

In [None]:

from sklearn.preprocessing import LabelEncoder

# Salary is the value to be predicted, so rows where it is missing are
# removed from `data` in place before anything else happens.
data.dropna(axis='index', subset=['Salary'], inplace=True)

# The columns that are label-encoded below.
categorical_columns = ['League', 'Division', 'NewLeague']

# One fitted LabelEncoder per column, keyed by column name.
label_encoders = {}

for col in categorical_columns:
    # Register the encoder, then overwrite the column with its integer codes.
    label_encoders[col] = le = LabelEncoder()
    data[col] = le.fit_transform(data[col])

# Preview the frame after cleaning and encoding.
data.head()


### 2. Separate input and output features, split into training and test sets, and scale the inputs

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Target vector: the Salary column. Feature matrix: every other column.
y = data['Salary']
X = data.drop(columns='Salary')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training split only, then apply
# the same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Peek at the first five scaled training rows next to their targets.
(X_train_scaled[:5], y_train.head())


### 3. Fit Linear, Ridge, and LASSO regression models and compare their test MSE

In [None]:

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error

# Build each estimator and fit it on the scaled training data (fit returns
# the estimator itself). Ridge and Lasso share the same penalty strength.
# NOTE(review): where alpha=0.5748 comes from is not shown here — confirm.
linear_reg = LinearRegression().fit(X_train_scaled, y_train)
ridge_reg = Ridge(alpha=0.5748).fit(X_train_scaled, y_train)
lasso_reg = Lasso(alpha=0.5748).fit(X_train_scaled, y_train)

# Predict on the scaled held-out features, one array per model.
y_pred_linear, y_pred_ridge, y_pred_lasso = (
    reg.predict(X_test_scaled)
    for reg in (linear_reg, ridge_reg, lasso_reg)
)

# Mean squared error of each model against the held-out targets.
mse_linear, mse_ridge, mse_lasso = (
    mean_squared_error(y_test, pred)
    for pred in (y_pred_linear, y_pred_ridge, y_pred_lasso)
)

# Show the three errors side by side: (linear, ridge, lasso).
(mse_linear, mse_ridge, mse_lasso)


### 4. Conclusion

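From the test-set MSE values computed above, Ridge Regression performs the best of the three models. Note that this comparison is based on a single train/test split and a single regularization strength (alpha = 0.5748) for Ridge and LASSO.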