<a href="https://colab.research.google.com/github/prabur90/machinelearning/blob/main/Regularization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
!pip install kaggle



In [4]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"prabur90","key":"3dc4462a101e67bc5d4f65e8495c49c1"}'}

In [5]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle competitions download -c house-prices-advanced-regression-techniques

!unzip house-prices-advanced-regression-techniques.zip

Downloading house-prices-advanced-regression-techniques.zip to /content
  0% 0.00/199k [00:00<?, ?B/s]
100% 199k/199k [00:00<00:00, 65.9MB/s]
Archive:  house-prices-advanced-regression-techniques.zip
  inflating: data_description.txt    
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [6]:
# Load the dataset
data = pd.read_csv('train.csv')

# Select the relevant columns
data = data[['GrLivArea', 'SalePrice']]

# Drop any rows with missing values
data = data.dropna()

# Define the feature and target variable
X = data[['GrLivArea']]
y = data['SalePrice']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the linear regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the training and test sets
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Calculate metrics
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f'Training MSE: {train_mse}')
print(f'Test MSE: {test_mse}')
print(f'Training R²: {train_r2}')
print(f'Test R²: {test_r2}')

# Interpretation
if train_mse > test_mse:
    print("The model is underfitting.")
elif train_mse < test_mse:
    print("The model is overfitting.")
else:
    print("The model is fitting well.")


Training MSE: 3078116053.85812
Test MSE: 3418946311.180807
Training R²: 0.483931586143035
Test R²: 0.5542632452871117
The model is overfitting.


In [7]:
from sklearn.linear_model import Ridge

# Create the Ridge regression model with a chosen alpha value
ridge_model = Ridge(alpha=1.0)

# Train the model
ridge_model.fit(X_train, y_train)

# Make predictions on the training and test sets
y_train_pred = ridge_model.predict(X_train)
y_test_pred = ridge_model.predict(X_test)

# Calculate metrics
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f'Training MSE: {train_mse}')
print(f'Test MSE: {test_mse}')
print(f'Training R²: {train_r2}')
print(f'Test R²: {test_r2}')


Training MSE: 3078116053.85812
Test MSE: 3418946315.2437005
Training R²: 0.483931586143035
Test R²: 0.5542632447574218


In [8]:
from sklearn.linear_model import Lasso

# Create the Lasso regression model with a chosen alpha value
lasso_model = Lasso(alpha=0.1)

# Train the model
lasso_model.fit(X_train, y_train)

# Make predictions on the training and test sets
y_train_pred = lasso_model.predict(X_train)
y_test_pred = lasso_model.predict(X_test)

# Calculate metrics
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f'Training MSE: {train_mse}')
print(f'Test MSE: {test_mse}')
print(f'Training R²: {train_r2}')
print(f'Test R²: {test_r2}')


Training MSE: 3078116053.8581204
Test MSE: 3418946315.811022
Training R²: 0.4839315861430349
Test R²: 0.5542632446834588


In [10]:
from sklearn.model_selection import cross_val_score

# Perform 10-fold cross-validation for the Ridge model
ridge_model = Ridge(alpha=1.0)
cv_scores = cross_val_score(ridge_model, X, y, cv=10, scoring='neg_mean_squared_error')

print(f'Cross-Validation MSE: {-cv_scores.mean()}')
print(f'Cross-Validation Std Dev: {cv_scores.std()}')

Cross-Validation MSE: 3162284827.7104015
Cross-Validation Std Dev: 999756259.2975249
