In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Set the seed for reproducibility
np.random.seed(0)

# Creating the DataFrame
data = {
    'X1': np.random.randint(1, 100, 12),
    'X2': np.random.randint(1, 100, 12),
}

df = pd.DataFrame(data)

# Assuming a linear relation: Y = 2*X1 + 3*X2 + some_noise
df['Y'] = round(2*df['X1'] + 3*df['X2'] + np.random.normal(0, 10, 12))

# Add categorical 'Color' feature
colors = ['Red', 'Blue', 'Green', 'Yellow']
df['Color'] = np.random.choice(colors, 12)

# Add categorical 'Shape' feature
shapes = ['Circle', 'Square', 'Triangle']
df['Shape'] = np.random.choice(shapes, 12)

# Display the DataFrame
df

Unnamed: 0,X1,X2,Y,Color,Shape
0,45,89,369.0,Green,Circle
1,48,13,122.0,Red,Square
2,65,59,311.0,Yellow,Square
3,68,66,333.0,Yellow,Triangle
4,68,40,270.0,Green,Circle
5,10,88,283.0,Yellow,Circle
6,84,47,329.0,Green,Square
7,22,89,313.0,Yellow,Circle
8,37,82,326.0,Red,Square
9,88,38,306.0,Green,Triangle


In [5]:
# One-hot encode the categorical features
df = pd.get_dummies(df, columns=['Color', 'Shape'], drop_first=True)

# Split data into features (X) and target (y)
X = df.drop('Y', axis=1)
y = df['Y']

In [6]:
# Splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit a linear regression model
model = LinearRegression()
model.fit(X, y)

# Predicting on the test data
y_pred = model.predict(X_test)

# Print the coefficients
print("Intercept:", model.intercept_)
for feature, coef in zip(X.columns, model.coef_):
    print(f"{feature}: {coef}")

Intercept: 2.3413119081529885
X1: 2.0638598444746816
X2: 3.1168872224208095
Color_Red: -12.223009439396433
Color_Yellow: -15.155723559017392
Shape_Square: 2.1997398728850124
Shape_Triangle: 7.2210912211656355


In [7]:
# Calculate metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared: {r2:.2f}")

Mean Absolute Error (MAE): 3.39
Mean Squared Error (MSE): 12.22
Root Mean Squared Error (RMSE): 3.50
R-squared: 1.00


In [8]:
new_data = {
    'X1': [45, 52],
    'X2': [65, 74],
    'Color': ['Red', 'Yellow'],
    'Shape': ['Triangle', 'Square']
}
new_df = pd.DataFrame(new_data)
new_df

Unnamed: 0,X1,X2,Color,Shape
0,45,65,Red,Triangle
1,52,74,Yellow,Square


In [9]:
new_df_encoded = pd.get_dummies(new_df, columns=['Color', 'Shape'])

# for col in X.columns:
#     if col not in new_df_encoded.columns:
#         new_df_encoded[col] = 0
new_df_encoded

Unnamed: 0,X1,X2,Color_Red,Color_Yellow,Shape_Square,Shape_Triangle
0,45,65,1,0,0,1
1,52,74,0,1,1,0


In [10]:
df.head(3)

Unnamed: 0,X1,X2,Y,Color_Red,Color_Yellow,Shape_Square,Shape_Triangle
0,45,89,369.0,0,0,0,0
1,48,13,122.0,1,0,1,0
2,65,59,311.0,0,1,1,0


In [11]:
new_y_pred = model.predict(new_df_encoded)
print(new_y_pred)

[292.81075615 327.35569459]


# Regularization

In [12]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Sample data creation and preprocessing are omitted for brevity

# Initialize models
lr = LinearRegression()
ridge = Ridge(alpha=1.0)
lasso = Lasso(alpha=1.0)
elastic = ElasticNet(alpha=1.0, l1_ratio=0.5)

models = [lr, ridge, lasso, elastic]
model_names = ["Linear", "Ridge", "Lasso", "ElasticNet"]
metrics = {"Model": [], "MAE": [], "MSE": [], "R2": []}

# Fit the models and get the metrics
for model, name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    metrics["Model"].append(name)
    metrics["MAE"].append(mae)
    metrics["MSE"].append(mse)
    metrics["R2"].append(r2)

# Convert metrics to DataFrame for display
results = pd.DataFrame(metrics)
print(results)


        Model        MAE         MSE        R2
0      Linear  20.905271  858.512780  0.754258
1       Ridge  13.095948  215.921309  0.938194
2       Lasso  11.291777  158.561893  0.954613
3  ElasticNet  11.305200  152.701797  0.956290


# Summary Table using Statsmodels

In [13]:
import statsmodels.api as sm

# # Add a constant column for the intercept
# X = sm.add_constant(df[['X1', 'X2', 'Category_1', 'Category_2']])

# Fit the OLS model
model_SM = sm.OLS(df['Y'], X).fit()

# Get the summary
print(model_SM.summary())

                                 OLS Regression Results                                
Dep. Variable:                      Y   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              0.999
Method:                 Least Squares   F-statistic:                              3118.
Date:                Wed, 01 Nov 2023   Prob (F-statistic):                    3.30e-10
Time:                        02:52:10   Log-Likelihood:                         -37.552
No. Observations:                  12   AIC:                                      87.10
Df Residuals:                       6   BIC:                                      90.01
Df Model:                           6                                                  
Covariance Type:            nonrobust                                                  
                     coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------

In [14]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import train_test_split, GridSearchCV

# Set the seed for reproducibility
np.random.seed(0)

# Creating the DataFrame
data = {
    'X1': np.random.randint(1, 100, 12),
    'X2': np.random.randint(1, 100, 12),
}
df = pd.DataFrame(data)

# Assuming a linear relation: Y = 2*X1 + 3*X2 + some_noise
df['Y'] = round(2*df['X1'] + 3*df['X2'] + np.random.normal(0, 10, 12))

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df[['X1', 'X2']], df['Y'], test_size=0.2, random_state=0)

# Creating Ridge and Lasso models with hyperparameters to tune and cross-validation
models_params = {
    'Ridge': {
        'model': Ridge(),
        'params': {
            'alpha': [0.1, 1, 10, 100]  # Regularization strength values to be tuned
        }
    },
    'Lasso': {
        'model': Lasso(),
        'params': {
            'alpha': [0.1, 1, 10, 100]  # Regularization strength values to be tuned
        }
    }
}

# Applying GridSearchCV for each model
for model_name, mp in models_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5)  # 5-Fold CV
    clf.fit(X_train, y_train)
    print(f"Best parameters for {model_name}: {clf.best_params_}")
    print(f"Train score: {clf.score(X_train, y_train)}")
    print(f"Test score: {clf.score(X_test, y_test)}\n")



Best parameters for Ridge: {'alpha': 0.1}
Train score: 0.9904322871963563
Test score: 0.9573904290771045

Best parameters for Lasso: {'alpha': 0.1}
Train score: 0.9904322691116606
Test score: 0.9574407088260025

