********************************************************************************************************************************************************
                                                                    Q1
********************************************************************************************************************************************************

In [28]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing

# Load dataset
data = fetch_california_housing(as_frame=True)
df = data.frame

# Create 7 highly correlated features using 'MedInc' (Median Income)
for i in range(1, 8):
    df[f'Corr_Feature_{i}'] = df['MedInc'] + np.random.normal(0, 0.05, len(df))

# Target variable
df['Target'] = df['MedHouseVal']

# Check correlation of these 7 features + target
corr = df[[f'Corr_Feature_{i}' for i in range(1, 8)] + ['Target']].corr()
print(corr)


                Corr_Feature_1  Corr_Feature_2  Corr_Feature_3  \
Corr_Feature_1        1.000000        0.999310        0.999319   
Corr_Feature_2        0.999310        1.000000        0.999314   
Corr_Feature_3        0.999319        0.999314        1.000000   
Corr_Feature_4        0.999312        0.999305        0.999303   
Corr_Feature_5        0.999312        0.999307        0.999305   
Corr_Feature_6        0.999308        0.999309        0.999305   
Corr_Feature_7        0.999312        0.999308        0.999311   
Target                0.687865        0.687801        0.687759   

                Corr_Feature_4  Corr_Feature_5  Corr_Feature_6  \
Corr_Feature_1        0.999312        0.999312        0.999308   
Corr_Feature_2        0.999305        0.999307        0.999309   
Corr_Feature_3        0.999303        0.999305        0.999305   
Corr_Feature_4        1.000000        0.999306        0.999311   
Corr_Feature_5        0.999306        1.000000        0.999315   
Corr_Feat

In [29]:
df.columns

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude', 'MedHouseVal', 'Corr_Feature_1',
       'Corr_Feature_2', 'Corr_Feature_3', 'Corr_Feature_4', 'Corr_Feature_5',
       'Corr_Feature_6', 'Corr_Feature_7', 'Target'],
      dtype='object')

In [30]:
X = df[['Corr_Feature_1',
       'Corr_Feature_2', 'Corr_Feature_3', 'Corr_Feature_4', 'Corr_Feature_5',
       'Corr_Feature_6', 'Corr_Feature_7']]
y= df['Target']

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)




In [31]:
from sklearn.base import BaseEstimator, RegressorMixin 
from sklearn.metrics import r2_score, make_scorer

In [32]:
import warnings
warnings.filterwarnings('ignore')


class RidgeGD(BaseEstimator, RegressorMixin):
    def __init__(self, learning_rate=0.01, alpha=1.0, epochs=1000):
        self.learning_rate = learning_rate
        self.alpha = alpha
        self.epochs = epochs
    
    def fit(self, X, y):
        X = np.c_[np.ones(X.shape[0]), X]
        self.w_ = np.zeros(X.shape[1])
        m = len(y)
        
        for _ in range(self.epochs):
            y_pred = X @ self.w_
            grad = (-2/m) * (X.T @ (y - y_pred)) + 2 * self.alpha * self.w_
            self.w_ -= self.learning_rate * grad
        return self

    def predict(self, X):
        X = np.c_[np.ones(X.shape[0]), X]
        return X @ self.w_


param_grid = {
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1, 10],
    'alpha': [1e-15, 1e-10, 1e-5, 1e-3, 0, 1, 10, 20]
}

from sklearn.model_selection import GridSearchCV

ridge_gd = RidgeGD(epochs=500)
grid = GridSearchCV(estimator=ridge_gd,
                    param_grid=param_grid,
                    scoring=make_scorer(r2_score),
                    cv=3,
                    verbose=1)

grid.fit(X_train, y_train)


Fitting 3 folds for each of 48 candidates, totalling 144 fits


0,1,2
,estimator,RidgeGD(epochs=500)
,param_grid,"{'alpha': [1e-15, 1e-10, ...], 'learning_rate': [0.0001, 0.001, ...]}"
,scoring,make_scorer(r...hod='predict')
,n_jobs,
,refit,True
,cv,3
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,learning_rate,0.001
,alpha,1e-15
,epochs,500.0


In [33]:
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
best_r2 = r2_score(y_test, y_pred)


print(grid.best_params_)
print(f"Best R² Score on Test Data: {best_r2:.4f}")

{'alpha': 1e-15, 'learning_rate': 0.001}
Best R² Score on Test Data: 0.4530


********************************************************************************************************************************************************
                                                                    Q2
********************************************************************************************************************************************************

In [34]:
import pandas as pd
import numpy as np
df = pd.read_csv('Hitters.csv')
df.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
0,293,66,1,30,29,14,1,293,66,1,30,29,14,A,E,446,33,20,,A
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N


In [35]:
df[df.isnull().any(axis=1)]

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
0,293,66,1,30,29,14,1,293,66,1,30,29,14,A,E,446,33,20,,A
15,183,39,3,20,15,11,3,201,42,3,20,16,11,A,W,118,0,0,,A
18,407,104,6,57,43,65,12,5233,1478,100,643,658,653,A,W,912,88,9,,A
22,22,10,1,4,2,1,6,84,26,2,9,9,3,A,W,812,84,11,,A
30,313,84,9,42,30,39,17,6890,1833,224,1033,864,1087,A,W,127,221,7,,A
32,517,141,27,70,87,52,9,3571,994,215,545,652,337,N,W,1378,102,8,,N
36,161,36,0,19,10,17,4,1053,244,3,156,86,107,A,E,70,149,12,,A
38,346,98,5,31,53,30,16,5913,1615,235,784,901,560,A,E,0,0,0,,A
39,241,61,1,34,12,14,1,241,61,1,34,12,14,N,W,166,172,10,,N
41,216,54,0,21,18,15,18,7318,1926,46,796,627,483,N,W,103,84,5,,N


In [36]:
df.columns

Index(['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'CAtBat',
       'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks', 'League', 'Division',
       'PutOuts', 'Assists', 'Errors', 'Salary', 'NewLeague'],
      dtype='object')

In [37]:
df = df.dropna().reset_index(drop = True)

# Select only numeric columns 
numeric_cols = df.select_dtypes(include=[np.number]).columns

Q1 = df[numeric_cols].quantile(0.25)
Q3 = df[numeric_cols].quantile(0.75)
IQR = Q3 - Q1

df = df[~((df[numeric_cols] < (Q1 - 1.5 * IQR)) | (df[numeric_cols] > (Q3 + 1.5 * IQR))).any(axis=1)].reset_index(drop=True)


In [38]:
from sklearn.preprocessing import LabelEncoder
le1 = LabelEncoder()
le2 = LabelEncoder()
le3 = LabelEncoder()


y = df['Salary']
X = df.drop(columns=['Salary'])

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

X_train['League'] = le1.fit_transform(X_train['League'])
X_test['League'] = le1.transform(X_test['League'])

X_train['Division'] = le1.fit_transform(X_train['Division'])
X_test['Division'] = le1.transform(X_test['Division'])

X_train['NewLeague'] = le1.fit_transform(X_train['NewLeague'])
X_test['NewLeague'] = le1.transform(X_test['NewLeague'])

X_train.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,NewLeague
12,413,92,16,72,48,65,1,413,92,16,72,48,65,1,0,280,9,5,1
41,424,119,6,57,46,13,9,3651,1046,32,461,301,112,0,0,224,286,8,1
168,633,210,6,91,56,59,6,3070,872,19,420,230,274,1,1,367,432,16,1
190,497,127,7,65,48,37,5,2703,806,32,379,311,138,1,0,325,9,3,1
185,279,69,4,35,31,32,4,1359,355,31,180,148,158,1,0,133,173,9,1


In [39]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train

array([[ 0.27969297, -0.17880391,  0.8753081 , ..., -0.76241148,
        -0.53007292,  0.96362411],
       [ 0.35906313,  0.452116  , -0.41190969, ...,  1.2405657 ,
        -0.08095256,  0.96362411],
       [ 1.86709623,  2.57854978, -0.41190969, ...,  2.29628653,
         1.11670174,  0.96362411],
       ...,
       [ 1.56404652,  1.34007736,  0.8753081 , ...,  2.35413424,
         2.46406284,  0.96362411],
       [-1.22112465, -1.32380672, -0.92679681, ..., -0.78410437,
        -1.12890007,  0.96362411],
       [ 1.34036697,  1.43354697,  2.16252589, ..., -0.76241148,
        -0.08095256,  0.96362411]])

In [40]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error

# Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred_lin = lin_reg.predict(X_test)

# Ridge Regression (with alpha = 0.5748)
ridge_reg = Ridge(alpha=0.5748)
ridge_reg.fit(X_train, y_train)
y_pred_ridge = ridge_reg.predict(X_test)

# Lasso Regression (with alpha = 0.5748)
lasso_reg = Lasso(alpha=0.5748)
lasso_reg.fit(X_train, y_train)
y_pred_lasso = lasso_reg.predict(X_test)

In [41]:
print("Linear Regression")
print(f"R² Score: {r2_score(y_test, y_pred_lin):.4f}")
print(f"MSE: {mean_squared_error(y_test, y_pred_lin):.4f}\n")

print("Ridge Regression (alpha=0.5748)")
print(f"R² Score: {r2_score(y_test, y_pred_ridge):.4f}")
print(f"MSE: {mean_squared_error(y_test, y_pred_ridge):.4f}\n")

print("Lasso Regression (alpha=0.5748)")
print(f"R² Score: {r2_score(y_test, y_pred_lasso):.4f}")
print(f"MSE: {mean_squared_error(y_test, y_pred_lasso):.4f}")

Linear Regression
R² Score: 0.3763
MSE: 46540.0654

Ridge Regression (alpha=0.5748)
R² Score: 0.4017
MSE: 44644.3990

Lasso Regression (alpha=0.5748)
R² Score: 0.3852
MSE: 45874.2143


HENCE LASSO IS BEST

********************************************************************************************************************************************************
                                                                    Q3
********************************************************************************************************************************************************

In [42]:
from sklearn.datasets import fetch_california_housing  
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.metrics import r2_score, mean_squared_error

data = fetch_california_housing(as_frame=True)
X = data.data
y = data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [43]:
from sklearn.linear_model import RidgeCV
ridgecv = RidgeCV(cv =5)
ridgecv.fit(X_train,y_train)
ridgecv.alpha_

np.float64(0.1)

In [44]:
y_pred = ridgecv.predict(X_train)
y_pred

array([2.13765309, 1.76384789, 2.75116046, ..., 2.03898058, 2.84130169,
       2.2791261 ])

In [45]:
from sklearn.linear_model import LassoCV
lassocv = LassoCV(cv =5)
lassocv.fit(X_train,y_train)
lassocv.alpha_

np.float64(0.032090306899267056)

In [46]:
y_pred = lassocv.predict(X_train)
y_pred

array([2.29220522, 1.70360252, 2.69048009, ..., 1.92769919, 2.77692569,
       2.08347957])

********************************************************************************************************************************************************
                                                                    Q4
********************************************************************************************************************************************************

In [47]:
from sklearn.datasets import load_iris
import pandas as pd

iris = load_iris()

df = pd.DataFrame(iris.data, columns=iris.feature_names)

df['target'] = iris.target

print(df.head(5))


   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  


In [48]:
df.columns

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)', 'target'],
      dtype='object')

In [49]:
X = df[['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)']]
y= df['target']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [50]:
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression(multi_class = 'ovr')
logistic.fit(X_train,y_train)
y_pred = logistic.predict(X_test)
y_pred

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 2, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 2, 1, 0,
       0])