In [2]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
# `_california_housing` (private module) is kept only for cells that still reference it.
from sklearn.datasets import _california_housing, fetch_california_housing
from sklearn.feature_selection import RFE, SequentialFeatureSelector, VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, LinearRegression, LogisticRegression, Ridge
from sklearn.metrics import r2_score as r2
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

In [3]:
# Load the California housing data as a single DataFrame (feature columns plus the
# MedHouseVal target) through scikit-learn's public loader instead of reaching into
# the private `_california_housing` module.
data = fetch_california_housing(as_frame=True).frame


In [4]:
# Standardizer: centers each column (with_mean) and rescales it to unit variance (with_std).
scaler = StandardScaler(
    with_std=True,
    with_mean=True,
)

In [5]:
# Standardize every column of `data`; the result comes back as a plain array, so the
# column labels have to be restored afterwards.
# NOTE(review): the scaler is fitted on all rows, and on the target column as well,
# before the train/test split further down, so test-set statistics leak into training.
# Consider fitting it on the training rows only.
data = scaler.fit(data).transform(data)

In [6]:
# Put the standardized values back into a DataFrame. The column names are the ones the
# scaler recorded when it was fitted on the original frame, so the dataset does not
# have to be fetched a second time and the target name is not hard-coded here.
data = pd.DataFrame(data, columns=scaler.feature_names_in_)

In [7]:
# Number of missing values in each column.
data.isnull().sum()

MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64

In [8]:
# Separate the regression target from the predictor columns.
target_name = 'MedHouseVal'
y = data[target_name]
X = data.drop(columns=[target_name])

In [9]:
# Display the target series (pandas abbreviates the middle of long output).
y

0        2.129631
1        1.314156
2        1.258693
3        1.165100
4        1.172900
           ...   
20635   -1.115804
20636   -1.124470
20637   -0.992746
20638   -1.058608
20639   -1.017878
Name: MedHouseVal, Length: 20640, dtype: float64

In [None]:
# Hold out the last 40% of the rows for testing. With shuffle=False the split follows
# the row order of X; per the scikit-learn docs random_state only controls shuffling,
# so it has no effect here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    shuffle=False,
    random_state=1,
)

In [None]:
# Baseline: ordinary least squares, scored (R^2) on the held-out rows.
baseline = LinearRegression()
baseline.fit(X_train, y_train)
baseline.score(X_test, y_test)

In [None]:
# Root-mean-squared error of a freshly fitted linear model on the test rows.
test_predictions = LinearRegression().fit(X_train, y_train).predict(X_test)
squared_errors = (test_predictions - y_test) ** 2
np.sqrt(np.mean(squared_errors))

In [None]:
# Fit the linear baseline once and report several error measures on the test rows.
linreg = LinearRegression()
linreg.fit(X_train, y_train)
y_pred = linreg.predict(X_test)
residuals = y_test - y_pred

# R^2 (coefficient of determination)
print(r2(y_test, y_pred))
# largest absolute error
print(np.max(np.abs(residuals)))
# mean absolute error
print(np.mean(np.abs(residuals)))
# mean squared error
print(np.mean(residuals ** 2))

In [None]:
# Linear model trained with stochastic gradient descent (seeded so the fit is repeatable).
from sklearn.linear_model import SGDRegressor

sgd = SGDRegressor(random_state=1).fit(X_train, y_train)
y_pred = sgd.predict(X_test)

# learned intercept (bias term)
print(sgd.intercept_)
# learned coefficients (weights)
print(sgd.coef_)

In [None]:
# Tune the SGD hyperparameters with a 4-fold cross-validated grid search.
from sklearn.linear_model import SGDRegressor

parameters = {
    'alpha': [0.1, 0.01, 0.001],
    'max_iter': [1000, 2000, 5000],
    'penalty': ['l1', 'l2'],
}
sgd = SGDRegressor(random_state=1)
clf = GridSearchCV(estimator=sgd, param_grid=parameters, cv=4)
clf.fit(X_train, y_train)

best_params = clf.best_params_
print(best_params)

In [None]:
# Refit SGD with the hyperparameters chosen by the grid search.
# random_state=1 matches the estimator that was searched, so the score below is
# reproducible and belongs to the same configuration the search evaluated.
sgd_best = SGDRegressor(
    alpha=best_params['alpha'],
    max_iter=best_params['max_iter'],
    penalty=best_params['penalty'],
    random_state=1,
)
sgd_best.fit(X_train, y_train)

# R^2 on the held-out rows (score() of a regressor is R^2, not a classification accuracy)
print(sgd_best.score(X_test, y_test))

In [None]:
# Ridge regression: 4-fold grid search over the regularization strength and the iteration cap.
# NOTE(review): max_iter probably has no influence with Ridge's default solver on this
# dense data; confirm before keeping it in the grid.
from sklearn.linear_model import Ridge

parameters = {
    'alpha': [0.5, 0.1, 0.05, 0.01, 0.005, 0.001],
    'max_iter': [1000, 10000, 100000],
}
ridge = Ridge()
clf = GridSearchCV(estimator=ridge, param_grid=parameters, cv=4)
clf.fit(X_train, y_train)

best_params = clf.best_params_
print(best_params)

In [None]:
# Refit Ridge with the best grid-search parameters and report its test-set score.
ridge_best = Ridge(
    alpha=best_params['alpha'],
    max_iter=best_params['max_iter'],
).fit(X_train, y_train)

# score() of a regressor: R^2 on the held-out rows
print(ridge_best.score(X_test, y_test))

In [10]:
# Perform the split again with the same arguments, this time binding the feature
# parts to the lowercase names x_train / x_test.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    shuffle=False,
    random_state=1,
)

In [17]:
# Tune Lasso with a 6-fold cross-validated grid search over alpha and max_iter.
parameters = {
    'alpha': [0.5, 0.1, 0.05, 0.01, 0.005, 0.001],
    'max_iter': [1000, 10000, 100000],
}
lasso = Lasso()
clf1 = GridSearchCV(estimator=lasso, param_grid=parameters, cv=6).fit(x_train, y_train)
best_params1 = clf1.best_params_
print(best_params1)

# Refit with the winning parameters and print the R^2 on the held-out rows.
lasso_best = Lasso(
    fit_intercept=True,
    alpha=best_params1['alpha'],
    max_iter=best_params1['max_iter'],
)
lasso_best.fit(x_train, y_train)
print(lasso_best.score(x_test, y_test))

{'alpha': 0.05, 'max_iter': 1000}
0.4521947009273938


In [None]:
# Encoder instance; used further down to turn the 'Target' column into numeric codes.
oe = OrdinalEncoder()

In [None]:
# Location of the CSV. Set the DATASET_CSV environment variable to point at a copy
# elsewhere so the notebook is not tied to one machine; without it the original
# path is used.
DATASET_PATH = Path(
    os.environ.get(
        "DATASET_CSV",
        r"C:\Users\vvagh\OneDrive - Indian Institute of Science Education and Research Bhopal\Documents\IITM Stuff\diploma-LittleBeasty\dataset.csv",
    )
)
df = pd.read_csv(DATASET_PATH)
df

In [None]:
# Treat '?' entries as missing values.
df = df.replace(to_replace='?', value=np.nan)

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Show only the rows that contain at least one missing value.
rows_with_missing = df.isna().any(axis=1)
df.loc[rows_with_missing]

In [None]:
# Discard every row that still has a missing value.
df = df.dropna()

In [None]:
# Encode the class labels in 'Target' as numeric codes (flattened to 1-D for the classifier).
y = oe.fit_transform(df[['Target']]).ravel()

# Feature matrix: every column except the target.
X = df.drop('Target', axis=1)

# Column positions (within X) handled by the preprocessor. Columns not listed here are
# dropped, because ColumnTransformer defaults to remainder='drop'.
numeric_cols = [0, 1, 2, 3]   # mean-imputed, then standardized
categorical_cols = [4]        # ordinal-encoded

# ColumnTransformer applies its transformers side by side, not one after another.
# Listing the imputer and the scaler as separate entries therefore emitted columns 0
# and 1 twice (once imputed but unscaled, once scaled but not imputed). Chaining the two
# in a Pipeline yields exactly one imputed-and-scaled output column per numeric input
# column. Imputation now covers all numeric columns, which is a no-op where nothing is
# missing.
numeric_pipeline = Pipeline([
    ('mean_imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_pipeline, numeric_cols),
        ('ordinalEncoder', OrdinalEncoder(), categorical_cols),
    ]
)

# Preprocessing -> backward sequential feature selection (keep 2) -> logistic regression.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('sfs', SequentialFeatureSelector(LogisticRegression(), n_features_to_select=2, direction='backward')),
    ('logreg', LogisticRegression())
])

# Fit the whole pipeline.
pipeline.fit(X, y)

# Output of the preprocessing step alone, to inspect its shape.
preprocessed_X = pipeline.named_steps['preprocessor'].transform(X)
print("Shape of transformed X:", preprocessed_X.shape)

# Positions of the features kept by the selector, counted in the preprocessor's output.
selected_feature_indices = pipeline.named_steps['sfs'].get_support(indices=True)
print(f"Selected features: {selected_feature_indices}")

# The preprocessor emits one column per listed input column, numeric ones first and the
# categorical one last, so the positions translate to column names of X like this.
selected_feature_names = X.columns[numeric_cols + categorical_cols][selected_feature_indices]
print(f"Selected feature names: {list(selected_feature_names)}")


In [None]:
# Positions of the selected features, counted in the preprocessor's output (the input of the 'sfs' step).
selected_feature_indices

In [None]:
# selected_feature_indices counts columns of the preprocessor's output, whose layout can
# differ from X's (columns may be repeated, reordered or dropped), so it must not be used
# to index X.columns directly. Ask the fitted preprocessor for its output column names
# and pick the selected ones instead.
pipeline.named_steps['preprocessor'].get_feature_names_out()[selected_feature_indices]