# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

# Load the Online News Popularity dataset from its GitHub mirror.
df = pd.read_csv("https://raw.githubusercontent.com/HasanRoknabady/dataset-popularity-/main/OnlineNewsPopularity.csv")

# NOTE(review): the upstream UCI release of this CSV has headers with a leading
# space (e.g. " shares"), which would make df["shares"] raise KeyError.
# Stripping is a no-op if this mirror already has clean headers -- confirm.
df.columns = df.columns.str.strip()

# Drop the article URL: it is a free-text identifier, not a numeric feature.
# Reassign instead of inplace=True so the statement has no hidden mutation.
df = df.drop(columns=["url"])

# Binarize the target: 1 when an article got more than 1400 shares, else 0.
# (1400 is presumably the dataset's median share count -- verify.)
df["shares"] = np.where(df["shares"] > 1400, 1, 0)

# Separate features from target and hold out 20% of the rows for testing.
# random_state pins the shuffle so the split is reproducible.
X = df.drop(columns=["shares"])
y = df["shares"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Numerical pipeline: standardize every column, then add all degree-2 terms
# (squares and pairwise products). The expanded feature count grows
# quadratically with the number of input columns.
numerical_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    # include_bias=False: the default adds a constant column whose F-statistic
    # is undefined (0/0) in the selector below and which duplicates the
    # intercept that Ridge already fits.
    ('poly', PolynomialFeatures(degree=2, include_bias=False))
])

# Column transformer: route every feature column through the numerical pipeline.
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, X_train.columns)
])

# Full model: preprocess -> keep the k best expanded features by univariate
# F-test -> ridge regression. The initial k=5 is only a placeholder; the
# searches tune 'selector__k' and 'ridge__alpha' through these step names.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', SelectKBest(f_regression, k=5)),
    ('ridge', Ridge())
])

# Hyper-parameter candidates, addressed as <pipeline step>__<parameter>.
# 3 values of k x 3 values of alpha = 9 combinations in total.
param_grid = {
    'selector__k': [5, 10, 15],
    'ridge__alpha': [0.1, 1, 10]
}

# Exhaustive search: evaluates all 9 combinations with 5-fold cross-validation.
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5)

# Fit on the training data only; the test split stays untouched.
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated score
# (R^2, the default scorer for a Ridge pipeline).
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Randomized search over the same candidates.
# n_iter=5: the default (10) exceeds the 9 available combinations, which makes
# scikit-learn warn and silently fall back to trying all of them -- i.e. an
# exact repeat of the grid search above. Sampling 5 keeps this a real subset.
# random_state pins which combinations are drawn so reruns are reproducible.
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_grid,
    n_iter=5,
    cv=5,
    random_state=42,
)

# Fit on the training data only.
random_search.fit(X_train, y_train)

# Report the best sampled combination and its mean cross-validated score.
print("Best parameters:", random_search.best_params_)
print("Best score:", random_search.best_score_)

# Pull the fitted steps out of the pipeline that GridSearchCV refit on the
# whole training set with the best parameters.
best_pipeline = grid_search.best_estimator_
fitted_preprocessor = best_pipeline.named_steps['preprocessor']
selector = best_pipeline.named_steps['selector']

# The selector saw the polynomial-expanded matrix, so its boolean mask has one
# entry per *expanded* feature. It must be applied to the preprocessor's output
# names; indexing X_train.columns with it fails on the length mismatch.
selected_features = selector.get_support()
expanded_names = pd.Index(fitted_preprocessor.get_feature_names_out())
feature_names = expanded_names[selected_features]
print("Selected features:", feature_names)

# Build the final design matrices with the already-fitted steps. Only
# transform() is used: refitting the selector here (and on raw, unpreprocessed
# data) would mutate best_estimator_ and pick different features than the ones
# the search evaluated.
X_train_selected = selector.transform(fitted_preprocessor.transform(X_train))
X_test_selected = selector.transform(fitted_preprocessor.transform(X_test))

# Fit a standalone ridge model with the tuned regularization strength.
model = Ridge(alpha=grid_search.best_params_['ridge__alpha'])
model.fit(X_train_selected, y_train)

# Ridge.score reports R^2; compare train vs. held-out test to spot overfitting.
print("Train score:", model.score(X_train_selected, y_train))
print("Test score:", model.score(X_test_selected, y_test))
