## `Online News`

### Importing libraries

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score
    )
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestRegressor


import pandas as pd
import numpy as np
import time

### Preprocessing

In [2]:
# Load the bike sharing dataset
online_news = pd.read_csv('../datasets/original/OnlineNewsPopularity.csv')

# Drop url since it functions as an ID
online_news = online_news.drop(['url'], axis=1)

### Splitting the dataset and normalization

In [3]:
# Split the dataset into features and target variable
X = online_news.drop('shares', axis=1)
y = online_news['shares']

# Split the dataset into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Normalize the data
scaler = MinMaxScaler()


# Fit transform all variables
X_train_scaled = scaler.fit_transform(X_train) 
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = scaler.transform(X_test)
X_test =  pd.DataFrame(X_test_scaled, columns=X_test.columns)

X_train
 

Unnamed: 0,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,...,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity
0,0.946058,0.333333,0.030622,0.000729,0.00096,0.000827,0.121711,0.034483,0.062500,0.010989,...,0.250000,0.250000,0.25,0.785417,0.5000,0.900000,0.000000,0.500000,1.000000,0.000000
1,0.917012,0.380952,0.035343,0.000913,0.00096,0.001151,0.023026,0.000000,0.007812,0.000000,...,0.339526,0.100000,1.00,0.613580,0.2000,0.844444,0.850000,0.545833,0.700000,0.091667
2,0.825726,0.333333,0.022258,0.001079,0.00096,0.001292,0.013158,0.025862,0.000000,0.010989,...,0.281250,0.100000,0.40,0.900000,0.8750,0.950000,0.000000,0.500000,1.000000,0.000000
3,0.160443,0.380952,0.127209,0.000671,0.00096,0.000978,0.042763,0.017241,0.007812,0.000000,...,0.297987,0.100000,0.60,0.782372,0.7000,0.875000,0.175000,0.500000,0.650000,0.000000
4,0.651452,0.476190,0.035343,0.000810,0.00096,0.001194,0.023026,0.051724,0.000000,0.010989,...,0.292208,0.100000,0.60,0.729630,0.5000,0.844444,0.000000,0.500000,1.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29728,0.840941,0.285714,0.016592,0.001032,0.00096,0.001322,0.009868,0.000000,0.000000,0.010989,...,0.352778,0.166667,0.60,1.000000,1.0000,1.000000,0.000000,0.500000,1.000000,0.000000
29729,0.702628,0.428571,0.029408,0.000891,0.00096,0.001102,0.046053,0.017241,0.078125,0.000000,...,0.510606,0.250000,1.00,0.852083,0.8125,0.950000,0.694444,0.687500,0.388889,0.375000
29730,0.037344,0.428571,0.023202,0.000951,0.00096,0.001195,0.016447,0.025862,0.015625,0.000000,...,0.357143,0.214286,0.50,0.895767,0.6000,0.950000,0.150000,0.475000,0.700000,0.050000
29731,0.979253,0.238095,0.012141,0.001234,0.00096,0.001420,0.013158,0.025862,0.000000,0.021978,...,0.386667,0.100000,0.50,0.800000,0.8000,0.800000,0.500000,0.625000,0.000000,0.250000


## Tests

## Fining optimal K

In [4]:
# Optimal K with cross-validation cross_val_score
k_values = list(range(1, 30))
cross_val_scores = []

for k in k_values:
    knn = KNeighborsRegressor(n_neighbors=k, weights='distance')
    cross_val_scores.append(cross_val_score(knn, X_train, y_train, cv=10).mean())
    
# optimal_k = k_values[np.argmax(cross_val_scores)]


# Get the index of the top 3 scores
top_3_scores = np.argsort(cross_val_scores)[::-1][:3]

# Get the k values for the top 3 scores
top_3_k_values = [k_values[i] for i in top_3_scores]

top_3_k_values

[29, 28, 27]

## Attributes importance

In [5]:
# Train a Random Forest model to get which Features are more important
model = RandomForestRegressor(max_leaf_nodes=2,
                        max_features=5,
                        max_depth=5,
                        random_state=42)

# Fit the model to the data
model.fit(X_train, y_train)

# Get the most important Features
most_important_attributes = pd.DataFrame(
                            model.feature_importances_,
                            index = X_train.columns,
                            columns=['importance']
                        ).sort_values('importance', ascending=False)


most_important_attributes


Unnamed: 0,importance
self_reference_avg_sharess,0.1
num_hrefs,0.09
LDA_03,0.08
kw_avg_avg,0.07
self_reference_min_shares,0.07
kw_min_avg,0.06
n_tokens_content,0.06
self_reference_max_shares,0.05
kw_max_avg,0.05
num_imgs,0.05


In [6]:
# Drop the least important features

least_important_features = [
'global_rate_negative_words',
'rate_positive_words',
'rate_negative_words',
'is_weekend',
'min_positive_polarity',
'max_positive_polarity',
'global_sentiment_polarity',
'avg_negative_polarity',
'min_negative_polarity',
'abs_title_subjectivity',
'LDA_00',
'timedelta',
'weekday_is_sunday',
'kw_min_min',
'n_non_stop_words',
'average_token_length',
'num_keywords',
'data_channel_is_lifestyle',
'data_channel_is_entertainment',
'data_channel_is_bus',
'data_channel_is_socmed',
'data_channel_is_tech',
'kw_max_min',
'weekday_is_saturday',
'kw_min_max',
'kw_max_max',
'n_tokens_title',
'weekday_is_monday',
'weekday_is_tuesday',
'weekday_is_wednesday',
'weekday_is_sunday',
'weekday_is_thursday',
'weekday_is_friday',
'abs_title_sentiment_polarity'
]

X_train = X_train.drop(least_important_features, axis=1)
X_test = X_test.drop(least_important_features, axis=1)

## KNN - sklearn

In [7]:
knn = KNeighborsRegressor(n_neighbors=6, weights='uniform')
knn.fit(X_train, y_train)

runtime_start = time.process_time()
runtime_end = time.process_time()

y_pred = knn.predict(X_test)
runtime = runtime_end - runtime_start

# Calculate the metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mae, mse, r2, runtime

(3166.312514714291,
 125856818.66668348,
 -0.0796551543529489,
 3.190000001040971e-05)

## Gradient Descent - sklearn

In [8]:
gd = SGDRegressor()

start = time.process_time()
gd.fit(X_train, y_train)
gd_pred = gd.predict(X_test)
end = time.process_time()

gd_runtime = end - start

gd_mae = mean_absolute_error(y_test, gd_pred)
gd_mse = mean_squared_error(y_test, gd_pred)
gd_r2 = r2_score(y_test, gd_pred)

gd_mae, gd_mse, gd_r2, gd_runtime

(3055.7504875313716,
 114249799.38556391,
 0.01991496291438133,
 0.13502783699999554)

## Decision Tree Regression - sklearn

In [9]:
dtr = DecisionTreeRegressor()

start = time.process_time()
dtr.fit(X_train, y_train)
dtr_pred = dtr.predict(X_test)
end = time.process_time()

dtr_runtime = end - start

dtr_mae = mean_absolute_error(y_test, dtr_pred)
dtr_mse = mean_squared_error(y_test, dtr_pred)
dtr_r2 = r2_score(y_test, dtr_pred)

dtr_mae, dtr_mse, dtr_r2, dtr_runtime

(4243.018666128544, 196174292.20139238, -0.6828693746637835, 2.655865828000003)