In [None]:
# Load the Online News Popularity CSV into `data`.
data <- read.csv(file = "/kaggle/input/online-news-popularity/OnlineNewsPopularity.csv")

In [None]:
library(caret)
library(MASS)
library(DAAG)
library(glmnet)
library(leaps)
install.packages("lmtest")
library(lmtest)
library(dplyr)
library(glmnet)
library(stats)


In [None]:
# Compact overview of every column's type and first values.
utils::str(data)

In [None]:
# List the column names of the raw data frame.
names(data)

In [None]:
# Remove rows whose n_tokens_content is 0.
zero_content <- data$n_tokens_content == 0

# Report whether any such rows exist before dropping them.
has_zeros <- any(zero_content)
print(has_zeros)

# Keep only rows with a non-zero token count and continue with that frame.
df_no_zeros <- data[!zero_content, ]
data <- df_no_zeros

In [None]:
# Data cleaning ----
# Work on a copy (`data1`) so `data` itself is left as it is.
data1 <- data

# Drop, in a single pass, every column that is excluded from modelling.
data1 <- subset(
  data1,
  select = -c(
    url, timedelta, n_non_stop_words,
    LDA_00, LDA_02, LDA_01, LDA_03, LDA_04,
    is_weekend,
    kw_avg_min, kw_avg_max, kw_min_avg, kw_max_avg, kw_avg_avg,
    self_reference_avg_sharess,
    global_sentiment_polarity,
    global_rate_positive_words, global_rate_negative_words
  )
)

# Replace n_non_stop_unique_tokens by log(value + 1).
data1[["n_non_stop_unique_tokens"]] <- log(data1[["n_non_stop_unique_tokens"]] + 1)


In [None]:
# Rows and columns remaining after cleaning.
c(nrow(data1), ncol(data1))

In [None]:
# Interaction feature: product of n_unique_tokens and the (already
# log-transformed) n_non_stop_unique_tokens column.
data1[["useful_unique_non_tokens"]] <- with(
  data1,
  n_unique_tokens * n_non_stop_unique_tokens
)

In [None]:
# Report every pair of columns in `data1` whose correlation exceeds 0.7.
# cor() is applied to the whole frame, so all columns of data1 must be numeric.
cor_matrix <- cor(data1)

# Set the diagonal to NA to avoid self-correlation
diag(cor_matrix) <- NA

# Matrix positions (row, col) of all correlations above the threshold.
# Only positive correlations are tested; strongly negative pairs are not
# reported by this check.
high_cor_indices <- which(cor_matrix > 0.7, arr.ind = TRUE)

# The matrix is symmetric, so keep only positions above the diagonal to list
# each pair once. drop = FALSE keeps the two-column shape even with 0 or 1 hit.
upper_pairs <- high_cor_indices[
  high_cor_indices[, "row"] < high_cor_indices[, "col"],
  ,
  drop = FALSE
]

# Build the result table in one step (no rbind() inside a loop); this also
# yields an empty three-column table instead of an error when nothing exceeds
# the threshold.
high_cor_results <- data.frame(
  Column1 = colnames(cor_matrix)[upper_pairs[, "row"]],
  Column2 = colnames(cor_matrix)[upper_pairs[, "col"]],
  Correlation = cor_matrix[upper_pairs],
  row.names = NULL
)

# Print the high correlation pairs and their correlation values
print(high_cor_results)


    19. kw_min_min:                    Worst keyword (min. shares)
    20. kw_max_min:                    Worst keyword (max. shares)
    21. kw_avg_min:                    Worst keyword (avg. shares)
    22. kw_min_max:                    Best keyword (min. shares)
    23. kw_max_max:                    Best keyword (max. shares)
    24. kw_avg_max:                    Best keyword (avg. shares)
    25. kw_min_avg:                    Avg. keyword (min. shares)
    26. kw_max_avg:                    Avg. keyword (max. shares)
    27. kw_avg_avg:                    Avg. keyword (avg. shares)
    28. self_reference_min_shares:     Min. shares of referenced articles in Mashable
    29. self_reference_max_shares:     Max. shares of referenced articles in Mashable
    30. self_reference_avg_sharess:    Avg. shares of referenced articles in Mashable
    44. global_subjectivity:           Text subjectivity
    45. global_sentiment_polarity:     Text sentiment polarity
    46. global_rate_positive_words:    Rate of positive words in the content
    47. global_rate_negative_words:    Rate of negative words in the content
    48. rate_positive_words:           Rate of positive words among non-neutral tokens
    49. rate_negative_words:           Rate of negative words among non-neutral tokens
    50. avg_positive_polarity:         Avg. polarity of positive words
    51. min_positive_polarity:         Min. polarity of positive words
    52. max_positive_polarity:         Max. polarity of positive words
    53. avg_negative_polarity:         Avg. polarity of negative  words
    54. min_negative_polarity:         Min. polarity of negative  words
    55. max_negative_polarity:         Max. polarity of negative  words

In [None]:
# Keep the cleaned frame under the name used by the modelling cells.
df_1 <- data1
c(nrow(df_1), ncol(df_1))

In [None]:
# Show the column names of both frames (df_1 was assigned from data1).
names(data1)
names(df_1)

In [None]:
# Rows and columns of `data`, for comparison with the cleaned frame.
c(nrow(data), ncol(data))

In [None]:
# Baseline linear model: `shares` regressed on every other column of df_1.
simple_model <- lm(formula = shares ~ ., data = df_1)
summary(object = simple_model)

In [None]:
# Variance inflation factors for the baseline linear model.
# NOTE(review): vif() presumably resolves to DAAG::vif, since DAAG is the
# package attached above that provides it - confirm.
vif_values <- vif(simple_model)

# Predictor term labels of the model. names(simple_model) would return the
# components of the lm object (e.g. "coefficients", "residuals") rather than
# the predictors, so the labels are read from the model terms instead.
predictor_names <- attr(terms(simple_model), "term.labels")

# format(..., scientific = FALSE) turns the VIFs into fixed-notation strings,
# so the VIF column is character and intended for display only.
vif_df <- data.frame(Variable = names(vif_values), VIF = format(vif_values, scientific = FALSE))

# Print the formatted VIF values
print(vif_df)

In [None]:
# Same specification as the baseline model, but with log(shares) as response.
model1 <- lm(formula = log(shares) ~ ., data = df_1)
summary(object = model1)

In [None]:
# Pull the residual vector stored on the fitted log-model and peek at it.
residuals <- model1[["residuals"]]
head(x = residuals)

In [None]:
# Histogram of the log-model residuals; `breaks = 5` requests about 5 bins.
hist(x = model1$residuals, breaks = 5)


In [None]:
# Standard diagnostic plots for the fitted log-model.
plot(x = model1)

In [None]:
# model2 <- lm(sqrt(shares)~.,data = df_1)
# summary(model2)

In [None]:
# Backward stepwise selection by AIC, starting from the full log-model.
stepwise.model <- stepAIC(object = model1, direction = "backward")

In [None]:
# Coefficients and fit statistics of the model kept by the stepwise search.
summary(object = stepwise.model)


In [None]:
# Column names available in df_1 for the feature selection below.
names(df_1)

In [None]:
# Response vector: the `shares` column of df_1.
y <- df_1[["shares"]]

# Predictor frame: the hard-coded subset of df_1 columns used by the
# penalised regression cells.
x <- df_1[c(
  "n_tokens_title", "n_unique_tokens", "n_non_stop_unique_tokens",
  "num_hrefs", "num_self_hrefs", "num_imgs", "num_videos",
  "average_token_length", "num_keywords",
  "data_channel_is_lifestyle", "data_channel_is_entertainment",
  "data_channel_is_bus", "data_channel_is_socmed",
  "data_channel_is_tech", "data_channel_is_world",
  "kw_min_min", "kw_max_min", "kw_max_max",
  "self_reference_min_shares", "self_reference_max_shares",
  "weekday_is_monday", "weekday_is_tuesday", "weekday_is_wednesday",
  "weekday_is_thursday", "weekday_is_friday",
  "global_subjectivity", "min_positive_polarity", "avg_negative_polarity",
  "title_subjectivity", "title_sentiment_polarity",
  "abs_title_subjectivity", "abs_title_sentiment_polarity"
)]


In [None]:
# Natural log of the response; used as the target of the penalised models.
y1 <- log(x = y)


In [None]:
# Centre and scale every predictor column; scale() returns a numeric matrix,
# so `x` is a matrix from here on.
x <- scale(x, center = TRUE, scale = TRUE)
head(x)

In [None]:
# Cross-validated lasso (alpha = 1) of log-shares on the scaled predictors.
# The seed fixes the random fold assignment made by cv.glmnet().
set.seed(123)
lasso <- cv.glmnet(x = x, y = y1, family = "gaussian", alpha = 1)

# Cross-validation error curve across the lambda path.
plot(lasso)

In [None]:
# Show the cross-validation summary of the lasso fit. summary() has no
# cv.glmnet method and would only list the Length/Class/Mode of the object's
# components, whereas print() reports the selected lambdas with their CV error.
print(lasso)

In [None]:
# Lasso fit quality at the lambda with the lowest cross-validated error.
# NOTE: predictions are made on the same `x` the model was fitted on, so
# these are in-sample metrics, not estimates of out-of-sample performance.
best_lambda <- lasso$lambda.min

# Make predictions using the best lambda
lasso_predictions <- predict(lasso, newx = x, s = best_lambda)

# Calculate Mean Squared Error (MSE)
mse <- mean((lasso_predictions - y1)^2)

# Calculate Root Mean Squared Error (RMSE)
rmse <- sqrt(mse)

# Calculate R-squared (R²) as 1 - SSE / SST. Dividing the MSE (denominator n)
# by var(y1) (denominator n - 1) would mix two different normalisations.
r_squared <- 1 - sum((lasso_predictions - y1)^2) / sum((y1 - mean(y1))^2)

# Print or display the metrics
cat("Mean Squared Error (MSE):", mse, "\n")
cat("Root Mean Squared Error (RMSE):", rmse, "\n")
cat("R-squared (R²):", r_squared, "\n")

In [None]:
# Fit Ridge model (alpha = 0) of log-shares on the scaled predictors.
# The seed fixes the random fold assignment made by cv.glmnet().
set.seed(123)
ridge <- cv.glmnet(x, y1, family = "gaussian", alpha = 0)

# Choose the best lambda (penalty parameter) based on cross-validation
best_lambda <- ridge$lambda.min

# Make predictions using the best lambda.
# NOTE: predictions are made on the same `x` the model was fitted on, so the
# metrics below are in-sample.
ridge_predictions <- predict(ridge, newx = x, s = best_lambda)

# Calculate Mean Squared Error (MSE)
mse_ridge <- mean((ridge_predictions - y1)^2)

# Calculate Root Mean Squared Error (RMSE)
rmse_ridge <- sqrt(mse_ridge)

# Calculate R-squared (R²) as 1 - SSE / SST. Dividing the MSE (denominator n)
# by var(y1) (denominator n - 1) would mix two different normalisations.
r_squared_ridge <- 1 - sum((ridge_predictions - y1)^2) / sum((y1 - mean(y1))^2)

# Print or display the metrics
cat("Ridge Mean Squared Error (MSE):", mse_ridge, "\n")
cat("Ridge Root Mean Squared Error (RMSE):", rmse_ridge, "\n")
cat("Ridge R-squared (R²):", r_squared_ridge, "\n")


In [None]:
# Fit Elastic Net model (alpha = 0.5) of log-shares on the scaled predictors.
# The seed fixes the random fold assignment made by cv.glmnet().
set.seed(123)
elastic_net <- cv.glmnet(x, y1, family = "gaussian", alpha = 0.5)

# Choose the best lambda (penalty parameter) based on cross-validation
best_lambda <- elastic_net$lambda.min

# Make predictions using the best lambda.
# NOTE: predictions are made on the same `x` the model was fitted on, so the
# metrics below are in-sample.
elastic_net_predictions <- predict(elastic_net, newx = x, s = best_lambda)

# Calculate Mean Squared Error (MSE)
mse_elastic_net <- mean((elastic_net_predictions - y1)^2)

# Calculate Root Mean Squared Error (RMSE)
rmse_elastic_net <- sqrt(mse_elastic_net)

# Calculate R-squared (R²) as 1 - SSE / SST. Dividing the MSE (denominator n)
# by var(y1) (denominator n - 1) would mix two different normalisations.
r_squared_elastic_net <- 1 - sum((elastic_net_predictions - y1)^2) / sum((y1 - mean(y1))^2)

# Print or display the metrics
cat("Elastic Net Mean Squared Error (MSE):", mse_elastic_net, "\n")
cat("Elastic Net Root Mean Squared Error (RMSE):", rmse_elastic_net, "\n")
cat("Elastic Net R-squared (R²):", r_squared_elastic_net, "\n")


[](http://)

# Simple Logistic Regression

In [None]:
library(stats)

# Response: the raw `shares` column of the cleaned frame df_1
y <- df_1$shares

# Predictors: hard-coded subset of df_1 columns
x <- df_1[, c(
    "n_tokens_title", "n_unique_tokens", "n_non_stop_unique_tokens",
    "num_hrefs", "num_self_hrefs", "num_imgs", "num_videos",
    "average_token_length", "num_keywords",
    "data_channel_is_lifestyle", "data_channel_is_entertainment",
    "data_channel_is_bus", "data_channel_is_socmed",
    "data_channel_is_tech", "data_channel_is_world",
    "kw_min_min", "kw_max_min", "kw_max_max",
    "self_reference_min_shares", "self_reference_max_shares",
    "weekday_is_monday", "weekday_is_tuesday", "weekday_is_wednesday",
    "weekday_is_thursday", "weekday_is_friday",
    "global_subjectivity", "min_positive_polarity", "avg_negative_polarity",
    "title_subjectivity", "title_sentiment_polarity",
    "abs_title_subjectivity", "abs_title_sentiment_polarity"
)]

# Categorizing 'shares' into a binary variable for y:
# 1 when an article has more than 1400 shares, 0 otherwise
y <- ifelse(y > 1400, 1, 0)

# Combine x and y into a single dataframe.
# NOTE: this reuses the name `data`, replacing whatever `data` held before.
data <- cbind(x, high_engagement = y)

# Splitting data into training (80%) and testing (20%) sets
set.seed(123) # for reproducibility
splitIndex <- createDataPartition(data$high_engagement, p = .80, list = FALSE, times = 1)
trainData <- data[splitIndex,]
testData <- data[-splitIndex,]

# Fitting the logistic regression model on the training rows
model <- glm(high_engagement ~ ., data = trainData, family = "binomial")

# Summarizing the model
summary(model)

# Making predictions on the test set: predicted probabilities, then hard
# 0/1 labels at the 0.5 cut-off
predictions <- predict(model, newdata = testData, type = "response")
predictions <- ifelse(predictions > 0.5, 1, 0)

# Evaluating model performance.
# Rows are predicted labels, columns are actual labels. Both are given fixed
# levels 0/1 so the table is always 2 x 2, even when the model predicts a
# single class; otherwise positional indexing such as [2, 2] would fail.
confusionMatrix <- table(
    Predicted = factor(predictions, levels = c(0, 1)),
    Actual = factor(testData$high_engagement, levels = c(0, 1))
)
print(confusionMatrix)


In [None]:
# Classification metrics derived from `confusionMatrix`, a table whose rows
# are the predicted labels and whose columns are the actual labels, both
# labelled "0" / "1".

# Look up one cell by its labels; a label missing from the table (e.g. the
# model never predicted that class) counts as zero instead of raising an
# out-of-bounds error.
cm_count <- function(predicted, actual) {
  if (predicted %in% rownames(confusionMatrix) &&
      actual %in% colnames(confusionMatrix)) {
    confusionMatrix[predicted, actual]
  } else {
    0
  }
}

# Extract counts from the confusion matrix.
# A false positive is predicted 1 / actual 0; a false negative is
# predicted 0 / actual 1. (Reading these from [1, 2] and [2, 1] respectively
# swaps them, which in turn swaps precision and recall.)
true_positives <- cm_count("1", "1")
true_negatives <- cm_count("0", "0")
false_positives <- cm_count("1", "0")
false_negatives <- cm_count("0", "1")

# Calculating Metrics
accuracy <- (true_positives + true_negatives) / (true_positives + true_negatives + false_positives + false_negatives)
precision <- true_positives / (true_positives + false_positives)
recall <- true_positives / (true_positives + false_negatives)
f1_score <- 2 * (precision * recall) / (precision + recall)

# Handling potential NaN values due to division by zero
if (is.nan(precision)) {
  precision <- 0
}
if (is.nan(recall)) {
  recall <- 0
}
if (is.nan(f1_score)) {
  f1_score <- 0
}

# Printing the Metrics
print(paste("Accuracy:", accuracy))
print(paste("Precision:", precision))
print(paste("Recall:", recall))
print(paste("F1 Score:", f1_score))


In [None]:
# Error metrics for the current logistic `model` on `testData`.
actuals <- testData$high_engagement
predicted_probs <- predict(model, newdata = testData, type = "response")

# Hard 0/1 labels at the 0.5 cut-off; these (not the probabilities) feed the
# MAE further down.
predicted_binary <- ifelse(predicted_probs > 0.5, 1, 0)

# MSE and RMSE compare the actual labels with the predicted probabilities.
mse <- mean((actuals - predicted_probs)^2)
rmse <- sqrt(mse)

# MAE compares the actual labels with the binarized predictions.
# NOTE(review): with 0/1 actuals each term is 0 or 1, so this value is the
# share of misclassified test rows - confirm that is the intended metric.
mae <- mean(abs(actuals - predicted_binary))

# Report the three values.
print(paste("MSE:", mse))
print(paste("RMSE:", rmse))
print(paste("MAE:", mae))


In [None]:
# Logistic regression with a 70/30 train/test split ----
library(stats)

# Response: the `shares` column of df_1.
y <- df_1[["shares"]]

# Predictors: hard-coded subset of df_1 columns.
x <- df_1[c(
  "n_tokens_title", "n_unique_tokens", "n_non_stop_unique_tokens",
  "num_hrefs", "num_self_hrefs", "num_imgs", "num_videos",
  "average_token_length", "num_keywords",
  "data_channel_is_lifestyle", "data_channel_is_entertainment",
  "data_channel_is_bus", "data_channel_is_socmed",
  "data_channel_is_tech", "data_channel_is_world",
  "kw_min_min", "kw_max_min", "kw_max_max",
  "self_reference_min_shares", "self_reference_max_shares",
  "weekday_is_monday", "weekday_is_tuesday", "weekday_is_wednesday",
  "weekday_is_thursday", "weekday_is_friday",
  "global_subjectivity", "min_positive_polarity", "avg_negative_polarity",
  "title_subjectivity", "title_sentiment_polarity",
  "abs_title_subjectivity", "abs_title_sentiment_polarity"
)]

# Binary target: 1 when shares exceed 1400, otherwise 0.
y <- ifelse(y > 1400, 1, 0)

# Modelling frame: the predictors plus the binary target as the last column.
# This reuses (and overwrites) the name `data`.
data <- x
data$high_engagement <- y

# Reproducible 70/30 split of the rows.
set.seed(123)
splitIndex <- createDataPartition(y = data$high_engagement, times = 1, p = 0.70, list = FALSE)
trainData <- data[splitIndex, ]
testData <- data[-splitIndex, ]

# Logistic regression fitted on the training rows only.
model <- glm(formula = high_engagement ~ ., family = "binomial", data = trainData)

# Training set: predicted probabilities, thresholded at 0.5 into 0/1 labels.
train_predictions <- predict(model, newdata = trainData, type = "response")
train_predictions <- ifelse(train_predictions > 0.5, 1, 0)

# Training confusion table (rows = predicted, columns = actual).
train_confusionMatrix <- table(Predicted = train_predictions, Actual = trainData$high_engagement)
print("Training Set Confusion Matrix:")
print(train_confusionMatrix)

# Test set: same procedure on the held-out rows.
test_predictions <- predict(model, newdata = testData, type = "response")
test_predictions <- ifelse(test_predictions > 0.5, 1, 0)

# Test confusion table (rows = predicted, columns = actual).
test_confusionMatrix <- table(Predicted = test_predictions, Actual = testData$high_engagement)
print("Test Set Confusion Matrix:")
print(test_confusionMatrix)


In [None]:
# k-fold cross-validation of the logistic model on the full frame ----
library(stats)
library(boot)  # provides cv.glm()

# Response: the `shares` column of df_1.
y <- df_1[["shares"]]

# Predictors: hard-coded subset of df_1 columns.
x <- df_1[c(
  "n_tokens_title", "n_unique_tokens", "n_non_stop_unique_tokens",
  "num_hrefs", "num_self_hrefs", "num_imgs", "num_videos",
  "average_token_length", "num_keywords",
  "data_channel_is_lifestyle", "data_channel_is_entertainment",
  "data_channel_is_bus", "data_channel_is_socmed",
  "data_channel_is_tech", "data_channel_is_world",
  "kw_min_min", "kw_max_min", "kw_max_max",
  "self_reference_min_shares", "self_reference_max_shares",
  "weekday_is_monday", "weekday_is_tuesday", "weekday_is_wednesday",
  "weekday_is_thursday", "weekday_is_friday",
  "global_subjectivity", "min_positive_polarity", "avg_negative_polarity",
  "title_subjectivity", "title_sentiment_polarity",
  "abs_title_subjectivity", "abs_title_sentiment_polarity"
)]

# Binary target: 1 when shares exceed 1400, otherwise 0.
y <- ifelse(y > 1400, 1, 0)

# Modelling frame: the predictors plus the binary target as the last column.
# This reuses (and overwrites) the name `data`.
data <- x
data$high_engagement <- y

# Fix the RNG so the fold assignment is reproducible.
set.seed(123)

# 5-fold cross-validation of a logistic model fitted on all rows of `data`.
# No `cost` is passed, so cv.glm() uses its default cost function.
k <- 5
cv_results <- cv.glm(
  data,
  glmfit = glm(high_engagement ~ ., data = data, family = "binomial"),
  K = k
)

# Print the cross-validation results
#print(cv_results)


In [None]:
library(pROC)

In [None]:
# Cross-validation summary plus train/test classification metrics for the
# logistic model.

# cv.glm() was called without a `cost` argument, so `delta` holds its default
# cost - the average squared difference between the 0/1 outcome and the fitted
# probability - not a deviance. delta[1] is the raw cross-validation estimate
# and delta[2] the bias-adjusted one, so they are reported separately.
# The two original variable names are kept so later code still finds them.
cv_deviance <- cv_results$delta
cv_mean_deviance <- mean(cv_deviance)
print("Cross-Validation Metrics:")
print(paste("CV prediction error (raw):", cv_deviance[1]))
print(paste("CV prediction error (adjusted):", cv_deviance[2]))

# Fit the logistic regression model on the training set
model <- glm(high_engagement ~ ., data = trainData, family = "binomial")

# Predicted probabilities on the training set; kept separately because the
# AUC must be computed on probabilities, not on thresholded labels.
train_probs <- predict(model, newdata = trainData, type = "response")
train_predictions <- ifelse(train_probs > 0.5, 1, 0)

# Training confusion table: rows = predicted, columns = actual. Fixed 0/1
# levels keep it 2 x 2 even if only one class is predicted.
train_confusionMatrix <- table(
  Predicted = factor(train_predictions, levels = c(0, 1)),
  Actual = factor(trainData$high_engagement, levels = c(0, 1))
)

# Calculate training set metrics.
# Precision = TP / all predicted positives (row 2);
# recall    = TP / all actual positives (column 2).
train_accuracy <- sum(diag(train_confusionMatrix)) / sum(train_confusionMatrix)
train_precision <- train_confusionMatrix[2, 2] / sum(train_confusionMatrix[2, ])
train_recall <- train_confusionMatrix[2, 2] / sum(train_confusionMatrix[, 2])
train_f1 <- 2 * (train_precision * train_recall) / (train_precision + train_recall)

# Calculate AUC-ROC for training set from the predicted probabilities.
# levels/direction are fixed so class 1 is the "case" and higher
# probabilities indicate class 1.
train_roc <- roc(trainData$high_engagement, train_probs, levels = c(0, 1), direction = "<")
train_auc <- auc(train_roc)

print("Training Set Metrics:")
print(paste("Accuracy:", train_accuracy))
print(paste("Precision:", train_precision))
print(paste("Recall:", train_recall))
print(paste("F1-Score:", train_f1))
print(paste("AUC-ROC:", train_auc))

# Predicted probabilities and thresholded labels on the test set
test_probs <- predict(model, newdata = testData, type = "response")
test_predictions <- ifelse(test_probs > 0.5, 1, 0)

# Test confusion table: rows = predicted, columns = actual
test_confusionMatrix <- table(
  Predicted = factor(test_predictions, levels = c(0, 1)),
  Actual = factor(testData$high_engagement, levels = c(0, 1))
)

# Calculate test set metrics (same definitions as for the training set)
test_accuracy <- sum(diag(test_confusionMatrix)) / sum(test_confusionMatrix)
test_precision <- test_confusionMatrix[2, 2] / sum(test_confusionMatrix[2, ])
test_recall <- test_confusionMatrix[2, 2] / sum(test_confusionMatrix[, 2])
test_f1 <- 2 * (test_precision * test_recall) / (test_precision + test_recall)

# Calculate AUC-ROC for test set from the predicted probabilities
test_roc <- roc(testData$high_engagement, test_probs, levels = c(0, 1), direction = "<")
test_auc <- auc(test_roc)

print("Test Set Metrics:")
print(paste("Accuracy:", test_accuracy))
print(paste("Precision:", test_precision))
print(paste("Recall:", test_recall))
print(paste("F1-Score:", test_f1))
print(paste("AUC-ROC:", test_auc))

In [None]:
# Ensure that you have the pROC package
# install.packages("pROC") # Uncomment this line if you haven't installed the package

# Load the pROC package
library(pROC)

# Recompute the inputs from the current `model` and `testData` so the curve
# always matches the model evaluated just before this cell, instead of relying
# on `actuals` / `predicted_probs` left over from an earlier cell that used a
# different train/test split.
actuals <- testData$high_engagement
predicted_probs <- predict(model, newdata = testData, type = "response")

# Generate the ROC object
roc_obj <- roc(actuals, predicted_probs)

# Plotting the ROC curve
plot(roc_obj, main="ROC Curve", col="#1c61b6", lwd=2)

# Chance-level reference line. pROC draws specificity on a reversed x-axis
# (1 -> 0) by default, so the diagonal is sensitivity = 1 - specificity,
# i.e. intercept 1 and slope -1 (abline(0, 1) would draw the opposite diagonal).
abline(a=1, b=-1, lty=2, col="red")
