In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats import ttest_ind, f_oneway, chi2_contingency
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# Load the dataset
url = "https://raw.githubusercontent.com/HasanRoknabady/dataset-popularity-/main/OnlineNewsPopularity.csv"
df = pd.read_csv(url)

# Extract the relevant features
X = df.iloc[:, 2:-2].values
y = df.iloc[:, -1].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Test for difference in means between two groups
short_shares = X_train[X_train[:, 2] < X_train[:, 2].mean()][:, -1]
long_shares = X_train[X_train[:, 2] >= X_train[:, 2].mean()][:, -1]
t_stat, p_value = ttest_ind(short_shares, long_shares)
print(f"t-statistic: {t_stat:.3f}, p-value: {p_value:.3f}")

# Test for difference in means between more than two groups
popularities = [X_train[X_train[:, 2] < 1000][:, -1],
                X_train[(X_train[:, 2] >= 1000) & (X_train[:, 2] < 10000)][:, -1],
                X_train[(X_train[:, 2] >= 10000) & (X_train[:, 2] < 100000)][:, -1],
                X_train[X_train[:, 2] >= 100000][:, -1]]
f_stat, p_value = f_oneway(*popularities)
print(f"F-statistic: {f_stat:.3f}, p-value: {p_value:.3f}")

# Test for association between two categorical variables
avg_length = X_train[:, 2].mean()
short_sentiment = X_train[X_train[:, 2] < avg_length][:, -2]
long_sentiment = X_train[X_train[:, 2] >= avg_length][:, -2]
contingency_table = [[sum(short_sentiment == 0), sum(short_sentiment == 1)],
                     [sum(long_sentiment == 0), sum(long_sentiment == 1)]]
chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)
print(f"chi-square statistic: {chi2_stat:.3f}, p-value: {p_value:.3f}")

# Test for effect of scaling on t-test results
scalers = [('StandardScaler', StandardScaler()),
           ('MinMaxScaler', MinMaxScaler()),
           ('RobustScaler', RobustScaler())]

for scaler_name, scaler in scalers:
    X_train_scaled = scaler.fit_transform(X_train)
    short_shares = X_train_scaled[X_train_scaled[:, 2] < X_train_scaled[:, 2].mean()][:, -1]
    long_shares = X_train_scaled[X_train_scaled[:, 2] >= X_train_scaled[:, 2].mean()][:, -1]
    t_stat, p_value = ttest_ind(short_shares, long_shares)
    print(f"{scaler_name} - t-statistic: {t_stat:.3f}, p-value: {p_value:.3f}")


t-statistic: -0.352, p-value: 0.725
F-statistic: nan, p-value: nan
chi-square statistic: 3.795, p-value: 0.051
StandardScaler - t-statistic: -0.352, p-value: 0.725
MinMaxScaler - t-statistic: -0.352, p-value: 0.725




RobustScaler - t-statistic: -0.352, p-value: 0.725
