In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.stats import ttest_ind, pearsonr, chi2_contingency, f_oneway

data = pd.read_csv('https://raw.githubusercontent.com/HasanRoknabady/dataset-popularity-/main/OnlineNewsPopularity.csv')

X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Test for difference in means between two groups
weekday_shares = X_train[y_train == 0][:, -1]
weekend_shares = X_train[y_train == 1][:, -1]

t_stat, p_value = ttest_ind(weekday_shares, weekend_shares)

print(f"t-statistic: {t_stat:.3f}, p-value: {p_value:.3f}")

# Test for correlation between two variables
num_imgs = X_train[:, 3]
num_videos = X_train[:, 4]

corr_coef, p_value = pearsonr(num_imgs, num_videos)

print(f"Pearson correlation coefficient: {corr_coef:.3f}, p-value: {p_value:.3f}")

# Test for difference in proportions of sentiment (positive/negative) between articles with short and long titles
avg_length = np.mean(X_train[:, 2])
short_sentiment = X_train[X_train[:, 2] < avg_length][:, -2]
long_sentiment = X_train[X_train[:, 2] >= avg_length][:, -2]

contingency_table = [[sum(short_sentiment == 0), sum(short_sentiment == 1)],
                     [sum(long_sentiment == 0), sum(long_sentiment == 1)]]

# add a small constant to each cell in the contingency table
contingency_table = np.array(contingency_table) + 0.5

chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)

print(f"chi-square statistic: {chi2_stat:.3f}, p-value: {p_value:.3f}")


# Test for difference in means between more than two groups
lifestyle_shares = X_train[X_train[:, 11] == 1][:, -1]
entertainment_shares = X_train[X_train[:, 12] == 1][:, -1]
business_shares = X_train[X_train[:, 13] == 1][:, -1]
social_media_shares = X_train[X_train[:, 14] == 1][:, -1]

f_stat, p_value = f_oneway(lifestyle_shares, entertainment_shares, business_shares, social_media_shares)

print(f"F-statistic: {f_stat:.3f}, p-value: {p_value:.3f}")


# Test for correlation between two categorical variables
weekday_category = X_train[y_train == 0][:, 10]
weekend_category = X_train[y_train == 1][:, 10]

contingency_table = pd.crosstab(weekday_category, weekend_category)

chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)

print(f"chi-square statistic: {chi2_stat:.3f}, p-value: {p_value:.3f}")



t-statistic: nan, p-value: nan
Pearson correlation coefficient: -0.002, p-value: 0.729


NameError: name 'np' is not defined