In [13]:

# UPWORTHY EDITORIAL ASSISTANT - COMPLETE EDA TO MODELING PIPELINE
# Comprehensive analysis from data exploration to model building

import pandas as pd
import numpy as np
import requests
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
# Silence every warning for the rest of the session so library chatter does
# not clutter the cell outputs.
warnings.filterwarnings(action='ignore')

# Plot appearance: select matplotlib's 'default' style first, then the seaborn
# "husl" palette. Keep this order -- both calls adjust matplotlib settings.
plt.style.use('default')
sns.set_palette(palette="husl")

# =================================================================
# 1. DATA ACQUISITION
# =================================================================

def download_upworthy_dataset(data_dir="upworthy_data"):
    """Prepare the local folder for the Upworthy Research Archive dataset.

    Despite its name, this function performs no network transfer: it only
    makes sure the target directory exists and prints a banner. The archive
    CSV has to be placed in that directory by other means.

    Parameters
    ----------
    data_dir : str or pathlib.Path, default "upworthy_data"
        Directory that should hold the dataset files. The default matches the
        location that was previously hard-coded, so existing calls without
        arguments behave as before.

    Returns
    -------
    pathlib.Path
        The data directory, which exists when the function returns.
    """
    data_dir = Path(data_dir)
    # parents=True lets a nested location be created in one call;
    # exist_ok=True keeps re-running the cell harmless.
    data_dir.mkdir(parents=True, exist_ok=True)

    print("UPWORTHY RESEARCH ARCHIVE DATASET")
    print("=" * 50)
    print("Downloading 32,000+ headline A/B tests...")

    return data_dir
    
# Load the exploratory split of the archive from the local data folder.
# The location is assembled with pathlib so the separator is right on every
# OS; a raw string with a backslash only resolves on Windows.
csv_path = Path("upworthy_data") / "upworthy-archive-exploratory-packages-03.12.2020.csv"
df = pd.read_csv(csv_path)

# Quick sanity check: first rows, row count, and (rows, columns).
display(df.head())
print(f"Dataset size: {len(df)} headlines")

display(df.shape)

Unnamed: 0.1,Unnamed: 0,created_at,updated_at,clickability_test_id,excerpt,headline,lede,slug,eyecatcher_id,impressions,clicks,significance,first_place,winner,share_text,square,test_week
0,0,2014-11-20 06:43:16.005,2016-04-02 16:33:38.062,546d88fb84ad38b2ce000024,Things that matter. Pass 'em on.,They're Being Called 'Walmart's Worst Nightmar...,"<p>When I saw *why* people are calling them ""W...",theyre-being-called-walmarts-worst-nightmare-a...,546d6fa19ad54eec8d00002d,3052,150,100.0,True,True,Anyone who's ever felt guilty about shopping a...,,201446
1,1,2014-11-20 06:43:44.646,2016-04-02 16:25:54.021,546d88fb84ad38b2ce000024,Things that matter. Pass 'em on.,They're Being Called 'Walmart's Worst Nightmar...,"<p>When I saw *why* people are calling them ""W...",theyre-being-called-walmarts-worst-nightmare-a...,546d6fa19ad54eec8d00002d,3033,122,14.0,False,False,Walmart is getting schooled by another retaile...,,201446
2,2,2014-11-20 06:44:59.804,2016-04-02 16:25:54.024,546d88fb84ad38b2ce000024,Things that matter. Pass 'em on.,They're Being Called 'Walmart's Worst Nightmar...,"<p>When I saw *why* people are calling them ""W...",theyre-being-called-walmarts-worst-nightmare-a...,546d6fa19ad54eec8d00002d,3092,110,1.8,False,False,Walmart may not be crapping their pants over t...,,201446
3,3,2014-11-20 06:54:36.335,2016-04-02 16:25:54.027,546d902c26714c6c44000039,Things that matter. Pass 'em on.,This Is What Sexism Against Men Sounds Like,<p>DISCLOSURE: I'm a dude. I have cried on mul...,this-is-what-sexism-against-men-sounds-like-am...,546bc55335992b86c8000043,3526,90,4.1,False,False,"If you ever wondered, ""but what about the men?...",,201446
4,4,2014-11-20 06:54:57.878,2016-04-02 16:31:45.671,546d902c26714c6c44000039,Things that matter. Pass 'em on.,This Is What Sexism Against Men Sounds Like,<p>DISCLOSURE: I'm a dude. I have cried on mul...,this-is-what-sexism-against-men-sounds-like-am...,546d900426714cd2dd00002e,3506,120,100.0,True,False,"If you ever wondered, ""but what about the men?...",,201446


Dataset size: 22666 headlines


(22666, 17)

In [14]:
# =================================================================
# 2. BASIC DATA EXPLORATION
# =================================================================

print("\n" + "="*60)
print("BASIC DATA EXPLORATION")
print("="*60)

# Dataset overview
print(f"Dataset shape: {df.shape}")
# deep=True also counts the contents of object columns; result is in MiB.
memory_usage = df.memory_usage(deep=True).sum() / 1024**2
# print (not display): display() on a str shows its quoted repr.
print(f"Memory usage: {memory_usage:.2f} MB")

# Display first few rows
print("\nFirst few rows:")
display(df.head())

# Dataset info
print("\nDataset info:")
# info() writes its report itself and returns None, so it is called bare;
# wrapping it in display() only adds a stray "None" to the output.
df.info()

# Column types analysis
print("\nColumn types breakdown:")
dtype_counts = df.dtypes.value_counts()
for dtype in dtype_counts.index:
    cols = df.select_dtypes(include=[dtype]).columns.tolist()
    print(f"{dtype}: {len(cols)} columns")
    # Lists longer than 10 names are cut to the first 5 plus a count of the rest.
    truncated = len(cols) > 10
    display_cols = cols[:5] if truncated else cols
    remaining = len(cols) - 5 if truncated else 0
    suffix = f"... (and {remaining} more)" if truncated else ""
    print(f"  {display_cols}{suffix}")

# Basic statistics
print("\nBasic statistics:")
# include='all' summarises non-numeric columns too (count/unique/top/freq).
display(df.describe(include='all'))


BASIC DATA EXPLORATION
Dataset shape: (22666, 17)


'Memory usage: 25.04 MB'


First few rows:


Unnamed: 0.1,Unnamed: 0,created_at,updated_at,clickability_test_id,excerpt,headline,lede,slug,eyecatcher_id,impressions,clicks,significance,first_place,winner,share_text,square,test_week
0,0,2014-11-20 06:43:16.005,2016-04-02 16:33:38.062,546d88fb84ad38b2ce000024,Things that matter. Pass 'em on.,They're Being Called 'Walmart's Worst Nightmar...,"<p>When I saw *why* people are calling them ""W...",theyre-being-called-walmarts-worst-nightmare-a...,546d6fa19ad54eec8d00002d,3052,150,100.0,True,True,Anyone who's ever felt guilty about shopping a...,,201446
1,1,2014-11-20 06:43:44.646,2016-04-02 16:25:54.021,546d88fb84ad38b2ce000024,Things that matter. Pass 'em on.,They're Being Called 'Walmart's Worst Nightmar...,"<p>When I saw *why* people are calling them ""W...",theyre-being-called-walmarts-worst-nightmare-a...,546d6fa19ad54eec8d00002d,3033,122,14.0,False,False,Walmart is getting schooled by another retaile...,,201446
2,2,2014-11-20 06:44:59.804,2016-04-02 16:25:54.024,546d88fb84ad38b2ce000024,Things that matter. Pass 'em on.,They're Being Called 'Walmart's Worst Nightmar...,"<p>When I saw *why* people are calling them ""W...",theyre-being-called-walmarts-worst-nightmare-a...,546d6fa19ad54eec8d00002d,3092,110,1.8,False,False,Walmart may not be crapping their pants over t...,,201446
3,3,2014-11-20 06:54:36.335,2016-04-02 16:25:54.027,546d902c26714c6c44000039,Things that matter. Pass 'em on.,This Is What Sexism Against Men Sounds Like,<p>DISCLOSURE: I'm a dude. I have cried on mul...,this-is-what-sexism-against-men-sounds-like-am...,546bc55335992b86c8000043,3526,90,4.1,False,False,"If you ever wondered, ""but what about the men?...",,201446
4,4,2014-11-20 06:54:57.878,2016-04-02 16:31:45.671,546d902c26714c6c44000039,Things that matter. Pass 'em on.,This Is What Sexism Against Men Sounds Like,<p>DISCLOSURE: I'm a dude. I have cried on mul...,this-is-what-sexism-against-men-sounds-like-am...,546d900426714cd2dd00002e,3506,120,100.0,True,False,"If you ever wondered, ""but what about the men?...",,201446



Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22666 entries, 0 to 22665
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            22666 non-null  int64  
 1   created_at            22666 non-null  object 
 2   updated_at            22666 non-null  object 
 3   clickability_test_id  22666 non-null  object 
 4   excerpt               20249 non-null  object 
 5   headline              22666 non-null  object 
 6   lede                  22654 non-null  object 
 7   slug                  22666 non-null  object 
 8   eyecatcher_id         22644 non-null  object 
 9   impressions           22666 non-null  int64  
 10  clicks                22666 non-null  int64  
 11  significance          22666 non-null  float64
 12  first_place           22666 non-null  bool   
 13  winner                22666 non-null  bool   
 14  share_text            3208 non-null   object 
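 15  square                7446 non-null   object 
 16  test_week             22666 non-null  int64  
dtypes: bool(2), float64(1), int64(4), object(10)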

None

'\nColumn types breakdown:'

object: 10 columns
  ['created_at', 'updated_at', 'clickability_test_id', 'excerpt', 'headline', 'lede', 'slug', 'eyecatcher_id', 'share_text', 'square']
int64: 4 columns
  ['Unnamed: 0', 'impressions', 'clicks', 'test_week']
bool: 2 columns
  ['first_place', 'winner']
float64: 1 columns
  ['significance']

Basic statistics:


Unnamed: 0.1,Unnamed: 0,created_at,updated_at,clickability_test_id,excerpt,headline,lede,slug,eyecatcher_id,impressions,clicks,significance,first_place,winner,share_text,square,test_week
count,22666.0,22666,22666,22666,20249,22666,22654,22666,22644,22666.0,22666.0,22666.0,22666,22666,3208,7446,22666.0
unique,,22665,22666,4873,3507,12387,4309,22334,9678,,,,2,2,1978,2777,
top,,2013-03-27 19:45:06,2016-04-02 16:25:54.018,53480581be4154443a000008,Things that matter. Pass 'em on.,They Put 2 Guys In A Room And Made Them Argue ...,"<p>News cycles are depressing, but this dude, ...",the-highly-compelling-case-for-wealth-redistri...,5332f852a2fc90ead00013ee,,,,False,False,"""It isn't cute. It isn't funny. You've talked ...",thumb-2.jpg,
freq,,2,1,14,11396,22,39,4,27,,,,17843,21514,26,60,
mean,75219.08537,,,,,,,,,3574.700035,54.319598,40.733826,,,,,201417.975382
std,43728.705113,,,,,,,,,1437.94853,46.839176,39.670098,,,,,56.344639
min,0.0,,,,,,,,,13.0,0.0,0.0,,,,,201303.0
25%,37068.25,,,,,,,,,2740.0,24.0,2.8,,,,,201402.0
50%,74987.5,,,,,,,,,3122.0,42.0,25.7,,,,,201430.0
75%,112952.75,,,,,,,,,4091.0,70.0,85.4,,,,,201444.0
