In [2]:
import requests
from bs4 import BeautifulSoup

url = "https://edition.cnn.com/"

# Bind the result name up front so later cells still find `headers` (as an
# empty list) when the page could not be retrieved.
headers = []

# Without a timeout, requests.get() can block indefinitely on a stalled connection.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')

    # Every anchor carrying the nav-item link class.
    header_links = soup.find_all('a', class_='header__nav-item-link')

    # Keep only the visible, whitespace-trimmed text of each anchor.
    headers = [link.text.strip() for link in header_links]

    for header in headers:
        print(header)
else:
    print("Failed to retrieve the web page")



US
World
Politics
Business
Opinion
Health
Entertainment
Style
Travel
Sports
Video
More


In [3]:
# Drop the "More" menu entry by value rather than by position: slicing off the
# last element would remove one more item every time this cell is re-run.
headers = [name for name in headers if name != "More"]

In [4]:
# Build a new list holding every entry of `headers` except the final one;
# `headers` itself is not modified.
headers1 = list(headers)
if headers1:
    headers1.pop()

In [5]:
# Lower-case every section name and show the resulting list.
headers2 = list(map(str.lower, headers1))
print(headers2)


['us', 'world', 'politics', 'business', 'opinion', 'health', 'entertainment', 'style', 'travel', 'sports']


In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

base_url = "https://edition.cnn.com/"
base_url1 = "https://edition.cnn.com"  # base_url without the trailing slash; prepended to hrefs that are not absolute
url = base_url

# Seconds to wait for any single HTTP response; without a timeout a stalled
# connection would block the whole crawl indefinitely.
REQUEST_TIMEOUT = 30

# One (section, headline, url, article text) tuple per matching anchor.
all_headlines = []

# Article text keyed by URL (None when the download failed), so a URL that is
# linked by more than one anchor is only downloaded once.
article_text_by_url = {}


def _fetch_article_text(article_url):
    """Download one article page and return the text of its <p> tags joined by spaces.

    Newlines in the joined text are replaced by spaces. Returns None, after
    printing a message, when the request raises or the status is not 200.
    """
    try:
        content_response = requests.get(article_url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # One unreachable article should not abort the whole crawl.
        print(f"Failed to retrieve content for {article_url} ({exc})")
        return None

    if content_response.status_code != 200:
        print(f"Failed to retrieve content for {article_url}")
        return None

    content_soup = BeautifulSoup(content_response.text, 'html.parser')
    content_text = ' '.join(p.text for p in content_soup.find_all('p'))
    return content_text.replace('\n', ' ')


for header in headers2:
    # Section landing page, e.g. base_url + "world".
    full_url = base_url + header
    try:
        response = requests.get(full_url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Failed to retrieve the web page for {header} ({exc})")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve the web page for {header}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    parent_elements = soup.find_all('a', class_="container__link container__link--type-article container_lead-plus-headlines__link")  # Adjust class as needed

    for parent_element in parent_elements:
        headline_text = parent_element.text.strip()

        url = parent_element.get('href')
        if not url:
            # Anchor without an href: there is nothing to follow.
            continue

        if not url.startswith('http'):
            url = base_url1 + url

        if url not in article_text_by_url:
            article_text_by_url[url] = _fetch_article_text(url)

        content_text = article_text_by_url[url]
        if content_text is not None:
            all_headlines.append((header, headline_text, url, content_text))

df = pd.DataFrame(all_headlines, columns=["Header", "Headline", "URL", "Content"])

print(df)


Failed to retrieve content for https://edition.cnn.com/cnn-underscored/travel/united-elite-status-changes
Failed to retrieve content for https://edition.cnn.com/cnn-underscored/travel/away-holiday-collection-launch
    Header                                           Headline  \
0       us  Mohammad Zanoun/Middle East Images/AFP/Getty I...   
1       us  5 things to know for Nov. 10: Israel, Senate, ...   
2       us  Federal jury convicts former Baltimore prosecu...   
3       us  NYPD investigating potential ‘bias incident’ a...   
4       us  Man is convicted of murder in deaths of 2 Dall...   
..     ...                                                ...   
213  style  Nelson Mandela, the family man: New book offer...   
214  style                                 Ben Birchall/PA/AP   
215  style  Why an artist turned 6,000 unwanted copies of ...   
216  style  ‘A masterpiece rediscovered’: Unseen Monet pai...   
217  style        Egypt’s pyramids host stunning works of art   

    

In [8]:
# Show the article text stored in the second row (position 1).
df['Content'].iat[1]

'       A sprawling 61-acre site in Maryland was selected as the new location for the FBI’s headquarters, but some of the bureau’s leaders and state lawmakers are expressing concerns over the choice.           Here’s what else you need to know to\xa0Get Up to Speed and On with Your Day.     If your day doesn’t start until you’re up to speed on the latest headlines, then let us introduce you to your new favorite morning fix. Sign up here for the ‘5 Things’ newsletter.         Israeli forces have intensified their ground and air campaign on Gaza in recent days, with officials claiming to have reached “the heart” of the besieged enclave’s main city. This comes as Israel has agreed to daily pauses of military operations in parts of northern Gaza, the White House said, allowing people to travel for aid and relief. On Thursday alone, around 80,000 people fled northern Gaza through an evacuation corridor, an Israeli official said, compared with 50,000 people Wednesday. While CNN cannot indepe

In [21]:
# Show the URL stored in the second row (position 1).
df['URL'].iat[1]

'https://edition.cnn.com/2023/11/10/us/5-things-to-know-for-nov-10-israel-senate-suspicious-letters-mortgage-rates-school-vaccines/index.html'

In [9]:
# Write the scraped rows to disk, leaving the row index out of the file.
df.to_csv(path_or_buf='news_data.csv', index=False)


In [10]:
# (number of rows, number of columns) of the scraped DataFrame.
df.shape

(218, 4)

In [14]:
# Number of rows per section label, most frequent first.
df.loc[:, 'Header'].value_counts()

world       75
business    35
us          28
health      25
politics    24
opinion     19
style       12
Name: Header, dtype: int64

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV


# Target: the section each article was scraped from.
labels = df['Header']

X = df[['Content', 'Headline']]
y = labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(max_features=10000)

# Each document is the article body followed by its headline. The vocabulary
# is fitted on the training split only.
X_train_tfidf = vectorizer.fit_transform(X_train['Content'] + ' ' + X_train['Headline'])

# The test split is transformed once here; it does not depend on the
# classifier, so there is no need to recompute it inside the loop below.
X_test_tfidf = vectorizer.transform(X_test['Content'] + ' ' + X_test['Headline'])

param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

param_grid_nb = {
    'alpha': [0.1, 0.5, 1.0, 2.0]
}

param_grid_svc = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}

param_grid_lr = {
    'C': [0.1, 1, 10],
    'max_iter': [50, 100, 200],
    'solver': ['liblinear', 'lbfgs']
}


# Seed the estimators that accept a random_state so repeated runs of this
# cell are comparable (same seed as the train/test split).
classifiers = {
    'RandomForest': (RandomForestClassifier(random_state=42), param_grid_rf),
    'NaiveBayes': (MultinomialNB(), param_grid_nb),
    'SVM': (SVC(), param_grid_svc),
    'LogisticRegression': (LogisticRegression(random_state=42), param_grid_lr)
}

# Every fitted search is kept so the overall winner can be picked after the loop.
fitted_searches = {}

for clf_name, (clf, param_grid) in classifiers.items():
    grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train_tfidf, y_train)
    fitted_searches[clf_name] = grid_search

    print(f"Best Parameters for {clf_name}: ", grid_search.best_params_)
    print(f"Best Accuracy for {clf_name}: ", grid_search.best_score_)

    best_classifier = grid_search.best_estimator_

    y_pred = best_classifier.predict(X_test_tfidf)

    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)

    print(f'Accuracy for {clf_name}: {accuracy}')
    print(classification_rep)
    print("\n")

# Without this step `grid_search` / `best_classifier` would simply be whichever
# classifier was fitted last. Rebind them to the search with the highest
# cross-validated accuracy (model selection uses CV scores, not the test split)
# so any later use of these names refers to the actual best model.
best_clf_name = max(fitted_searches, key=lambda name: fitted_searches[name].best_score_)
grid_search = fitted_searches[best_clf_name]
best_classifier = grid_search.best_estimator_
print(f"Selected model (highest cross-validated accuracy): {best_clf_name}")


Best Parameters for RandomForest:  {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Best Accuracy for RandomForest:  0.6722689075630253
Accuracy for RandomForest: 0.7954545454545454
              precision    recall  f1-score   support

    business       0.89      1.00      0.94         8
      health       1.00      0.80      0.89         5
     opinion       1.00      0.67      0.80         6
    politics       0.67      0.80      0.73         5
       style       1.00      0.75      0.86         4
          us       1.00      0.20      0.33         5
       world       0.65      1.00      0.79        11

    accuracy                           0.80        44
   macro avg       0.89      0.75      0.76        44
weighted avg       0.85      0.80      0.78        44



Best Parameters for NaiveBayes:  {'alpha': 0.1}
Best Accuracy for NaiveBayes:  0.6205042016806723
Accuracy for NaiveBayes: 0.6818181818181818
              precision    recall  f1-s

In [28]:
# Evaluate the estimator held by the GridSearchCV object currently bound to
# `grid_search` on the held-out TF-IDF matrix.
# NOTE(review): confirm `grid_search` refers to the intended model — the
# comparison loop rebinds that name for every classifier it fits.
best_classifier = grid_search.best_estimator_
y_pred_best = best_classifier.predict(X_test_tfidf)

accuracy_best = accuracy_score(y_true=y_test, y_pred=y_pred_best)
classification_rep_best = classification_report(y_true=y_test, y_pred=y_pred_best)

# Actual and predicted labels side by side; the spreadsheet is written
# without the row index.
results_df_best = pd.DataFrame(data={'Actual': y_test, 'Predicted': y_pred_best})
results_df_best.to_excel('best_model_testing_evaluation.xlsx', index=False)

print(f'Accuracy for Best Model: {accuracy_best}')
print(classification_rep_best)

Accuracy for Best Model: 0.7954545454545454
              precision    recall  f1-score   support

    business       0.89      1.00      0.94         8
      health       0.83      1.00      0.91         5
     opinion       1.00      0.50      0.67         6
    politics       0.67      0.80      0.73         5
       style       1.00      0.75      0.86         4
          us       1.00      0.20      0.33         5
       world       0.69      1.00      0.81        11

    accuracy                           0.80        44
   macro avg       0.87      0.75      0.75        44
weighted avg       0.84      0.80      0.77        44



In [29]:
# Display the actual vs. predicted label for every row of the test split.
results_df_best

Unnamed: 0,Actual,Predicted
100,world,world
215,style,style
139,business,business
178,opinion,opinion
15,us,world
154,business,business
170,opinion,world
73,world,world
207,style,style
140,business,business
