In [20]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import time

def _text_or_na(article, tag, css_class):
    """Return the stripped text of the first ``tag`` with class ``css_class`` inside ``article``, or "N/A"."""
    node = article.find(tag, class_=css_class)
    # Test the very element we are about to read: checking for *any* <tag>
    # is not enough, because one without the expected class yields None here.
    return node.get_text(strip=True) if node is not None else "N/A"


def scrape_npr_with_tags(url, num_articles=100):
    """Scrape article section, headline and summary from a dynamically loaded page.

    The page is opened in a Chrome WebDriver and scrolled to the bottom
    ``num_articles // 10`` times (pausing 2 seconds after each scroll) so
    that lazily loaded content can appear. The resulting HTML is then parsed
    and the first ``num_articles`` ``<article>`` elements are inspected.

    Parameters
    ----------
    url : str
        Address of the page to scrape.
    num_articles : int, default 100
        Upper bound on the number of ``<article>`` elements examined; also
        determines how many times the page is scrolled.

    Returns
    -------
    list[list[str]]
        One ``[section, headline, summary]`` list per article that has an
        ``<h2 class="title">`` headline. A missing ``<h3 class="slug">`` or
        ``<p class="teaser">`` is recorded as ``"N/A"``. Articles without a
        headline are skipped.

    Raises
    ------
    Exception
        Any error raised by Selenium while loading the page (for example the
        10-second wait for an ``<article>`` element expiring) propagates to
        the caller; the browser is closed first.
    """
    # Set up the Selenium WebDriver
    driver = webdriver.Chrome()
    try:
        driver.get(url)

        for _ in range(num_articles // 10):
            driver.find_element(By.XPATH, '//body').send_keys(Keys.END)
            time.sleep(2)  # Allow time for content to load

        # Make sure at least one article is present before reading the HTML
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//article')))

        # Get the HTML content after dynamic loading
        page_source = driver.page_source
    finally:
        # Always release the browser, even when loading or waiting failed
        driver.quit()

    # Parse the HTML content of the page
    soup = BeautifulSoup(page_source, 'html.parser')

    rows = []
    for article in soup.find_all('article')[:num_articles]:
        headline = _text_or_na(article, 'h2', 'title')
        if headline == "N/A":
            # No usable headline: skip this article but keep processing the rest
            continue
        rows.append([
            _text_or_na(article, 'h3', 'slug'),
            headline,
            _text_or_na(article, 'p', 'teaser'),
        ])

    return rows


# Text Classification Model
def classify_news_articles(scraped_data):
    """Train and evaluate a Naive Bayes section classifier on scraped articles.

    The article summaries are turned into TF-IDF features and a
    ``MultinomialNB`` model is fitted to predict each article's section.
    10% of the rows are held out for evaluation (``random_state=42``).

    Parameters
    ----------
    scraped_data : sequence of [section, headline, summary] rows, or None
        Rows in the layout produced by ``scrape_npr_with_tags``.

    Returns
    -------
    tuple
        ``(evaluation_report, accuracy, report)`` where ``evaluation_report``
        is a DataFrame with ``Actual`` and ``Predicted`` columns for the
        held-out rows, ``accuracy`` is the accuracy score on those rows and
        ``report`` is the text produced by ``classification_report``.
        ``(None, None, None)`` is returned, after printing a message, when
        ``scraped_data`` is ``None`` or has fewer than two rows.
    """
    # At least two rows are needed so that both the train and the test split
    # end up non-empty; None is treated the same as "no data".
    if scraped_data is None or len(scraped_data) < 2:
        print("Insufficient data for training and testing. Please scrape more articles.")
        return None, None, None

    # Convert the list to a DataFrame for easier handling
    scraped_df = pd.DataFrame(scraped_data, columns=['section', 'headline', 'summary'])

    # Split the data into training and testing sets. With two or more rows
    # and test_size=0.1 neither side can be empty, so no further size check
    # is required after the split.
    X_train, X_test, y_train, y_test = train_test_split(
        scraped_df['summary'],
        scraped_df['section'],
        test_size=0.1,
        random_state=42
    )

    # The vocabulary is learned from the training summaries only
    tfidf_vectorizer = TfidfVectorizer()
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)

    classifier = MultinomialNB()
    classifier.fit(X_train_tfidf, y_train)
    y_pred = classifier.predict(X_test_tfidf)

    # zero_division=0 reports 0 for sections that have no predicted or no
    # true samples (the same value as the default) without emitting an
    # undefined-metric warning for each of them.
    report = classification_report(y_test, y_pred, zero_division=0)
    accuracy = accuracy_score(y_test, y_pred)

    evaluation_report = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

    return evaluation_report, accuracy, report


# Main script
def main():
    """Scrape each configured NPR section and save the rows to a numbered CSV.

    For every entry in the section list, the section page is scraped with
    ``scrape_npr_with_tags`` (up to 100 articles) and the result is written
    to ``scraped_data<N>.csv`` in the current working directory. ``N`` starts
    at 1 and increases by one for each file written.
    """
    # Section paths on the news website, relative to the site root
    section_list = ['sections/news/', 'sections/national/', 'sections/world/', 'sections/climate/', 'sections/health/']

    count = 1
    for section_path in section_list:
        npr_url = f"https://www.npr.org/{section_path}"

        # Scrape news articles
        scraped_data = scrape_npr_with_tags(npr_url, num_articles=100)
        if scraped_data is None:
            continue

        # Store the scraped data in the project folder
        scraped_df = pd.DataFrame(scraped_data, columns=['section', 'headline', 'summary'])
        # NOTE(review): the later analysis cell prints text with mis-decoded
        # punctuation (e.g. "â€”"), which is what a BOM-less UTF-8 CSV looks
        # like when opened in Excel. Writing a BOM ('utf-8-sig') lets Excel
        # detect the encoding; confirm this matches how the workbook is made.
        scraped_df.to_csv(f'scraped_data{count}.csv', index=False, encoding='utf-8-sig')
        count += 1



if __name__ == "__main__":
    main()


In [23]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the scraped articles from an Excel workbook in the working directory.
# NOTE(review): the scraping cell writes scraped_data<N>.csv files, not this
# workbook; presumably it is assembled from those CSVs by hand. Confirm, so
# that the notebook can be re-run from a fresh kernel.
file_path = 'scraped_data.xlsx'
df = pd.read_excel(file_path)

# Drop every row that has a missing value in any column
df = df.dropna()
# Display the first few rows of the DataFrame
print(df.head())

# Resources needed by stopwords.words() and word_tokenize(). Recent NLTK
# releases load the Punkt tokenizer from 'punkt_tab' instead of 'punkt', so
# both are fetched to keep word_tokenize working across NLTK versions.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# English stop words as a set for fast membership tests, and the stemmer
# shared by preprocess_text
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

def preprocess_text(text):
    """Normalise ``text`` into a space-separated string of stemmed words.

    The text is split with ``word_tokenize``. Tokens that are not purely
    alphabetic, or whose lower-cased form is in ``stop_words``, are dropped.
    Every remaining token is lower-cased and stemmed with ``ps``.
    """
    stems = []
    for token in word_tokenize(text):
        # Discard numbers, punctuation and mixed tokens such as "cease-fire"
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered in stop_words:
            continue
        stems.append(ps.stem(lowered))
    return ' '.join(stems)

# Add a cleaned, stemmed version of every summary as a new column
df['processed_summary'] = df['summary'].apply(preprocess_text)


print(df.head())


# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# the same on every run
X_train, X_test, y_train, y_test = train_test_split(df['processed_summary'], df['section'], test_size=0.2, random_state=42)


# TF-IDF features limited to 5000 terms; the vocabulary is fitted on the
# training summaries only and then reused for the test summaries
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)



# SVC model
classifier = SVC(kernel='linear')  # You can experiment with different kernels

classifier.fit(X_train_tfidf, y_train)

# predictions on the test set
y_pred = classifier.predict(X_test_tfidf)

# model evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Display classification report. zero_division=0 reports 0 for sections with
# no predicted or no true samples (the same value as the default) without
# emitting an undefined-metric warning for each of them.
print(classification_report(y_test, y_pred, zero_division=0))

                            section  \
0                          Business   
1  Middle East crisis â€” explained   
2            The Unmarked Graveyard   
3                       Middle East   
4  Middle East crisis â€” explained   

                                            headline  \
0  OpenAI reinstates Sam Altman as its chief exec...   
1  Qatar says Israel and Hamas agree to a cease-f...   
2  He disappeared in 1995. His mother's search le...   
3                   Middle East crisis â€” explained   
4  Pause in war gives hope to families of Israeli...   

                                             summary  
0  November 22, 2023 â€¢The company, maker of the...  
1  November 21, 2023 â€¢Qatar's foreign ministry ...  
2  November 22, 2023 â€¢LaMont Dottin was a fresh...  
3  November 22, 2023 â€¢The conflict between Isra...  
4  November 22, 2023 â€¢Hamas and Israel agreed t...  
                            section  \
0                          Business   
1  Middle East crisi

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sanket\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sanket\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
