## 1. Import Dependencies


In [None]:
import os
import time
import nltk
import requests
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('punkt')

## 2. Scrape Website

In [2]:
# Scrape: extract the textual information from a website.
def scrape_website(url, label):
    """Download one page and pull out the text fields used for classification.

    Args:
        url: Address of the page to fetch.
        label: Category to record alongside the scraped text.

    Returns:
        A dict with the keys 'url', 'title', 'meta_description', 'content',
        'headers' and 'category', or None when the request fails or the
        server answers with a status code other than 200.
    """
    # Mimic a real browser by sending a User-Agent header (some websites block automated requests).
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
    }

    # Only the network call can raise RequestException, so keep the try narrow.
    try:
        response = requests.get(url, headers=request_headers, timeout=45)
    except requests.RequestException as e:
        print(f"Error scraping {url}: {e}")
        return None

    if response.status_code != 200:
        print(f"Failed to load {url}. Status code: {response.status_code}")
        return None

    # Parse the HTML document.
    soup = BeautifulSoup(response.content, 'html.parser')

    # get_text() instead of .string: .string is None when <title> has nested tags.
    title = soup.title.get_text(" ", strip=True) if soup.title else ""

    # Extract the meta description (if present).
    meta_description = ""
    meta_tag = soup.find('meta', attrs={'name': 'description'})
    if meta_tag and 'content' in meta_tag.attrs:
        meta_description = meta_tag['content']

    # A space separator is used both inside each element and between elements.
    # With an empty separator, "<p>Hello <b>world</b></p><p>Bye</p>" would
    # collapse to "HelloworldBye" and the word boundaries would be lost.
    main_content = " ".join(
        p.get_text(" ", strip=True) for p in soup.find_all('p')
    )

    # Heading text (h1-h3), kept in its own variable so it does not shadow
    # the HTTP request headers above.
    heading_text = " ".join(
        h.get_text(" ", strip=True) for h in soup.find_all(['h1', 'h2', 'h3'])
    )

    return {
        'url': url,
        'title': title,
        'meta_description': meta_description,
        'content': main_content,
        'headers': heading_text,
        'category': label,
    }
        


## 3. Data Preprocessing

In [3]:
def preprocess_data(df):
    """Build the normalised 'text' column that features are extracted from.

    The title, meta description, paragraph content and heading text are
    joined with single spaces (missing values count as empty strings),
    lower-cased, and stripped of ASCII punctuation.

    The 'category' column is deliberately NOT part of the text: it is the
    prediction target, and including it would leak the label into the
    features and inflate every evaluation metric.

    Args:
        df: DataFrame with 'title', 'meta_description', 'content' and
            'headers' columns. It is modified in place.

    Returns:
        The same DataFrame, with the 'text' column added or replaced.
    """
    text_columns = ['title', 'meta_description', 'content', 'headers']

    # Gather all descriptive text (handling NaN).
    combined = df[text_columns[0]].fillna('')
    for column in text_columns[1:]:
        combined = combined + " " + df[column].fillna('')

    # Convert to lower-case.
    combined = combined.str.lower()

    # Remove punctuation (.,?!-'" and the rest of string.punctuation).
    df['text'] = combined.str.translate(str.maketrans('', '', string.punctuation))

    return df

## 4. Extract Features

In [4]:
def extract_features(df):
    """Turn the 'text' column of *df* into a TF-IDF feature matrix.

    Unigrams and bigrams are used, English stop words are dropped, and the
    vocabulary is capped at 10000 terms. The fitted vectorizer is not kept;
    only the transformed matrix is returned.
    """
    tfidf = TfidfVectorizer(
        max_features=10000,
        stop_words='english',
        ngram_range=(1, 2),
    )
    return tfidf.fit_transform(df['text'])

## 5. Train Model

In [5]:
# Train-Model
def train_model(features, labels):
    """Fit a random-forest text classifier on an 80/20 stratified split.

    Args:
        features: Feature matrix, one row per website.
        labels: Category label for each row of *features*.

    Returns:
        A tuple (model, X_test, y_test): the fitted classifier plus the
        held-out features and labels for evaluation.
    """
    # Websites are classified by the text they contain. The forest is seeded
    # like the split below so that repeated runs give the same model and the
    # same report.
    model = RandomForestClassifier(random_state=42)

    # 'stratify' keeps the category proportions of the test set the same as in
    # the full data, so every category is tested fairly.
    # NOTE(review): stratified splitting needs at least two samples per
    # category - confirm the dataset guarantees that.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, stratify=labels, random_state=42
    )

    model.fit(X_train, y_train)

    return model, X_test, y_test

## 6. Evaluate Model

In [6]:
# Evaluate model
def evaluate_model(model, X_test, y_test):
    """Print a classification report for *model* on the held-out data."""
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred)
    print(report)

## 7. Save and Load Scraped Data

In [7]:
def save_to_csv(df, filename):
    """Write *df* to *filename* as CSV (without the index) and report it."""
    df.to_csv(path_or_buf=filename, index=False)
    confirmation = f"Data saved to {filename}"
    print(confirmation)

In [8]:
def load_from_csv(filename):
    """Read the CSV file at *filename* and return it as a DataFrame."""
    loaded = pd.read_csv(filename)
    return loaded

In [21]:
# End-to-end pipeline: scrape (or reuse a cached scrape), vectorise, train, evaluate.
output_file = 'scraped_data.csv'

if not os.path.exists(output_file):
    # No cached scrape yet: read the (url, category) pairs and scrape each site.
    df = pd.read_csv('data.csv')
    urls = df['url'].tolist()
    labels = df['category'].tolist()
    data = []

    for url, label in zip(urls, labels):
        website_data = scrape_website(url, label)
        if website_data:
            data.append(website_data)
        time.sleep(2)  # Add a delay between requests

    # Fail with a clear message instead of an opaque KeyError from
    # preprocessing an empty, column-less DataFrame.
    if not data:
        raise RuntimeError(
            "No websites could be scraped; check network access and the URLs in data.csv"
        )

    df = pd.DataFrame(data)
    # Shuffle the rows.
    df = df.sample(frac=1).reset_index(drop=True)
    # Data preprocessing.
    df = preprocess_data(df)

    # Save the scraped and preprocessed data so later runs can skip scraping.
    save_to_csv(df, output_file)
else:
    # Load the previously scraped data
    print(f"Loading previously scraped data from {output_file}")
    df = load_from_csv(output_file)

# Extract features
features = extract_features(df)

# Train and evaluate the model
model, X_test, Y_test = train_model(features, df['category'])
evaluate_model(model, X_test, Y_test)

Loading previously scraped data from scraped_data.csv
               precision    recall  f1-score   support

   automotive       1.00      1.00      1.00         2
   e-commerce       1.00      1.00      1.00         4
    education       0.67      0.80      0.73         5
entertainment       0.60      1.00      0.75         3
      finance       1.00      0.50      0.67         2
 food & drink       0.83      1.00      0.91         5
       health       1.00      0.75      0.86         4
    lifestyle       1.00      0.75      0.86         4
         news       1.00      0.80      0.89         5
  real estate       1.00      1.00      1.00         3
       sports       1.00      1.00      1.00         5
   technology       0.75      0.75      0.75         4
       travel       1.00      1.00      1.00         2

     accuracy                           0.88        48
    macro avg       0.91      0.87      0.88        48
 weighted avg       0.90      0.88      0.88        48

