In [80]:
!pip install requests beautifulsoup4 pandas nltk textblob textstat openpyxl



In [79]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from textblob import TextBlob
from textstat import textstat
from google.colab import files
import os
import io
import re
import logging

logging.basicConfig(filename='error.log', level=logging.ERROR)
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [81]:
# Upload and access the input.xlsx file
uploaded = files.upload()

file_name = next(iter(uploaded))
file_content = uploaded[file_name]
input_data = pd.read_excel(io.BytesIO(file_content))

Saving Input.xlsx to Input (3).xlsx


In [120]:
# Function to extract article text from URL
def extract_article_text(url):
    encodings_to_try = ['utf-8', 'latin-1', 'ISO-8859-1']
    for encoding in encodings_to_try:
        try:
            response = requests.get(url)
            response.raise_for_status()
            decoded_content = response.content.decode(encoding)
            soup = BeautifulSoup(decoded_content, 'html.parser')

            # Remove unwanted elements
            for element in soup(['header', 'footer', 'script', 'style']):
                element.decompose()

            # Extract title
            title = soup.find('h1').get_text(strip=True) if soup.find('h1') else ''

            # Extract paragraphs in the article
            paragraphs = soup.find_all('p')
            text = ' '.join([p.get_text(strip=True) for p in paragraphs])

            # Additional cleaning
            text = re.sub(r'\s+', ' ', text).strip()

            return title, text
        except requests.exceptions.HTTPError as http_err:
            logging.error(f"HTTP error occurred for URL {url}: {http_err}")
            return None, None
        except UnicodeDecodeError:
            logging.error(f"Error decoding content from {url} with encoding {encoding}. Trying next encoding.")
            continue
        except Exception as e:
            logging.error(f"Error extracting data from {url} with encoding {encoding}: {e}")
    logging.error(f"Failed to extract data from {url} using any encoding")
    return None, None

In [122]:
# Function to beautify and save articles
def beautify_and_save_articles(data):
    if not os.path.exists('beautified_articles'):
        os.makedirs('beautified_articles')

    for article in data:
        url_id = article['URL_ID']
        title = article['Title']
        text = article['Text']

        # Center align the title
        formatted_title = f"{title.center(80)}\n\n"

        # Add a blank line after the title
        formatted_text = formatted_title + text.replace("\n", "\n\n")

        # Center align the end line info
        end_line_info = "Contact us: hello@blackcoffer.com © All Right Reserved, Blackcoffer(OPC) Pvt. Ltd"
        formatted_end_line = f"{end_line_info.center(80)}\n\n"

        # Add end line info with space
        formatted_text += formatted_end_line

        # Save the beautified article as TXT file
        with open(f"beautified_articles/{url_id}.txt", 'w', encoding='utf-8') as file:
            file.write(formatted_text)

        # Save the raw text article
        with open(f"articles/{url_id}.txt", 'w', encoding='utf-8') as file:
            file.write(f"{title}\n\n{text}")


In [111]:
# Function to clean text using stop words
def clean_text(text):
    stop_words = set(nltk.corpus.stopwords.words('english'))
    words = word_tokenize(text)
    cleaned_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(cleaned_words)

In [121]:
# Function to perform text analysis
def text_analysis(text):
    try:
        cleaned_text = clean_text(text)
        sentences = sent_tokenize(cleaned_text)
        words = word_tokenize(cleaned_text)
        total_words = len(words)
        total_sentences = len(sentences)
        avg_sentence_length = total_words / total_sentences if total_sentences > 0 else 0
        syllable_count = sum([textstat.syllable_count(word) for word in words])
        avg_word_length = sum(len(word) for word in words) / total_words if total_words > 0 else 0
        personal_pronouns = len(re.findall(r'\b(I|we|my|ours|us)\b', text, re.I))

        complex_word_count = sum(1 for word in words if textstat.syllable_count(word) >= 3)
        percentage_complex_words = (complex_word_count / total_words) * 100 if total_words > 0 else 0

        fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

        # Perform sentiment analysis
        blob = TextBlob(text)
        polarity_score = blob.sentiment.polarity
        subjectivity_score = blob.sentiment.subjectivity

        return {
            'POSITIVE SCORE': polarity_score,
            'NEGATIVE SCORE': 1 - polarity_score,
            'POLARITY SCORE': polarity_score,
            'SUBJECTIVITY SCORE': subjectivity_score,
            'AVG SENTENCE LENGTH': avg_sentence_length,
            'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
            'FOG INDEX': fog_index,
            'AVG NUMBER OF WORDS PER SENTENCE': avg_sentence_length,
            'COMPLEX WORD COUNT': complex_word_count,
            'WORD COUNT': total_words,
            'SYLLABLE PER WORD': syllable_count / total_words if total_words > 0 else 0,
            'PERSONAL PRONOUNS': personal_pronouns,
            'AVG WORD LENGTH': avg_word_length
        }
    except Exception as e:
        logging.error(f"Error in text analysis: {e}")
        return {}

In [123]:
# Create directory for saving articles
if not os.path.exists('articles'):
    os.makedirs('articles')

# Create directory for saving beautified articles
if not os.path.exists('beautified_articles'):
    os.makedirs('beautified_articles')

# Extract data for each URL
extracted_data = []
for index, row in input_data.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    title, text = extract_article_text(url)
    if title and text:
        extracted_data.append({'URL_ID': url_id, 'URL': url, 'Title': title, 'Text': text})

# Analyze text for each extracted article
analyzed_data = []
for data in extracted_data:
    try:
        analysis_result = text_analysis(data['Text'])
        merged_data = {**data, **analysis_result}  # Merge analysis results with original data
        analyzed_data.append(merged_data)
    except Exception as e:
        logging.error(f"Error analyzing data for URL {data['URL']}: {e}")

# Beautify and save articles
beautify_and_save_articles(analyzed_data)


ERROR:root:HTTP error occurred for URL https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/: 404 Client Error: Not Found for url: https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/
ERROR:root:HTTP error occurred for URL https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/: 404 Client Error: Not Found for url: https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/


In [124]:
# Check if all expected columns are present in analyzed_data
required_columns = ['URL_ID', 'URL', 'Title', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
                    'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS',
                    'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT',
                    'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH']

# Convert to DataFrame with desired column order
output_df = pd.DataFrame(analyzed_data)
for column in required_columns:
    if column not in output_df.columns:
        output_df[column] = None

# Reorder dataFrame columns
output_df = output_df[required_columns]

# Save Excel file with specified sheet name
output_df.to_excel('Output Data Structure.xlsx', index=False)

# Download output file
files.download('Output Data Structure.xlsx')

# Download beautified text files
for article in extracted_data:
    url_id = article['URL_ID']
    files.download(f"beautified_articles/{url_id}.txt")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>