# Data Extraction


In [4]:
# Importing necessary libraries
from bs4 import BeautifulSoup
import pandas as pd
import requests, os
from time import time
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [5]:
# Record the wall-clock start so the total run time can be reported after scraping
start = time()
# Load the list of articles to fetch from input.xlsx into a DataFrame
file = pd.read_excel(io='input.xlsx')

In [6]:
# Summarise the loaded sheet: number of rows, column names, non-null counts and dtypes
file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114 entries, 0 to 113
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   URL_ID  114 non-null    int64 
 1   URL     114 non-null    object
dtypes: int64(1), object(1)
memory usage: 1.9+ KB


In [7]:
# Download every article listed in `file` (the DataFrame loaded from input.xlsx)
# and store its title and body text as articles/<URL_ID>.txt.

# Exact class lists of the <div> containers whose direct <p> children make up
# the article body. BeautifulSoup returns `class` as a list, hence lists here.
ARTICLE_DIV_CLASSES = (
    ['tdb-block-inner', 'td-fix-index'],
    ['td-post-content', 'tagdiv-type'],
)

# Create the output directory once, up front, instead of checking per article
os.makedirs('articles', exist_ok=True)

for index, row in file.iterrows():
    URL_ID, URL = row['URL_ID'], row['URL']

    # Get the response from the URL. A timeout or connection failure on one
    # URL must not abort the whole crawl, so report it and move on.
    try:
        resp = requests.get(URL, timeout=5)
    except requests.RequestException as exc:
        print(f'Error for URL_ID: {URL_ID}: {exc}')
        continue
    if resp.status_code != 200:
        print(f'Error for URL_ID: {URL_ID} with status code: {resp.status_code}')
        continue

    # Parse the response using BeautifulSoup
    soup = BeautifulSoup(resp.content, 'html.parser')

    # Get the h1 of the article; pages without an <h1> get an empty title
    # rather than raising AttributeError on None.
    heading = soup.find('h1')
    title = heading.text.strip() if heading is not None else ''

    # Keep only <p> tags that are direct children of one of the article
    # container <div>s; everything else is navigation, footer, etc.
    paragraphs = [
        p.text.strip()
        for p in soup.find_all('p')
        if p.parent.name == 'div' and p.parent.get('class') in ARTICLE_DIV_CLASSES
    ]
    article_text = ''.join(f'{paragraph}\n' for paragraph in paragraphs)

    print(f'URL_ID: {URL_ID}, words: {len(article_text.split())}')

    # Save the article text in a file named after URL_ID. The handle is
    # deliberately NOT called `file`: that name is the input DataFrame, and
    # rebinding it would break any re-run of this cell.
    with open(f'articles/{URL_ID}.txt', 'w', encoding='utf-8') as article_file:
        article_file.write(f'Title: {title}\n\n{article_text}')
    print(f'txt file created with URL ID: {URL_ID}, title: {title}')


# Time taken in minutes (measured from `start`, set when the input was loaded)
print(f'Time taken: {(time() - start) / 60} min')

URL_ID: 37, words: 1773
txt file created with URL ID: AI in healthcare to Improve Patient Outcomes
URL_ID: 38, words: 1401
txt file created with URL ID: What if the Creation is Taking Over the Creator?
URL_ID: 39, words: 1684
txt file created with URL ID: What Jobs Will Robots Take From Humans in The Future?
URL_ID: 40, words: 1588
txt file created with URL ID: Will Machine Replace The Human in the Future of Work?
URL_ID: 41, words: 1690
txt file created with URL ID: Will AI Replace Us or Work With Us?
URL_ID: 42, words: 1232
txt file created with URL ID: Will machine replace the human in the future of work?
URL_ID: 43, words: 726
txt file created with URL ID: How humans and machines are evolving to work together?
Error for URL_ID: 44 with status code: 404
URL_ID: 45, words: 698
txt file created with URL ID: How machine learning will affect your business?
URL_ID: 46, words: 2148
txt file created with URL ID: Deep learning impact on areas of e-learning?
URL_ID: 47, words: 1835
txt file 

URL_ID: 115, words: 1497
txt file created with URL ID: COVID-19: How have countries been responding?
URL_ID: 116, words: 869
txt file created with URL ID: How will COVID-19 affect the world of work?
URL_ID: 117, words: 1730
txt file created with URL ID: Lessons from the past: Some key learnings relevant to the coronavirus crisis
URL_ID: 118, words: 1238
txt file created with URL ID: Lessons from the past: Some key learnings relevant to the coronavirus crisis
URL_ID: 119, words: 329
txt file created with URL ID: Coronavirus: Impact on the Hospitality Industry
URL_ID: 120, words: 2134
txt file created with URL ID: Why scams like Nirav Modi Happen with Indian banks?
URL_ID: 121, words: 1591
txt file created with URL ID: IMPACT OF COVID-19 ON THE GLOBAL ECONOMY
URL_ID: 122, words: 1694
txt file created with URL ID: Impact of coronavirus on the Indian economy
URL_ID: 123, words: 1800
txt file created with URL ID: Global Economy effected by Coronavirus
URL_ID: 124, words: 999
txt file create

# Data Analysis

In [13]:
import pandas as pd
import nltk
import spacy
from textblob import TextBlob
import pyphen
import statistics
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Load the input data
input_data = pd.read_excel("input.xlsx")

# Metric columns produced by this cell. They are created up front (as NaN) so
# that they exist even for articles whose text file is missing.
METRIC_COLUMNS = [
    "Positive Score", "Negative Score", "Polarity Score", "Subjectivity Score",
    "Average Sentence Length", "Percentage of Complex Words", "Fog Index",
    "Average Number of Words Per Sentence", "Complex Word Count", "Word Count",
    "Syllable per word", "Personal Pronouns", "Average Word Length",
]
for column in METRIC_COLUMNS:
    input_data[column] = float("nan")

# Pronouns counted towards the "Personal Pronouns" metric
PERSONAL_PRONOUNS = {'i', 'me', 'my', 'mine', 'myself', 'you', 'your', 'yours', 'yourself'}
# A word is "complex" when it has at least this many syllables (Gunning fog definition)
COMPLEX_WORD_MIN_SYLLABLES = 3

# Loop-invariant resources: loading these once instead of once per article
# avoids re-reading the spaCy model and hyphenation dictionary for every row.
hyphenator = pyphen.Pyphen(lang='en')
nlp = spacy.load('en_core_web_sm')
sentiment_analyzer = SentimentIntensityAnalyzer()


def count_syllables(word):
    """Estimate the syllables in `word` as the number of pyphen hyphenation points plus one."""
    return hyphenator.inserted(word).count('-') + 1


# Perform text analysis
for index, url_id in input_data["URL_ID"].items():
    file_path = f"articles/{url_id}.txt"
    # Only the file read is guarded, so an unrelated error is never mistaken
    # for a missing article.
    try:
        with open(file_path, "r", encoding="utf-8") as article_file:
            text = article_file.read()
    except FileNotFoundError:
        print(f"File not found for URL ID: {url_id}. Skipping iteration.")
        continue

    sentences = nltk.sent_tokenize(text)
    tokens = nltk.word_tokenize(text)
    # word_tokenize also emits punctuation tokens; keep only tokens containing
    # at least one letter or digit so they do not distort the word metrics.
    words = [token for token in tokens if any(ch.isalnum() for ch in token)]
    if not sentences or not words:
        # Nothing to measure; leave the row as NaN instead of dividing by zero
        print(f"No analysable text for URL ID: {url_id}. Skipping iteration.")
        continue

    # Sentiment. VADER supplies the positive/negative proportions of the text
    # (each in [0, 1]); TextBlob supplies polarity and subjectivity. Each
    # analyser is evaluated once per article.
    vader_scores = sentiment_analyzer.polarity_scores(text)
    blob_sentiment = TextBlob(text).sentiment

    # Readability
    syllables_per_word = [count_syllables(word) for word in words]
    complex_word_count = sum(
        1 for syllables in syllables_per_word if syllables >= COMPLEX_WORD_MIN_SYLLABLES
    )
    percentage_of_complex_words = 100 * complex_word_count / len(words)
    average_words_per_sentence = len(words) / len(sentences)
    # Gunning fog index: 0.4 * (words per sentence + percentage of complex words)
    fog_index = 0.4 * (average_words_per_sentence + percentage_of_complex_words)

    # Personal pronouns, restricted to tokens spaCy tags as pronouns
    personal_pronouns = sum(
        1
        for token in nlp(text)
        if token.pos_ == 'PRON' and token.text.lower() in PERSONAL_PRONOUNS
    )

    metrics = {
        "Positive Score": vader_scores["pos"],
        "Negative Score": vader_scores["neg"],
        "Polarity Score": blob_sentiment.polarity,
        "Subjectivity Score": blob_sentiment.subjectivity,
        # Mean sentence length measured in characters
        "Average Sentence Length": statistics.mean([len(sentence) for sentence in sentences]),
        "Percentage of Complex Words": percentage_of_complex_words,
        "Fog Index": fog_index,
        "Average Number of Words Per Sentence": average_words_per_sentence,
        "Complex Word Count": complex_word_count,
        "Word Count": len(words),
        "Syllable per word": sum(syllables_per_word) / len(words),
        "Personal Pronouns": personal_pronouns,
        "Average Word Length": sum(len(word) for word in words) / len(words),
    }
    for column, value in metrics.items():
        input_data.at[index, column] = value



File not found for URL ID: 44. Skipping iteration.
File not found for URL ID: 57. Skipping iteration.
File not found for URL ID: 144. Skipping iteration.


In [16]:
# Pick the identifier plus every computed metric, in report order, and export
# the result to an Excel workbook without the DataFrame index.
output_data = input_data.loc[:, [
    "URL_ID",
    "Positive Score",
    "Negative Score",
    "Polarity Score",
    "Subjectivity Score",
    "Average Sentence Length",
    "Percentage of Complex Words",
    "Fog Index",
    "Average Number of Words Per Sentence",
    "Complex Word Count",
    "Word Count",
    "Syllable per word",
    "Personal Pronouns",
    "Average Word Length",
]]
output_data.to_excel("output_file.xlsx", index=False)
