In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import os

In [None]:
#Function for extracting text from URL
def text_from_URL(url, timeout=30):
  """Download an article page and return its title and body text.

  Args:
    url: Address of the page to fetch.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout a stalled connection would block the whole batch forever.

  Returns:
    A ``(title, text)`` tuple. ``title`` is the stripped ``<title>`` text, or
    an empty string when the page has no ``<title>`` element. ``text`` is the
    visible text of the article container, with a single space inserted
    between adjacent text fragments and surrounding whitespace removed.

  Raises:
    requests.RequestException: On connection errors, time-outs, or a
      4xx/5xx HTTP status.
    ValueError: When neither of the two known article containers is present
      in the page.
  """
  response = requests.get(url, timeout=timeout)
  # Fail loudly on an error status instead of scraping the error page.
  response.raise_for_status()
  soup = BeautifulSoup(response.content, 'html.parser')

  # Some pages have no <title>; fall back to an empty title rather than crash.
  title = soup.title.text.strip() if soup.title is not None else ''

  # The article body lives in one of two container layouts; try both.
  text_div = soup.find('div', class_='td-post-content tagdiv-type') or \
             soup.find('div', class_='tdb-block-inner td-fix-index')
  if text_div is None:
    raise ValueError(f"No article container found at {url}")

  # Serialise the container's children and re-parse them so that only the
  # inner content (not the wrapping <div>) contributes to the text.
  inner_html = ''.join(str(content) for content in text_div.contents)
  text = BeautifulSoup(inner_html, 'html.parser').get_text(separator=' ').strip()
  return title, text

In [None]:
#Function for creating text files
def text_files(url_id,title,text):
  """Save one article to ``<url_id>.text`` in the current working directory.

  The file is UTF-8 encoded and holds a ``Title: ...`` line, a blank line,
  and then the article text. An existing file of the same name is overwritten.
  """
  target_path = "{}.text".format(url_id)
  payload = "Title: {}\n\n{}".format(title, text)
  with open(target_path, mode='w', encoding='utf-8') as handle:
    handle.write(payload)

In [None]:
# Using Given Stopword files
def use_custom_stopwords():
  """Load the supplied stop-word lists into a single normalised set.

  Each of the seven ``StopWords_*.txt`` files is read from the current working
  directory (ISO-8859-1 encoded, one entry per line). Entries are stripped of
  surrounding whitespace and lower-cased, because the analysis step looks
  words up with ``word.lower()``; an entry stored in any other case could
  never match. Blank lines are ignored.

  Returns:
    set[str]: The lower-cased stop words from all files combined.

  Raises:
    FileNotFoundError: If any of the expected files is missing.
  """
  files = [
      'StopWords_Auditor.txt',
      'StopWords_Currencies.txt',
      'StopWords_DatesandNumbers.txt',
      'StopWords_Generic.txt',
      'StopWords_GenericLong.txt',
      'StopWords_Names.txt',
      'StopWords_Geographic.txt'
  ]
  custom_stopwords = set()

  for file in files:
    with open(file, 'r', encoding='ISO-8859-1') as nfile:
      for line in nfile.read().splitlines():
        # NOTE(review): some lists appear to annotate entries as
        # "WORD | description"; keep only the part before the first '|'.
        # Lines without a '|' are unaffected.
        word = line.split('|', 1)[0].strip().lower()
        if word:
          custom_stopwords.add(word)

  return custom_stopwords



In [None]:
# using given PosNeg files
def posNeg_words():
  """Read the positive and negative word lists from the working directory.

  ``positive-words.txt`` and ``negative-words.txt`` are opened as ISO-8859-1
  text and split into lines; every line becomes one set member, unchanged.

  Returns:
    tuple[set[str], set[str]]: ``(pos_words, neg_words)``.
  """
  def _load_lines(path):
    # One set entry per line of the file, exactly as written.
    with open(path, 'r', encoding='ISO-8859-1') as handle:
      return set(handle.read().splitlines())

  positive = _load_lines('positive-words.txt')
  negative = _load_lines('negative-words.txt')
  return positive, negative

In [None]:
# for Syllable count
def count_syllables(word):
  """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

  Every maximal run of vowels (``a e i o u y``) counts as one syllable. A
  trailing ``e`` that directly follows a consonant is treated as silent and
  removed from the count. Matching is case-insensitive.

  Args:
    word: The token to inspect.

  Returns:
    int: ``0`` for an empty string, otherwise at least ``1`` (tokens without
    any vowel, such as numbers, are counted as one syllable).
  """
  vowels = "aeiouy"
  # Callers pass tokens in their original case; normalise so that
  # capitalised words and acronyms are not under-counted.
  word = word.lower()
  if not word:
    return 0

  # One syllable per run of consecutive vowels.
  count = 0
  previous_is_vowel = False
  for char in word:
    is_vowel = char in vowels
    if is_vowel and not previous_is_vowel:
      count += 1
    previous_is_vowel = is_vowel

  # Silent final 'e' ("make"): only deduct when that 'e' opened its own vowel
  # group. After another vowel ("agree", "argue") it was never counted, so
  # deducting it would remove a real syllable.
  if len(word) > 1 and word.endswith("e") and word[-2] not in vowels:
    count -= 1

  return max(count, 1)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.metrics import edit_distance
from textblob import TextBlob
import re

# Fetch the NLTK data used by the analysis cell below: the 'punkt' sentence
# tokenizer models (for sent_tokenize) and the 'stopwords' corpus
# (for stopwords.words("english")).
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of
# 'punkt' — confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Analysis on Extracted Text files
def analysis(text):
  """Compute sentiment and readability metrics for one article.

  Words are extracted with the ``\\w+`` tokenizer and sentences with NLTK's
  ``sent_tokenize``. All ratios fall back to ``0`` when the text has no words
  (or no sentences), so empty input is safe.

  Args:
    text: The article body to analyse.

  Returns:
    dict: One entry per output column:
      * ``POSITIVE SCORE`` / ``NEGATIVE SCORE`` - percentage of words found in
        the positive / negative word list.
      * ``POLARITY SCORE`` / ``SUBJECTIVITY SCORE`` - TextBlob sentiment.
      * ``AVG SENTENCE LENGTH`` and ``AVG NUMBER OF WORDS PER SENTENCE`` -
        words divided by sentences (same value under both keys).
      * ``PERCENTAGE OF COMPLEX WORDS`` / ``COMPLEX WORD COUNT`` - words longer
        than six characters that are not stop words.
      * ``FOG INDEX`` - ``0.4 * (avg sentence length + % complex words)``.
      * ``WORD COUNT`` - number of word tokens.
      * ``SYLLABLE PER WORD`` - mean of ``count_syllables`` over all words.
      * ``PERSONAL PRONOUNS`` - see ``_count_personal_pronouns``.
      * ``AVG WORD LENGTH`` - mean number of characters per word.
  """
  stop_words = set(stopwords.words("english"))
  stop_words.update(use_custom_stopwords())

  words = RegexpTokenizer(r'\w+').tokenize(text)
  sentences = sent_tokenize(text)

  # Word and sentence count
  word_count = len(words)
  sent_count = len(sentences)

  # Every lexicon lookup below is case-insensitive; lower-case once up front.
  lowered_words = [word.lower() for word in words]

  #+ve and -ve scores
  pos_words, neg_words = posNeg_words()
  pos_count = sum(word in pos_words for word in lowered_words)
  neg_count = sum(word in neg_words for word in lowered_words)
  positive_score = pos_count / word_count * 100 if word_count > 0 else 0
  negative_score = neg_count / word_count * 100 if word_count > 0 else 0

  # Polarity Score and subjectivity score
  sentiment = TextBlob(text).sentiment
  polarity_score = sentiment.polarity
  subjectivity_score = sentiment.subjectivity

  # Average Sentence length
  average_sentence_length = word_count / sent_count if sent_count > 0 else 0

  # Complex words: longer than six characters and not a stop word
  complex_words = [
      word for word, lowered in zip(words, lowered_words)
      if len(word) > 6 and lowered not in stop_words
  ]
  complex_word_count = len(complex_words)
  perc_complex_words = (complex_word_count / word_count) * 100 if word_count > 0 else 0

  # Fog index
  fog_index = 0.4 * (average_sentence_length + perc_complex_words)

  # Syllable per word (lower-cased so capitalised words are counted correctly)
  syllables_per_word = (
      sum(count_syllables(word) for word in lowered_words) / word_count
      if word_count > 0 else 0
  )

  # Personal Pronouns
  personal_pronoun_count = _count_personal_pronouns(text)

  # Average word length
  average_word_length = sum(len(word) for word in words) / word_count if word_count > 0 else 0

  return {
  'POSITIVE SCORE':positive_score,
  'NEGATIVE SCORE':negative_score,
  'POLARITY SCORE':polarity_score,
  'SUBJECTIVITY SCORE':subjectivity_score,
  'AVG SENTENCE LENGTH':average_sentence_length,
  'PERCENTAGE OF COMPLEX WORDS':perc_complex_words,
  'FOG INDEX':fog_index,
  'AVG NUMBER OF WORDS PER SENTENCE':average_sentence_length,
  'COMPLEX WORD COUNT':complex_word_count,
  'WORD COUNT':word_count,
  'SYLLABLE PER WORD':syllables_per_word,
  'PERSONAL PRONOUNS':personal_pronoun_count,
  'AVG WORD LENGTH':average_word_length
  }


def _count_personal_pronouns(text):
  """Count personal pronouns in ``text``, skipping the country name "US".

  The pronouns I, we, my, mine, ours and us are matched as whole words in any
  case. The all-capitals token ``US`` is taken to be the country abbreviation
  and is not counted; ``us`` and ``Us`` still count as pronouns.
  """
  pronoun_pattern = re.compile(r'\b(?:I|we|my|mine|ours|us)\b', flags=re.IGNORECASE)
  # Only the exact upper-case spelling is excluded. A case-insensitive
  # exclusion would discard every occurrence of the pronoun "us" as well.
  return sum(1 for match in pronoun_pattern.finditer(text) if match.group() != 'US')



In [None]:
def main(input_path='Input.xlsx', output_path='Saurabh_Gadhave Output Data Structure.xlsx'):
  """Scrape, save and analyse every URL listed in the input workbook.

  For each row the article is downloaded, written to ``<URL_ID>.text`` and
  scored with ``analysis``. The input columns plus the computed metrics are
  written to a single Excel file once all rows have been processed.

  Args:
    input_path: Workbook with at least the columns ``URL_ID`` and ``URL``.
    output_path: Destination workbook for the results.

  A row whose page cannot be fetched or parsed is reported on stdout and kept
  in the output with empty metric cells, so one bad URL does not abort the
  whole batch and the output stays aligned with the input.
  """
  # Read URLs from the input workbook
  df = pd.read_excel(input_path)

  # Collect plain dicts and build the frame once at the end:
  # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
  records = []

  for _, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    record = row.to_dict()

    # Extract text from the URL
    try:
      title, article_text = text_from_URL(url)
    except (requests.RequestException, AttributeError, ValueError) as error:
      # Network/HTTP failures, or a page without the expected title/article
      # container (surfaces as AttributeError or ValueError).
      print(f"Skipping analysis for {url_id} ({url}): {error}")
      records.append(record)
      continue

    # Save to text file
    text_files(url_id, title, article_text)

    # Analysis
    record.update(analysis(article_text))
    records.append(record)

  # Final Output: keep the input columns even when there were no rows at all.
  output = pd.DataFrame(records) if records else pd.DataFrame(columns=df.columns)
  output.to_excel(output_path, index = False)

# Entry point: run the full pipeline when this cell/script is executed directly
# (i.e. when the module name is "__main__") rather than imported.
if __name__ == "__main__":
    main()

  output = output.append({**row, **analysed_text}, ignore_index = True)
  output = output.append({**row, **analysed_text}, ignore_index = True)
  output = output.append({**row, **analysed_text}, ignore_index = True)
  output = output.append({**row, **analysed_text}, ignore_index = True)
  output = output.append({**row, **analysed_text}, ignore_index = True)
  output = output.append({**row, **analysed_text}, ignore_index = True)
  output = output.append({**row, **analysed_text}, ignore_index = True)
  output = output.append({**row, **analysed_text}, ignore_index = True)
  output = output.append({**row, **analysed_text}, ignore_index = True)
  output = output.append({**row, **analysed_text}, ignore_index = True)
  output = output.append({**row, **analysed_text}, ignore_index = True)
  output = output.append({**row, **analysed_text}, ignore_index = True)
  output = output.append({**row, **analysed_text}, ignore_index = True)
  output = output.append({**row, **analysed_text}, ignore_index 