# Data Extraction and Text Analysis for Websites

## Objective:

The objective of this assignment is to extract the article text from each of the given URLs and perform text analysis to compute the variables explained below.


In [None]:
!pip install pyphen

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# imports

import re
import os
import nltk
import shutil
import requests
import pyphen
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Paths to the project inputs.
# NOTE(review): the "/content/drive/MyDrive" prefix presumably refers to a
# mounted Google Drive in Colab - confirm before running elsewhere.

# Excel workbook listing the articles to process.
input_data_path = "/content/drive/MyDrive/intership project/Input.xlsx"
# Directory of stop-word list files (listed with os.listdir further down).
stopwords_path = "/content/drive/MyDrive/intership project/StopWords"
# Directory holding the positive/negative word lists.
masterdict_path = "/content/drive/MyDrive/intership project/MasterDictionary"

## Step 1: Data Extraction

Input.xlsx
* For each of the articles, given in the input.xlsx file, extract the article text and save the extracted article in a text file with URL_ID as its file name.
* While extracting text, please make sure your program extracts only the article title and the article text. It should not extract the website header, footer, or anything other than the article text.


In [None]:
# File names (not full paths) found in the stop-word and master-dictionary
# directories; os.listdir returns them in arbitrary order.
stopwords_files = os.listdir(stopwords_path)
masterdict_files = os.listdir(masterdict_path)

In [None]:
# Load the workbook of articles; the rest of the notebook reads its
# "URL_ID" and "URL" columns.
input_data = pd.read_excel(input_data_path)
# Preview the first rows (notebook display only).
input_data.head()

Unnamed: 0,URL_ID,URL
0,37,https://insights.blackcoffer.com/ai-in-healthc...
1,38,https://insights.blackcoffer.com/what-if-the-c...
2,39,https://insights.blackcoffer.com/what-jobs-wil...
3,40,https://insights.blackcoffer.com/will-machine-...
4,41,https://insights.blackcoffer.com/will-ai-repla...


In [None]:
class DataExtraction:
  """Download the articles listed in a dataframe and keep their text.

  The dataframe must provide ``URL_ID`` and ``URL`` columns. Each article
  is written to ``<dir_path>/<URL_ID>.txt`` and also kept in memory in
  ``self.all_files`` (one string per row, in row order).
  """

  def __init__(self, dataframe):
    """Store the dataframe, its length and an empty list of texts."""
    self.dataframe = dataframe
    self.dataframe_length = len(dataframe)
    self.all_files = []

  def create_dir(self, dir_path):
    """Create ``dir_path`` (and parents); a no-op when it already exists."""
    os.makedirs(dir_path, exist_ok=True)

  def delete_dir(self, dir_path):
    """Recursively delete ``dir_path`` and everything inside it."""
    shutil.rmtree(dir_path)

  def converttostring(self, paragraphs):
    """Join the text of every paragraph with single spaces.

    ``paragraphs`` is an iterable of objects exposing ``get_text()``
    (BeautifulSoup tags in this notebook).
    """
    # Splitting each paragraph on " " and re-joining with " " reproduces
    # the paragraph unchanged, so a plain join gives the same result.
    return " ".join(para.get_text() for para in paragraphs)

  def extract_data_to_files(self, dir_path):
    """Fetch every URL of the dataframe and save its text under ``dir_path``.

    Calling it again starts from scratch: ``self.all_files`` is reset and
    each text file is overwritten, so texts stay aligned with the
    dataframe rows instead of being duplicated.
    """
    self.create_dir(dir_path=dir_path)
    self.all_files = []

    for index in range(self.dataframe_length):
      # Read from the instance's dataframe, not from a notebook global.
      row = self.dataframe.iloc[index]
      url_id, url = row["URL_ID"], row["URL"]

      # A timeout keeps one stalled server from hanging the whole run.
      response = requests.get(url, timeout=30)
      soup = BeautifulSoup(response.content, "html.parser")

      # Drop the last three space-separated tokens of the <title>
      # (presumably the site-name suffix - confirm against the pages).
      title_tag = soup.title
      raw_title = title_tag.string if title_tag is not None else None
      title = " ".join(raw_title.split(" ")[:-3]) if raw_title else ""

      # NOTE(review): the callable filter appears to keep only <p> tags
      # without a class attribute - confirm against the bs4 docs.
      paragraphs = soup.findAll("p", attrs=lambda attrs: not attrs)

      fulltext = title + ". " + self.converttostring(paragraphs)
      self.all_files.append(fulltext)

      # "w" (not "a") so a re-run replaces the file instead of appending.
      file_path = os.path.join(dir_path, f"{url_id}.txt")
      with open(file_path, "w", encoding="utf-8") as file:
        file.write(fulltext)

  def retrieve_files(self):
    """Collapse whitespace runs in every stored text and return the list.

    The normalised texts replace ``self.all_files``.
    """
    self.all_files = [" ".join(text.split()) for text in self.all_files]
    return self.all_files


In [None]:
project = DataExtraction(dataframe = input_data)

In [None]:
# project.delete_dir("DataExtracted_textfiles")

In [None]:
project.extract_data_to_files("DataExtracted_textfiles")

In [None]:
url = "https://insights.blackcoffer.com/ai-in-healthcare-to-improve-patient-outcomes/"

In [None]:
# excludes plus/minus symbol
def lower_and_strip_punctuation(text):
  """Lowercase ``text`` and remove punctuation.

  Every character that is not a word character, whitespace, ``+`` or ``-``
  is deleted; nothing is inserted in its place.
  """
  lowered = text.lower()
  return re.sub(r'[^\w\s+-]', '', lowered)

In [None]:
def read_file_to_list(file_path):
  """Read a text file and return its lines, cleaned.

  The file is decoded as ISO-8859-1. Each line is stripped of surrounding
  whitespace and passed through ``lower_and_strip_punctuation``; one list
  entry is produced per line of the file.
  """
  cleaned_lines = []
  with open(file_path, "r",encoding="ISO-8859-1") as handle:
    for raw_line in handle.readlines():
      cleaned_lines.append(lower_and_strip_punctuation(raw_line.strip()))
  return cleaned_lines

In [None]:
# creating stopwords
custom_stopwords = []
def create_stopwords(stop_words_files):
  """Append the cleaned lines of every stop-word file to ``custom_stopwords``.

  ``stop_words_files`` holds file names relative to ``stopwords_path``.
  The module-level ``custom_stopwords`` list is extended in place and
  nothing is returned.
  """
  for file in stop_words_files:
    # Resolve against stopwords_path (the directory the names were listed
    # from) rather than a second hard-coded copy of that path.
    file_path = os.path.join(stopwords_path, file)
    custom_stopwords.extend(read_file_to_list(file_path))
create_stopwords(stopwords_files)

In [None]:
# creating positive and negative words
# Load the sentiment word lists from the master-dictionary directory
# (masterdict_path) instead of repeating the hard-coded location.
positive_words = read_file_to_list(os.path.join(masterdict_path, "positive-words.txt"))
negative_words = read_file_to_list(os.path.join(masterdict_path, "negative-words.txt"))

# Keep only the words that are not stop words. The stop words are copied
# into a set once so each membership test does not scan the whole list.
_stopword_lookup = set(custom_stopwords)
positive_words = [word for word in positive_words if word not in _stopword_lookup]
negative_words = [word for word in negative_words if word not in _stopword_lookup]

In [None]:
# tokenize function
# Turn the stop-word list into a set for fast membership tests.
custom_stopwords = set(custom_stopwords)
def filterwords(text, stopwords = custom_stopwords):
  """Return the tokens of ``text`` that are not stop words.

  ``text`` is lowercased and stripped of punctuation, then tokenised with
  ``word_tokenize``. ``stopwords`` defaults to the set built just above
  (bound when this function is defined).
  """
  tokens = word_tokenize(lower_and_strip_punctuation(text))
  return [token for token in tokens if token not in stopwords]

In [None]:
# to retrieve text files
all_files = project.retrieve_files()

In [None]:
# sentiment analysis
# One entry per document, in the same order as all_files.
positive_score = []
negative_score = []
polarity_score = []
subjective_score = []
total_cleaned_words = []

# Build lookup sets once: testing every token against the word lists
# would otherwise scan each list from the start for every token.
_positive_lookup = set(positive_words)
_negative_lookup = set(negative_words)

for text in all_files:

  positive_count = 0
  negative_count = 0

  filteredwords = filterwords(text)
  total_words_after_cleaning = len(filteredwords)

  for word in filteredwords:
    # A word present in both lists is counted as positive only.
    if word in _positive_lookup:
      positive_count += 1
    elif word in _negative_lookup:
      negative_count += 1

  positive_score.append(positive_count)
  negative_score.append(negative_count)
  total_cleaned_words.append(total_words_after_cleaning)
  # The 0.000001 term keeps both ratios defined when a denominator is 0.
  polarity_score.append((positive_count - negative_count) / ((positive_count + negative_count) + 0.000001))
  subjective_score.append((positive_count + negative_count) / ((total_words_after_cleaning) + 0.000001))

In [None]:
def average_sentence_length(text):
    """Return the mean number of words per sentence in ``text``.

    Sentences are the pieces obtained by splitting on ``'. '``; words are
    whitespace-separated tokens. An empty string yields ``0.0`` because
    the split still produces one (empty) piece.
    """
    chunks = text.split('. ')
    word_total = sum(len(chunk.split()) for chunk in chunks)
    return word_total / len(chunks)

In [None]:
def calculate_complex_word_percentage(text, min_syllables=2):
    """Measure how many words of ``text`` are "complex".

    Words are whitespace-separated tokens with trailing ``.,!?`` removed.
    Syllables are estimated as the number of pyphen hyphenation points
    plus one, computed after dropping a trailing ``es``/``ed``.

    Args:
        text: The text to analyse.
        min_syllables: A word is complex when it has at least this many
            estimated syllables. Defaults to 2, the original behaviour.
            NOTE(review): the Gunning fog definition of a complex word is
            three or more syllables - confirm which threshold is wanted.

    Returns:
        A tuple ``(ratio, complex_word_count, syllable_count_per_word)``.
        Despite the function name, ``ratio`` is a fraction between 0 and 1
        (complex words / all words), not a 0-100 percentage. Text without
        any words gives ``(0.0, 0, [])`` instead of dividing by zero.
    """
    words = text.split()
    if not words:
        return 0.0, 0, []

    dic = pyphen.Pyphen(lang='en')
    complex_word_count = 0
    syllable_count_per_word = []

    for word in words:
        word = word.rstrip('.,!?')
        base_word = word[:-2] if word.endswith(('es', 'ed')) else word

        syllables = dic.inserted(base_word).count('-') + 1
        syllable_count_per_word.append(syllables)

        # Tokens made only of stripped punctuation (empty ``word``) are
        # counted as complex too, exactly as before.
        if syllables >= min_syllables or not word:
            complex_word_count += 1

    complex_word_percentage = complex_word_count / len(words)

    return complex_word_percentage, complex_word_count, syllable_count_per_word

In [None]:
def fog_index(text):
  """Return 0.4 * (average sentence length + complex-word ratio) of ``text``.

  NOTE(review): the ratio from ``calculate_complex_word_percentage`` is a
  0-1 fraction, whereas the standard Gunning fog formula adds a 0-100
  percentage - confirm this scale is intended.
  """
  sentence_length = average_sentence_length(text)
  complex_ratio = calculate_complex_word_percentage(text)[0]
  return 0.4 * (sentence_length + complex_ratio)

In [None]:
def average_no_words_per_sentence(text):
    """Return the average count of words per sentence of ``text``.

    The text is cut into sentences at every ``'. '`` and each sentence is
    split into words on whitespace; the total word count is divided by
    the number of sentences.
    """
    words_per_sentence = [len(piece.split()) for piece in text.split('. ')]
    return sum(words_per_sentence) / len(words_per_sentence)

In [None]:
def count_personal_pronouns(text):
    """Count the personal pronouns I, we, my, ours and us in ``text``.

    Matching is on whole words and ignores case, with one exception: the
    all-uppercase token ``US`` is skipped, because a case-insensitive
    match would otherwise count the country abbreviation as the pronoun
    "us".
    """
    pattern = r'\b(I|we|my|ours|us)\b'
    return sum(
        1
        for match in re.finditer(pattern, text, flags=re.IGNORECASE)
        if match.group() != "US"
    )

In [None]:
def calculate_average_word_length(text):
    """Return the mean number of characters per whitespace-separated word.

    Punctuation attached to a word counts towards its length. Text that
    contains no words returns ``0.0`` instead of raising
    ``ZeroDivisionError``.
    """
    words = text.split()
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)

In [None]:
# Column-name -> values mapping that becomes the output table.
# Columns filled here come from the input workbook and from the sentiment
# loop above; the empty lists are filled per document by textanalysis().
data_dict = {
    "URL_ID" : input_data["URL_ID"].tolist(),
    "URL" : input_data["URL"].tolist(),
    "POSITIVE SCORE" : positive_score,
    "NEGATIVE SCORE" : negative_score,
    "POLARITY SCORE" : polarity_score,
    "SUBJECTIVE SCORE" : subjective_score,
    "AVG SENTENCE LENGTH" : [],
    "PERCENTAGE OF COMPLEX WORDS" : [],
    "FOG INDEX" : [],
    "AVG NUMBER OF WORDS PER SENTENCE" : [],
    "COMPLEX WORD COUNT": [],
    # Number of tokens left after stop-word removal (total_cleaned_words).
    "WORD COUNT": total_cleaned_words,
    "SYLLABLE PER WORD": [],
    "PERSONAL PRONOUNS": [],
    "AVG WORD LENGTH": []
}

In [None]:
def textanalysis(files, data_dict):
  """Append the readability metrics of every text in ``files`` to ``data_dict``.

  ``data_dict`` must already hold a list under each metric column name;
  it is modified in place and also returned. One value per text is
  appended to each column, in the order of ``files``.
  """
  for document in files:
    # Metrics that map one function result onto one column.
    single_value_metrics = (
      ("AVG SENTENCE LENGTH", average_sentence_length),
      ("FOG INDEX", fog_index),
      ("AVG NUMBER OF WORDS PER SENTENCE", average_no_words_per_sentence),
      ("PERSONAL PRONOUNS", count_personal_pronouns),
      ("AVG WORD LENGTH", calculate_average_word_length),
    )
    for column, metric in single_value_metrics:
      data_dict[column].append(metric(document))

    # One call feeds three columns: ratio, count and per-word syllables.
    complex_ratio, complex_count, syllables = calculate_complex_word_percentage(document)
    data_dict["PERCENTAGE OF COMPLEX WORDS"].append(complex_ratio)
    data_dict["COMPLEX WORD COUNT"].append(complex_count)
    data_dict["SYLLABLE PER WORD"].append(syllables)
  return data_dict

In [None]:
output = textanalysis(files = all_files, data_dict = data_dict)

In [None]:
Output_Data_Structure = pd.DataFrame(output)

In [None]:
Output_Data_Structure.to_excel("Output_Data.xlsx", index = False)

In [None]:
Output_Data_Structure

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVE SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37,https://insights.blackcoffer.com/ai-in-healthc...,61,32,0.311828,0.097895,23.733333,0.432022,9.666142,23.733333,769,950,"[1, 1, 2, 1, 2, 2, 2, 4, 1, 2, 1, 1, 1, 2, 1, ...",1,5.796067
1,38,https://insights.blackcoffer.com/what-if-the-c...,58,38,0.208333,0.175824,20.735294,0.263121,8.399366,20.735294,371,546,"[1, 1, 1, 2, 1, 2, 1, 1, 3, 2, 1, 1, 4, 1, 2, ...",7,4.991489
2,39,https://insights.blackcoffer.com/what-jobs-wil...,64,37,0.267327,0.125155,19.929412,0.399646,8.131623,19.929412,677,807,"[1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 4, 1, 1, 2, 1, ...",3,5.521251
3,40,https://insights.blackcoffer.com/will-machine-...,59,27,0.372093,0.142149,19.487805,0.306008,7.917525,19.487805,489,605,"[1, 2, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, ...",18,4.910513
4,41,https://insights.blackcoffer.com/will-ai-repla...,56,24,0.400000,0.107817,23.583333,0.337456,9.568316,23.583333,573,742,"[1, 1, 2, 1, 1, 1, 1, 1, 2, 4, 1, 1, 1, 3, 1, ...",18,5.212603
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,146,https://insights.blackcoffer.com/blockchain-fo...,22,26,-0.083333,0.116788,17.431373,0.363330,7.117881,17.431373,323,411,"[2, 1, 2, 4, 1, 1, 3, 2, 1, 1, 1, 3, 1, 1, 1, ...",9,5.505062
110,147,https://insights.blackcoffer.com/the-future-of...,32,9,0.560976,0.080868,22.833333,0.346715,9.272019,22.833333,380,507,"[1, 2, 1, 3, 1, 3, 1, 1, 2, 1, 1, 2, 1, 1, 3, ...",2,5.305657
111,148,https://insights.blackcoffer.com/big-data-anal...,26,44,-0.257143,0.125673,16.820896,0.371783,6.877072,16.820896,419,557,"[1, 1, 3, 1, 2, 2, 1, 3, 2, 1, 1, 2, 1, 1, 3, ...",2,5.187223
112,149,https://insights.blackcoffer.com/business-anal...,27,3,0.800000,0.107143,21.730769,0.405310,8.854432,21.730769,229,280,"[2, 3, 1, 1, 2, 2, 3, 1, 1, 4, 3, 2, 1, 4, 1, ...",0,5.683186
