<a href="https://colab.research.google.com/github/shrideep-tamboli/Text-Analysis/blob/main/Blackcoffer_Text_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Web Scraping and Text Analysis

# 1. Setup

In [1]:
# Make Google Drive visible to the notebook; every input path used by the
# later cells lives under this mount point.
from google.colab import drive

gdrive_mount_point = '/content/gdrive'
drive.mount(gdrive_mount_point)

Mounted at /content/gdrive


* Loading the `Input.xlsx` workbook that lists all the URLs

In [2]:
import openpyxl

# Workbook on the mounted drive that holds the URL list
url_file_path = '/content/gdrive/MyDrive/Blackcoffer/Input.xlsx'
url_input = openpyxl.load_workbook(filename=url_file_path)

# Rows are read from whichever worksheet is active in the workbook
sheet = url_input.active

* Loading the stop-word lists (bag of words)

In [3]:
import os

# Folder on the mounted drive that contains the stop-word lists
stopwords_folder = '/content/gdrive/MyDrive/Blackcoffer/StopWords'

# Only the '.txt' entries of that folder are treated as stop-word files
stopwords_files = []
for entry in os.listdir(stopwords_folder):
    if entry.endswith('.txt'):
        stopwords_files.append(entry)

# Collect the lower-cased, whitespace-separated tokens of every file
# into one set
stopwords1 = set()
for stopword_file in stopwords_files:
    stopword_path = os.path.join(stopwords_folder, stopword_file)
    with open(stopword_path, 'r', encoding='latin-1') as handle:
        stopwords1 |= {token.lower() for token in handle.read().split()}




# 2. Web Scraping
# 3. Data Cleaning

In [4]:
import requests
from bs4 import BeautifulSoup
import nltk
import re
import os

# Download the NLTK resources: 'punkt' backs nltk.word_tokenize below
nltk.download('punkt')
nltk.download('stopwords')

# Seconds to wait for a server before giving up on a URL, so that one
# unresponsive site cannot hang the whole run
REQUEST_TIMEOUT = 30

# Boilerplate text that is removed from every scraped page.
# These strings never change, so they are defined once, outside the loop.
unwanted_text1 = "We provide intelligence, accelerate innovation and implement technology with extraordinary breadth and depth global insights into the big data,data-driven dashboards, applications development, and information management for organizations through combining unique, specialist services and high-level human expertise. Contact us: hello@blackcoffer.com"
unwanted_text2 = "© All Right Reserved, Blackcoffer(OPC) Pvt. Ltd"
unwanted_text3 = "Ranking customer behaviours for business strategy Algorithmic trading for multiple commodities markets, like Forex, Metals, Energy, etc. Trading Bot for FOREX Python model for the analysis of sector-specific stock ETFs for investment purposes AutoGPT Setup Playstore & Appstore to Google Analytics (GA) or Firebase to Google Data Studio Mobile App KPI Dashboard Google Local Service Ads LSA API To Google BigQuery to Google Data Studio AI Conversational Bot using RASA Rise of telemedicine and its Impact on Livelihood by 2040 Rise of e-health and its impact on humans by the year 2030 Rise of e-health and its impact on humans by the year 2030 Rise of telemedicine and its Impact on Livelihood by 2040 AI/ML and Predictive Modeling Solution for Contact Centre Problems How to Setup Custom Domain for Google App Engine Application? Code Review Checklist"

# Iterate over each row in the sheet to get url and url_id
# (min_row=2 skips the first row of the sheet)
for row in sheet.iter_rows(min_row=2, values_only=True):
    url_id = row[0]
    url = row[1]

    # A row without a URL has nothing to fetch
    if not url:
        continue

    # Send a GET request to the URL. Network failures and HTTP error
    # statuses skip this article instead of aborting the whole loop or
    # saving an error page as if it were the article.
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.exceptions.RequestException as error:
        print(f'Skipping article {url_id}: {error}')
        continue
    URL_content = response.text

    #-----------------------------
    #--------Web Scraping---------
    #-----------------------------

    # Create a BeautifulSoup object to parse the HTML
    soup = BeautifulSoup(URL_content, 'html.parser')

    # The first <h1> is taken as the title, all <p> elements as the body
    title_element = soup.find('h1')
    text_elements = soup.find_all('p')

    # Collapse whitespace in the title so it occupies exactly one line:
    # the analysis cell reads the first line of the file as the title.
    title = ' '.join(title_element.get_text().split()) if title_element else ''
    text = ' '.join(element.get_text() for element in text_elements)

    # Remove unwanted text
    text = text.replace(unwanted_text1, "")
    text = text.replace(unwanted_text2, "")
    text = text.replace(unwanted_text3, "")

    # Tokenize the lower-cased text
    tokenized_text = nltk.word_tokenize(text.lower())

    # Preprocessing: strip non-word characters, then digits, from each token
    filtered_text = [re.sub(r'[^\w\s]', '', word) for word in tokenized_text]
    filtered_text = [re.sub(r'\d+', '', word) for word in filtered_text]

    #-----------------------------------
    #-----------Data Cleaning-----------
    #-----------------------------------

    # Remove stop words, and drop the empty strings left behind by tokens
    # that consisted only of punctuation or digits
    filtered_text = [word for word in filtered_text
                     if word and word.lower() not in stopwords1]

    # Join the filtered text back into a string
    cleaned_text = ' '.join(filtered_text)

    # Save the extracted article in a text file: title, blank line, body
    file_name = f'{url_id}.txt'
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(f'{title}\n\n{cleaned_text}')

    print(f'Saved article {url_id} to {file_name}')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Saved article 123.0 to 123.0.txt
Saved article 321.0 to 321.0.txt
Saved article 2345.0 to 2345.0.txt
Saved article 4321.0 to 4321.0.txt
Saved article 432.0 to 432.0.txt
Saved article 2893.8 to 2893.8.txt
Saved article 3355.6 to 3355.6.txt
Saved article 3817.4 to 3817.4.txt
Saved article 4279.2 to 4279.2.txt
Saved article 4741.0 to 4741.0.txt
Saved article 5202.8 to 5202.8.txt
Saved article 5664.6 to 5664.6.txt
Saved article 6126.4 to 6126.4.txt
Saved article 6588.2 to 6588.2.txt
Saved article 7050.0 to 7050.0.txt
Saved article 7511.8 to 7511.8.txt
Saved article 7973.6 to 7973.6.txt
Saved article 8435.4 to 8435.4.txt
Saved article 8897.2 to 8897.2.txt
Saved article 9359.0 to 9359.0.txt
Saved article 9820.8 to 9820.8.txt
Saved article 10282.6 to 10282.6.txt
Saved article 10744.4 to 10744.4.txt
Saved article 11206.2 to 11206.2.txt
Saved article 11668.0 to 11668.0.txt
Saved article 12129.8 to 12129.8.txt
Saved article 12591.6 to 12591.6.txt
Saved article 13053.4 to 13053.4.txt
Saved articl

In [5]:
pip install pyphen

Collecting pyphen
  Downloading pyphen-0.14.0-py3-none-any.whl (2.0 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/2.0 MB[0m [31m6.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m1.9/2.0 MB[0m [31m27.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen
Successfully installed pyphen-0.14.0


# 4. Text Analysis

In [6]:
import pandas as pd
import glob
import re
import nltk
import pyphen

positive_dict_path = '/content/gdrive/MyDrive/Blackcoffer/MasterDictionary/positive-words.txt'
negative_dict_path = '/content/gdrive/MyDrive/Blackcoffer/MasterDictionary/negative-words.txt'


def _load_word_set(path):
    """Read a one-word-per-line dictionary file into a set.

    Each line is stripped and lower-cased; blank lines are skipped so the
    empty string never ends up in the set.

    Args:
        path: Location of the dictionary file (read as latin-1).

    Returns:
        A set with the lower-cased, non-blank lines of the file.
    """
    with open(path, 'r', encoding='latin-1') as handle:
        return {line.strip().lower() for line in handle if line.strip()}


# Sentiment dictionaries used for the positive / negative scores
positive_words = _load_word_set(positive_dict_path)
negative_words = _load_word_set(negative_dict_path)

# Create a Pyphen instance for syllable counting
dic = pyphen.Pyphen(lang='en')

# One tuple of metrics per analysed article is appended here
results = []

# Get the list of saved .txt files in the current working directory
txt_files = glob.glob('*.txt')

# Function to count syllables in a word
def count_syllables(word):
    """Count the hyphenation segments pyphen finds in ``word``.

    Uses the module-level ``dic`` hyphenator instead of constructing a new
    ``pyphen.Pyphen`` for every word.

    Args:
        word: The word to analyse.

    Returns:
        The number of '-'-separated pieces in ``dic.inserted(word)``;
        always at least 1, even for an empty string.
    """
    # Number of pieces == number of separators + 1
    return dic.inserted(word).count('-') + 1

# Regular expression pattern to match personal pronouns.
# NOTE(review): with re.IGNORECASE this also matches the upper-case "US";
# confirm whether that is intended.
personal_pronoun_pattern = re.compile(r'\b(I|we|my|ours|us)\b', re.IGNORECASE)

# A word counts as complex when it has at least this many syllables.
# NOTE(review): confirm the threshold against the metric definition in use.
syllable_threshold = 2

# Iterate through the saved .txt files
for file_name in txt_files:
    # Strip only the '.txt' extension so fractional ids such as '2893.8'
    # are not truncated; files whose name is not numeric are not articles.
    id_text = file_name.rsplit('.', 1)[0]
    try:
        id_value = float(id_text)
    except ValueError:
        print(f'Skipping {file_name}: file name is not a numeric URL id')
        continue
    url_id = int(id_value) if id_value.is_integer() else id_value

    with open(file_name, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # First line is the title, everything after it is the article text
    title = lines[0].strip() if lines else ''
    text = ' '.join(lines[1:])

    # Preprocessing: remove punctuation and digits
    cleaned_text = re.sub(r'[^\w\s]', '', text)
    cleaned_text = re.sub(r'\d+', '', cleaned_text)
    tokens = cleaned_text.split()
    token_count = len(tokens)

    # 1. Positive score: number of tokens found in the positive dictionary
    positive_score = sum(1 for word in tokens if word.lower() in positive_words)

    # 2. Negative score: number of tokens found in the negative dictionary
    #    (reported as a positive number)
    negative_score = sum(1 for word in tokens if word.lower() in negative_words)

    # 3. Polarity score (the small constant avoids division by zero)
    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)

    # 4. Subjectivity score
    subjective_score = (positive_score + negative_score) / (token_count + 0.000001)

    # 5. Average sentence length.
    # Sentences are split on the text *before* punctuation removal, because
    # the tokenizer needs the punctuation to find sentence boundaries.
    # NOTE(review): the scraping cell already strips punctuation before
    # saving, so files it produced will still yield a single sentence.
    sentences = nltk.sent_tokenize(text)
    total_sentences = len(sentences)
    avg_sentence_length = token_count / total_sentences if total_sentences else 0

    # 6. Percentage of complex words
    words = nltk.word_tokenize(cleaned_text)
    complex_word_count = 0
    total_syllables = 0

    for word in words:
        syllables = count_syllables(word)

        # Words ending in "es" or "ed" count one syllable less
        if word.endswith('es') or word.endswith('ed'):
            syllables -= 1

        if syllables >= syllable_threshold:
            complex_word_count += 1

        total_syllables += syllables

    POCW = 100 * complex_word_count / token_count if token_count else 0

    # 7. Fog index
    fog_index = 0.4 * (avg_sentence_length + POCW)

    # 8. Average number of words per sentence
    AWPS = len(words) / (total_sentences or 1)  # Handle zero division

    # 9. Complex word count is complex_word_count, computed above

    # 10. Word count
    num_words = len(words)

    # 11. Syllables per word
    scpw = total_syllables / num_words if num_words else 0

    # 12. Personal pronouns, counted on the text before regex cleaning
    personal_pronoun_count = len(personal_pronoun_pattern.findall(text))

    # 13. Average word length
    total_characters = sum(len(word) for word in words)
    average_word_length = total_characters / num_words if num_words > 0 else 0

    # Append the result to the results list
    results.append((url_id, title, positive_score, negative_score, polarity_score, subjective_score,
                    avg_sentence_length, POCW, fog_index, AWPS, complex_word_count, num_words, scpw,
                    average_word_length, personal_pronoun_count))

# Create a DataFrame from the results list.
# NOTE(review): the 'URL' column is filled with the article title, not the
# URL; the header is kept unchanged for compatibility.
# 'Personal Pronouns' is appended last so existing column positions stay put.
df = pd.DataFrame(results, columns=['URL_ID', 'URL', 'Positive Score', 'Negative Score', 'Polarity Score',
                                    'Subjectivity Score', 'Avg Sentence Length', 'Percentage of complex words',
                                    'Fog Index', "Average Number of Words per Sentence", 'Complex word count', 'Word Count', 'Syllable per word',
                                    'Avg Word Length', 'Personal Pronouns'])

# glob returns files in arbitrary order; sort so the output is reproducible
df = df.sort_values('URL_ID').reset_index(drop=True)

# Save the DataFrame to an XLSX file
output_file = 'OUTPUT.xlsx'
df.to_excel(output_file, index=False)

print(f'Scores saved to {output_file}')


Scores saved to OUTPUT.xlsx
