# Final Project BlackCoffer: Data Extraction and NLP
## (The objective of this assignment is to extract textual data articles from the given URL and perform text analysis to compute variables)

## Candidate Name: Mayur Kumar Sharma


## 1. Importing Dependencies

In [1]:
import os
import pandas as pd
import openpyxl
import requests
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from nltk.corpus import cmudict
import textstat


## 2. Reading the input file

In [2]:
workbook = pd.read_excel('input1.xlsx')
workbook


Unnamed: 0,URL_ID,URL
0,37,https://insights.blackcoffer.com/ai-in-healthc...
1,38,https://insights.blackcoffer.com/what-if-the-c...


### Note: due to bad weather at my location, the internet speed was slow, hence created small input file as "input1", & performed the necessary actions on it.

In [3]:
# Accessing the elements
url_id = []
url = []

for i in range(2):
    item1 = workbook['URL_ID'][i]
    item2 = workbook['URL'][i]
    url_id.append(item1)
    url.append(item2)

In [4]:
url_id

[37, 38]

In [5]:
url

['https://insights.blackcoffer.com/ai-in-healthcare-to-improve-patient-outcomes/',
 'https://insights.blackcoffer.com/what-if-the-creation-is-taking-over-the-creator/']

## 3. Code

In [6]:
# Create the output directory if it doesn't exist
if not os.path.exists('BlackOutput'):
    os.makedirs('BlackOutput')
    
pps = []
nps = []
pol = []
sub = []
asl = []
pcw = []
fog = []
words_per_sent = []
words_d = []
spw = []
pro = []
word_len = []

# Iterate over the links
for i, link in enumerate(url):

    # Fetch the HTML content of the page
    response = requests.get(link)
    html_content = response.content

    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract the text from the HTML using the get_text() method
    text = soup.get_text()

    # Save the extracted text to a file
    with open(f'BlackOutput/{url_id[i]}.txt', 'w') as f:
        f.write(text)
    
    
    
    # Opening the file as raw_doc
    f = open(f'BlackOutput/{url_id[i]}.txt', 'r', errors='ignore')
    raw_doc = f.read()
    raw_doc = raw_doc.lower()
    
    
    # Performing the tokenization 
    sentence_tokens = nltk.sent_tokenize(raw_doc)
    word_tokens = nltk.word_tokenize(raw_doc)
    
    
    # Updating the STOP-WORD list
    stop_words = set(stopwords.words("english"))
    for each in [',', '.', '{', '}', '[', ']', '(', ')', '/', '\n', '-','''"''', '&','for', '?', '!', '|', '&']:
        stop_words.add(each)
    
    
    # Removal of stop_words 
    list_of_words = word_tokenize(raw_doc)
    filtered_list = []   #We will collect the filtered words here 
    for word in list_of_words:
        if word.casefold() not in stop_words:
            filtered_list.append(word.lower())
            
    text_for_polarity = ' '.join(filtered_list)
    
    
    # Defining instance of Sentiment Analyzer
    sia = SentimentIntensityAnalyzer()
    polarity = sia.polarity_scores(text_for_polarity)
    
################################################################################################################################

    # Calculating the required variables
    
################################################################################################################################    

    #1. 
    positive_polarity_score = polarity['pos']
    
    #2.
    negative_polarity_score = polarity['neg']
    
    #3 & #4.
    # Create a TextBlob object
    blob = TextBlob(text)
    # Calculate the polarity score
    polarity_score = blob.sentiment.polarity   
    # Calculate the subjectivity score
    subjectivity_score = blob.sentiment.subjectivity
    
    #5 & #6. Average Sentance Length & Total Words Count ###########################################
    # Split the content into sentences using a sentence tokenizer
    sentences = sent_tokenize(text)
    # Calculate the total number of sentences
    total_sentences = len(sentences)    
    # Calculate the total number of words in the text
    words = word_tokenize(text)
    total_words = len(words)    
    # Calculate the average sentence length
    avg_sentence_length = total_words / total_sentences
    
    #7. Percent Complex Words #################################################################
    # Tokenize the text into words
    # words = nltk.word_tokenize(text)
    # Load the CMU Pronouncing Dictionary for counting syllables
    cmud = cmudict.dict()    
    # Count the number of complex words (words with three or more syllables)
    num_complex_words = sum([1 for word in words if len(cmud.get(word.lower(), [])) >= 3])
    # Calculate the total number of words
    # num_words = len(words)
    # Calculate the percentage of complex words
    percentage_complex_words = (num_complex_words / total_words) * 100
    
    #8. Fog Index #############################################################################
    fog_index = textstat.gunning_fog(text)
    
    
    #9. Syllables Per Word ####################################################################
    #split by whitespace
    words_split = text.split()  
    
    # load cmudict
    d = cmudict.dict()
    # function to count syllables in a word
    def count_syllables(word):
        # remove non-alphabetic characters from word
        word = ''.join(filter(str.isalpha, word.lower()))
        # check if word is in cmudict
        if word in d:
            # count the number of syllables in word
            return len(list(filter(lambda s: s[-1].isdigit(), d[word][0])))
        else:
            # if word is not in cmudict, return 1
            return 1
    # tokenize text into words
    # words = nltk.word_tokenize(text)
    # count syllables and words
    syllables = sum([count_syllables(w) for w in words])
    # words_count = len(words)
    # calculate syllables per word
    syllables_per_word = syllables / total_words
    
    #10. Pronouns ############################################################################
    # define personal pronouns
    personal_pronouns = ['I', 'me', 'mine', 'myself', 'we', 'us', 'ours', 'ourselves']
    # count the number of personal pronouns
    count = 0
    for word in words:
        if word.lower() in personal_pronouns:
            count += 1
    counts = count
    
    #11. Average Word Length
    # words = text.split()
    total_chars = sum(len(word) for word in words_split)
    avg_word_length = total_chars / len(words)

    ##########################################################################################################################
    
    # Appending into list variables
    
    ##########################################################################################################################
    
    pps.append(positive_polarity_score)
    nps.append(negative_polarity_score)
    pol.append(polarity_score)
    sub.append(subjectivity_score)
    
    asl.append(avg_sentence_length)
    pcw.append(percentage_complex_words)
    fog.append(fog_index)
    words_per_sent.append(avg_sentence_length)
    
    words_d.append(total_words)
    spw.append(syllables_per_word)
    pro.append(counts)
    word_len.append(avg_word_length)
    


## 4. Creating Final Dataframe

In [10]:
# Adding respective coulmns and feeding the respective variables for the given input file

workbook['Positive Score'] = pps
workbook['Negative Score'] = nps
workbook['Polarity Score'] = pol
workbook['Subjectivity Score'] = sub
workbook['Average Sentence Length'] = asl
workbook['Percent of Complex Words'] = pcw
workbook['Fog Index'] = fog
workbook['Words per Sentence'] = words_per_sent
workbook['Words Count'] = words_d
workbook['Syllable Per Word'] = spw
workbook['Personal Pronouns'] = pro
workbook['Average Word Length'] = word_len


workbook

Unnamed: 0,URL_ID,URL,Positive Score,Negative Score,Polarity Score,Subjectivity Score,Average Sentence Length,Percent of Complex Words,Fog Index,Words per Sentence,Words Count,Syllable Per Word,Personal Pronouns,Average Word Length
0,37,https://insights.blackcoffer.com/ai-in-healthc...,0.176,0.038,0.145101,0.44049,36.070588,8.447489,16.9,36.070588,3066,1.704501,27,5.30561
1,38,https://insights.blackcoffer.com/what-if-the-c...,0.198,0.058,0.108266,0.409129,28.414894,10.146013,13.52,28.414894,2671,1.558592,31,4.788469


## 5. Exporting the output file

In [12]:
file_path = 'C:/Users/Mayur/Desktop/BlackCofferProject/output.xlsx'
workbook.to_excel(file_path, index=False)

# Thank You

### (Mayur Kumar Sharma)