# blackcoffer-assignment

OBJECTIVE: To extract textual data articles from the given URL and perform text analysis to compute variables such as positivity, negativity, subjectivity, polarity, fog score, readability, etc.

In [3]:
!pip install numpy pandas openpyxl nltk textblob textstat --quiet

In [4]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import textstat

In [3]:
import nltk
import re
from textblob import TextBlob
from nltk.corpus import cmudict
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('cmudict')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package cmudict to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/cmudict.zip.


True

In [4]:
# Read the input file
df = pd.read_excel('Input.xlsx')
output = pd.read_excel('Output Data Structure.xlsx')

In [5]:
df

Unnamed: 0,URL_ID,URL
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...
1,38.0,https://insights.blackcoffer.com/what-if-the-c...
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...
3,40.0,https://insights.blackcoffer.com/will-machine-...
4,41.0,https://insights.blackcoffer.com/will-ai-repla...
...,...,...
109,146.0,https://insights.blackcoffer.com/blockchain-fo...
110,147.0,https://insights.blackcoffer.com/the-future-of...
111,148.0,https://insights.blackcoffer.com/big-data-anal...
112,149.0,https://insights.blackcoffer.com/business-anal...


In [6]:
# Get the CMU pronunciation dictionary
cmu_dict = cmudict.dict()

In [7]:
# Define the list of personal pronouns
personal_pronouns = ['i', 'me', 'my', 'mine', 'myself', 'we', 'us', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves']

In [8]:
def clean_para(para):
    para_to_remove_1 = "Ranking customer behaviours for business strategyAlgorithmic trading for multiple commodities markets, like Forex, Metals, Energy, etc.Trading Bot for FOREXPython model for the analysis of sector-specific stock ETFs for investment purposesPlaystore & Appstore to Google Analytics (GA) or Firebase to Google Data Studio Mobile App KPI DashboardGoogle Local Service Ads LSA API To Google BigQuery to Google Data StudioAI Conversational Bot using RASARecommendation System ArchitectureRise of telemedicine and its Impact on Livelihood by 2040Rise of e-health and its impact on humans by the year 2030Rise of e-health and its impact on humans by the year 2030Rise of telemedicine and its Impact on Livelihood by 2040AI/ML and Predictive ModelingSolution for Contact Centre ProblemsHow to Setup Custom Domain for Google App Engine Application?Code Review Checklist"
    para_to_remove_2 = "Contact us: hello@blackcoffer.com© All Right Reserved, Blackcoffer(OPC) Pvt. Ltd"
    new_para = para.replace(para_to_remove_1, "")
    new_para = new_para.replace(para_to_remove_2, "")
    return new_para

In [9]:
def preprocess_para(sen):
    '''Cleans text data up, leaving only 2 or more char long non-stepwords composed of A-Z & a-z only
    in lowercase'''
    
    sentence = sen.lower()

    # Remove RT
    sentence = re.sub('RT @\w+: '," ", sentence)

    # Remove special characters
    sentence = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ", sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)  # When we remove apostrophe from the word "Mark's", the apostrophe is replaced by an empty space. Hence, we are left with single character "s" that we are removing here.

    # Remove multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)  # Next, we remove all the single characters and replace it by a space which creates multiple spaces in our text. Finally, we remove the multiple spaces from our text as well.

    return sentence

In [10]:
# Function to find Positivity, Negativity, Polarity and Subjectivity in sentence
def pnps(paragraph):
    # Create a TextBlob object
    blob = TextBlob(paragraph)

    # Get the sentiment analysis
    sentiment = blob.sentiment

    # Print the results
    positivity = sentiment.polarity
    negativity = 1 - sentiment.polarity
    polarity = sentiment.subjectivity
    subjectivity = 1 - sentiment.subjectivity
    
    return positivity,negativity,polarity,subjectivity

In [11]:
# Define a function to count the number of syllables in a word
def count_syllables(word):
    if word.lower() not in cmu_dict:
        return 0
    return max([len([y for y in x if y[-1].isdigit()]) for x in cmu_dict[word.lower()]])

In [None]:
output['READABILITY'] = ''

In [12]:
for i in range(len(df)):
    # Define the URL of the article
    url = df.URL[i]
    # Send a GET request to the URL
    response = requests.get(url)
    # Parse the HTML content of the article
    soup = BeautifulSoup(response.content, "html.parser")
    
    # Find the text of the article
    article_text = ""
    for paragraph in soup.find_all("p"):
        article_text += paragraph.get_text()
        
    article_text = clean_para(article_text)
    
    # Find total number of sentences and words in article
    tot_sentences = len(nltk.sent_tokenize(article_text))
    tot_words = len(nltk.word_tokenize(article_text))
    
    # Calculate average sentence length
    output['AVG SENTENCE LENGTH'][i] = tot_words/tot_sentences
    output['AVG NUMBER OF WORDS PER SENTENCE'][i] = sum(len(nltk.word_tokenize(sentence)) for sentence in nltk.sent_tokenize(article_text)) / tot_sentences
    
    cleaned_text = ""
    for sen in nltk.sent_tokenize(article_text):
        sen = preprocess_para(sen)
        cleaned_text += sen
        
    # Calculate word count
    words = nltk.word_tokenize(cleaned_text)
    # Calculate the total length of all the words
    total_length = sum(len(word) for word in words)
    
    output['WORD COUNT'][i] = len(words)
    # Calculate the average length of a word
    output['AVG WORD LENGTH'][i] = total_length / len(words)
    
    # Get the set of stopwords
    stopwords = set(nltk.corpus.stopwords.words('english'))
    
    # Calculate the number of complex words
    complex_words = [word for word in words if len(word) > 2 and word not in stopwords]
    
    # Calculate the percentage of complex words
    percent_complex_words = (len(complex_words) / len(words)) * 100
    
    output['COMPLEX WORD COUNT'][i] = len(complex_words)
    output['PERCENTAGE OF COMPLEX WORDS'][i] = percent_complex_words
    
    # Calculate the Fog index
    output['FOG INDEX'][i] = 0.4 * (tot_words/tot_sentences + percent_complex_words)
    
    # Calculate the total number of syllables in the paragraph
    output['SYLLABLE PER WORD'][i] = (sum(count_syllables(word) for word in words))/len(words)
    
    # Find the personal pronouns in the paragraph
    pronouns_found = [word for word in words if word in personal_pronouns]
    output['PERSONAL PRONOUNS'][i] = len(pronouns_found)
    
    # Save the article text as .txt file with URL_ID as its name
    url_id = int(df.URL_ID[i])
    with open(f"{url_id}", "w", encoding="utf-8") as f:
        f.write(cleaned_text)
        
    pnps_data = pnps(cleaned_text)
    output['POSITIVE SCORE'][i] = pnps_data[0]
    output['NEGATIVE SCORE'][i] = pnps_data[1]
    output['POLARITY SCORE'][i] = pnps_data[2]
    output['SUBJECTIVITY SCORE'][i] = pnps_data[3]
    output['READABILTY'][i] = textstat.automated_readability_index(article_text)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output['AVG SENTENCE LENGTH'][i] = tot_words/tot_sentences
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output['AVG NUMBER OF WORDS PER SENTENCE'][i] = sum(len(nltk.word_tokenize(sentence)) for sentence in nltk.sent_tokenize(article_text)) / tot_sentences
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output['WORD COUNT'][i] = len(words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.o

In [13]:
output

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...,0.136403,0.863597,0.460545,0.539455,37.490566,62.969095,40.183864,37.490566,1141.0,1812.0,1.818985,31.0,5.673289
1,38.0,https://insights.blackcoffer.com/what-if-the-c...,0.077108,0.922892,0.430135,0.569865,24.294118,52.410901,30.682008,24.294118,750.0,1431.0,1.610762,70.0,4.888889
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...,0.113979,0.886021,0.487566,0.512434,28.358209,59.042553,34.960305,28.358209,999.0,1692.0,1.802009,40.0,5.511229
3,40.0,https://insights.blackcoffer.com/will-machine-...,0.141805,0.858195,0.493430,0.506570,23.560000,53.797866,30.943146,23.560000,857.0,1593.0,1.639046,63.0,4.929065
4,41.0,https://insights.blackcoffer.com/will-ai-repla...,0.060288,0.939712,0.504724,0.495276,29.059701,56.435644,34.198138,29.059701,969.0,1717.0,1.670355,71.0,5.117647
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,146.0,https://insights.blackcoffer.com/blockchain-fo...,0.035383,0.964617,0.371784,0.628216,26.315789,61.889251,35.282016,26.315789,570.0,921.0,1.674267,25.0,5.406080
110,147.0,https://insights.blackcoffer.com/the-future-of...,0.080112,0.919888,0.403132,0.596868,31.100000,59.678858,36.311543,31.100000,669.0,1121.0,1.672614,33.0,5.241748
111,148.0,https://insights.blackcoffer.com/big-data-anal...,0.047038,0.952962,0.419783,0.580217,21.114754,59.542656,32.262964,21.114754,677.0,1137.0,1.744943,23.0,5.185576
112,149.0,https://insights.blackcoffer.com/business-anal...,0.198648,0.801352,0.547654,0.452346,39.937500,58.952703,39.556081,39.937500,349.0,592.0,1.853041,19.0,5.699324


In [16]:
output.to_excel('Output Data Structure.xlsx', index=False)

In [15]:
output

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...,0.136403,0.863597,0.460545,0.539455,37.490566,62.969095,40.183864,37.490566,1141.0,1812.0,1.818985,31.0,5.673289
1,38.0,https://insights.blackcoffer.com/what-if-the-c...,0.077108,0.922892,0.430135,0.569865,24.294118,52.410901,30.682008,24.294118,750.0,1431.0,1.610762,70.0,4.888889
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...,0.113979,0.886021,0.487566,0.512434,28.358209,59.042553,34.960305,28.358209,999.0,1692.0,1.802009,40.0,5.511229
3,40.0,https://insights.blackcoffer.com/will-machine-...,0.141805,0.858195,0.493430,0.506570,23.560000,53.797866,30.943146,23.560000,857.0,1593.0,1.639046,63.0,4.929065
4,41.0,https://insights.blackcoffer.com/will-ai-repla...,0.060288,0.939712,0.504724,0.495276,29.059701,56.435644,34.198138,29.059701,969.0,1717.0,1.670355,71.0,5.117647
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,146.0,https://insights.blackcoffer.com/blockchain-fo...,0.035383,0.964617,0.371784,0.628216,26.315789,61.889251,35.282016,26.315789,570.0,921.0,1.674267,25.0,5.406080
110,147.0,https://insights.blackcoffer.com/the-future-of...,0.080112,0.919888,0.403132,0.596868,31.100000,59.678858,36.311543,31.100000,669.0,1121.0,1.672614,33.0,5.241748
111,148.0,https://insights.blackcoffer.com/big-data-anal...,0.047038,0.952962,0.419783,0.580217,21.114754,59.542656,32.262964,21.114754,677.0,1137.0,1.744943,23.0,5.185576
112,149.0,https://insights.blackcoffer.com/business-anal...,0.198648,0.801352,0.547654,0.452346,39.937500,58.952703,39.556081,39.937500,349.0,592.0,1.853041,19.0,5.699324
