<a href="https://colab.research.google.com/gist/vaidgaurav7/65e861186ebc0b51e9927f3c496e7d4b/blackcofferassignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Blackcoffer test assignment**
>  by Gaurav vaid

Notes:
- Upload Input.xlsx inside "*files*" (right bar 4 option)
- Connect/Reconnect (connect to a hosted runtime) (Left top option)
- Install below packages first (one time installation per session).
- Run Code in the end.
- Download Output Data Structure.xlsx from files area




In [1]:
!pip install pandas
!pip install beautifulsoup4
!pip install requests
!pip install textblob
!pip install nltk
!pip install syllables
!pip install textstat

Collecting syllables
  Downloading syllables-1.0.9-py3-none-any.whl (15 kB)
Collecting cmudict<2.0.0,>=1.0.11 (from syllables)
  Downloading cmudict-1.0.16-py3-none-any.whl (939 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m939.4/939.4 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting importlib-metadata<7.0,>=5.1 (from syllables)
  Downloading importlib_metadata-6.11.0-py3-none-any.whl (23 kB)
Installing collected packages: importlib-metadata, cmudict, syllables
  Attempting uninstall: importlib-metadata
    Found existing installation: importlib-metadata 7.0.1
    Uninstalling importlib-metadata-7.0.1:
      Successfully uninstalled importlib-metadata-7.0.1
Successfully installed cmudict-1.0.16 importlib-metadata-6.11.0 syllables-1.0.9
Collecting textstat
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyph

In [2]:
# Code to hit now
# explain & documented by me as much as possible

import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.tag import pos_tag


nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Function to clean and preprocess text
def clean_text(text):
    clean_html = BeautifulSoup(text, 'html.parser').get_text()
    clean_text = re.sub("[^a-zA-Z]", " ", clean_html)
    words = word_tokenize(clean_text.lower())

    stop_words = set(stopwords.words("english"))
    words = [word for word in words if word not in stop_words]

    return words

# Function to calculate complex words
def get_complex_words(text):
    words = clean_text(text)

    tagged_words = pos_tag(words)
    complex_words = [word for word, tag in tagged_words if tag in ['NN', 'VB', 'RB', 'JJ']]

    return complex_words

# Function to calculate FOG index
def calculate_fog_index(text):
    words = clean_text(text)
    complex_words = get_complex_words(text)

    fog_index = 0.4 * ((len(words) / len(sent_tokenize(text))) + 100 * (len(complex_words) / len(words)))

    return fog_index

# Function to calculate syllables in a word
def syllable_count(word):
    word = word.lower()
    vowels = "aeiou"
    count = 0
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1
    if count == 0:
        count += 1
    return count

# Function to calculate syllables per word
def calculate_syllables_per_word(text):
    words = clean_text(text)
    syllable_count_list = [syllable_count(word) for word in words]

    syllables_per_word = sum(syllable_count_list) / len(syllable_count_list) if len(syllable_count_list) > 0 else 0

    return syllables_per_word

# Function to perform text analysis
def analyze_text(url, text):

    blob = TextBlob(text)
    words = clean_text(text)
    polarity_score = blob.sentiment.polarity
    subjectivity_score = blob.sentiment.subjectivity

    fog_index = calculate_fog_index(text)

    sentences = sent_tokenize(text)

    sentence_lengths = [len(sentence.split()) for sentence in sentences]
    avg_sentence_length = sum(sentence_lengths) / len(sentence_lengths) if len(sentence_lengths) > 0 else 0

    complex_words = get_complex_words(text)
    percentage_complex_words = (len(complex_words) / len(words)) * 100 if len(words) > 0 else 0

    avg_words_per_sentence = len(words) / len(sentences) if len(sentences) > 0 else 0
    complex_word_count = len(complex_words)
    word_count = len(words)
    syllable_per_word = calculate_syllables_per_word(text)
    personal_pronoun_count = len(re.findall(r'\b(?:I|me|my|mine|myself|we|us|our|ours|ourselves)\b', text))
    avg_word_length = sum(len(word) for word in words) / len(words) if len(words) > 0 else 0

    return {
        'URL_ID': url_id,
        'POSITIVE SCORE': polarity_score,
        'NEGATIVE SCORE': 1 - polarity_score,
        'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
        'FOG INDEX': fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE': avg_words_per_sentence,
        'COMPLEX WORD COUNT': complex_word_count,
        'WORD COUNT': word_count,
        'SYLLABLE PER WORD': syllable_per_word,
        'PERSONAL PRONOUNS': personal_pronoun_count,
        'AVG WORD LENGTH': avg_word_length,
    }


input_df = pd.read_excel("Input.xlsx")

output_columns = ['URL_ID', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE',
                  'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
                  'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT',
                  'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH']
output_df = pd.DataFrame(columns=output_columns)


for index, row in input_df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']

    try:
        response = requests.get(url)
        html_content = response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        continue

    soup = BeautifulSoup(html_content, 'html.parser')
    article_text = ""

    for paragraph in soup.find_all('p'):
        article_text += paragraph.get_text() + " "

    analysis_result = analyze_text(url, article_text)
    output_df = output_df.append(analysis_result, ignore_index=True)

output_df.to_excel("Output Data Structure.xlsx", index=False)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
  output_df = output_df.append(analysis_result, ignore_index=True)
  output_df = output_df.append(analysis_result, ignore_index=True)
  output_df = output_df.append(analysis_result, ignore_index=True)
  output_df = output_df.append(analysis_result, ignore_index=True)
  output_df = output_df.append(analysis_result, ignore_index=True)
  output_df = output_df.append(analysis_result, ignore_index=True)
  output_df = output_df.append(analysis_result, ignore_index=True)
  output_df = output_df.append(analysis_result, ignore_index=True)
  output_df = output_df.append(analysis_result, ignore_index=True)
  output_df = outpu

In [3]:
output_df

Unnamed: 0,URL_ID,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,blackassign0001,0.196573,0.803427,0.196573,0.521390,18.758621,69.230769,32.175066,11.206897,225,325,2.015385,3,6.332308
1,blackassign0002,0.119430,0.880570,0.119430,0.427387,19.548780,62.145749,29.677812,12.048780,614,988,2.281377,8,7.059717
2,blackassign0003,0.099834,0.900166,0.099834,0.425040,20.688525,64.285714,31.130679,13.540984,531,826,2.481840,21,7.613801
3,blackassign0004,0.046584,0.953416,0.046584,0.406320,22.285714,64.153276,31.439882,14.446429,519,809,2.355995,7,7.532756
4,blackassign0005,0.110367,0.889633,0.110367,0.492246,20.441860,63.008850,30.459354,13.139535,356,565,2.145133,7,6.874336
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,blackassign0096,0.060446,0.939554,0.060446,0.390710,24.388889,69.900498,33.915755,14.888889,562,804,2.176617,4,6.799751
96,blackassign0097,0.068466,0.931534,0.068466,0.433493,30.333333,62.481315,31.363955,15.928571,418,669,1.998505,13,6.406577
97,blackassign0098,0.065476,0.934524,0.065476,0.379258,38.222222,67.213115,37.729690,27.111111,164,244,2.057377,1,6.446721
98,blackassign0099,0.085992,0.914008,0.085992,0.398734,23.156250,66.960352,32.459141,14.187500,304,454,1.977974,5,6.207048
