In [1]:
from bs4 import BeautifulSoup
import requests
from textblob import TextBlob
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
import os
import spacy
import pyphen

In [2]:
# Load the spaCy English model
nlp = spacy.load("en_core_web_sm")
# Create a Pyphen instance for syllable counting
dic = pyphen.Pyphen(lang='en')
def count_syllables(word):
    return len(dic.inserted(word).split('-'))
def count_complex_words(text):
    doc = nlp(text)
    complex_word_count = 0
    for token in doc:
        # Check if the token has more than two syllables
        if count_syllables(token.text) > 2:
            complex_word_count += 1
    return complex_word_count


In [14]:
def get_articles(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Find all elements in the document
    all_elements = soup.find_all()

    # Extract text from <p> tags only
    heading_texts = []
    for element in all_elements:
        if element.name == 'h1':
            heading_texts.append(element.text.strip())
    # Join the extracted texts from <p> tags
    article_title= '\n'.join(heading_texts)

    # Find all elements in the document
    all_elements = soup.find_all()

    # Extract text from <p> tags only
    paragraph_texts = []
    for element in all_elements:
        if element.name == 'p':
            paragraph_texts.append(element.text.strip())

    # Join the extracted texts from <p> tags
    article_text = '\n'.join(paragraph_texts)
    return article_title, article_text

In [15]:
def get_text(output_file,url):
    # Send a GET request to the URL
    response = requests.get(url)
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    # print(soup)
    # Find the article title (assuming it's within an <h1> tag)
    article_title, article_text=get_articles(url)
    #to resolve errors
    article_title = article_title.replace('\u202f', '')
    article_text = article_text.replace('\u202f', '')
    article_title = article_title.replace('\uf0b7', '')
    article_text = article_text.replace('\uf0b7', '')
    article_title = article_title.replace('\u2033', '')
    article_text = article_text.replace('\u2033', '')
    article_title = article_title.replace('\u20b9', '')
    article_text = article_text.replace('\u20b9', '')
    article_title = article_title.replace('\u2248', '')
    article_text = article_text.replace('\u2248', '')
    with open(output_file, 'w', encoding='cp1252') as file:
        file.write(f"{article_title}\n\n")
        file.write(f"{article_text}\n")
    print(f"Data has been saved to {output_file} file.")
    directory=r'C:\Users\suhaib mukhtar\IdeaProjects\Assignment_black\StopWords'
    folders=os.listdir(directory)
    stops=[]
    for file in folders:
        file_path=os.path.join(directory,file)
        # Open the file in read mode
        with open(file_path, 'r', encoding='cp1252') as file:
            # Iterate through each line in the file
            for line in file:
                text=line.strip()  # Print the line after stripping newline characters
                tokens=text.split()
                stops.extend(tokens)
    # Open the file in read mode
    new_text=[]
    with open(output_file, 'r', encoding='cp1252') as file:
        for line in file:
            # Process each line here
            text=line.strip()
            tokens=text.split()
            for word in tokens:
                new_text.append(word)
                text=" ".join(new_text)
    new_text=[]
    punc=['!','#','$','%',"(",')',',','-','.','/',':',';','=','?','@','[',']','_','|']
    tokens=word_tokenize(text)
    for word in tokens:
        if word not in punc:
            new_text.append(word)
    text=" ".join(new_text)
    # Define the directory path and file names
    directory_path = r'C:\Users\suhaib mukhtar\IdeaProjects\Assignment_black\MasterDictionary'
    positive_file_name = 'positive-words.txt'
    negative_file_name = 'negative-words.txt'

    # Define the full file paths
    positive_file_path = os.path.join(directory_path, positive_file_name)
    negative_file_path = os.path.join(directory_path, negative_file_name)

    # Initialize empty lists to store the content of the files
    positive_words = []
    negative_words = []

    # Read the positive.txt file
    with open(positive_file_path, 'r', encoding='cp1252') as positive_file:
        positive_words = positive_file.read().splitlines()

    # Read the negative.txt file
    with open(negative_file_path, 'r', encoding='cp1252') as negative_file:
        negative_words = negative_file.read().splitlines()
    ##Add the words to the positive of negative list if word is not in stops
    for word in word_tokenize(text):
        if word not in stops:
            text_blob=TextBlob(word)
            score=text_blob.sentiment.polarity
            if score>=0:
                positive_words.append(word)
            else:
                negative_words.append(word)
    positive=0
    negative=0
    tokens=word_tokenize(text)
    for word in tokens:
        if word in positive_words:
            positive+=1
        elif word in negative_words:
            negative+=1
    negative=-negative
    Polarity_Score = (positive-negative)/((positive + negative) + 0.000001)
    #word length
    new_text=[]
    for word in text.split():
        if word not in stops:
            new_text.append(word)
            total_words_a_c=len(new_text)
    Subjectivity_Score =(positive + negative)/ ((total_words_a_c) + 0.000001)
    ##Analysis of readibility
    sentences=sent_tokenize(text)
    sent_no=len(sentences)
    Avg_sent_len= total_words_a_c/sent_no
    #complex word count
    complex_word_count = count_complex_words(text)
    #%age of complex word
    Percentage_of_Complex_words = complex_word_count / total_words_a_c
    #fog index
    Fog_Index = 0.4 * (Avg_sent_len+ Percentage_of_Complex_words)
    Average_Number_of_Words_Per_Sentence =  total_words_a_c/ sent_no
    rs={
        'URL': url,
        'FileName': output_file,
        'positive_words':len(positive_words),
        'negative_words':len(negative_words),
        'positive_score':positive,
        'negative_score':negative,
        'Polarity_Score':Polarity_Score,
        'total_words_a_c':total_words_a_c,
        'Subjectivity_Score':Subjectivity_Score,
        'Avg_sent_len':Avg_sent_len,
        'complex_word_count':complex_word_count,
        'Percentage_of_Complex_words':Percentage_of_Complex_words,
        'Fog_Index':Fog_Index,
        'Average_Number_of_Words_Per_Sentence':Average_Number_of_Words_Per_Sentence,
    }
    return rs



In [16]:
import pandas as pd
data=pd.read_excel(r'C:\Users\suhaib mukhtar\IdeaProjects\Assignment_black\input.xlsx')

In [17]:
data.head()

Unnamed: 0,URL_ID,URL
0,123.0,https://insights.blackcoffer.com/rise-of-telem...
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...
4,432.0,https://insights.blackcoffer.com/rise-of-telem...


In [18]:
data['file_name']=data['URL_ID'].apply(lambda x:str(int(x))+'.txt')

In [19]:
data

Unnamed: 0,URL_ID,URL,file_name
0,123.0,https://insights.blackcoffer.com/rise-of-telem...,123.txt
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...,321.txt
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...,2345.txt
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...,4321.txt
4,432.0,https://insights.blackcoffer.com/rise-of-telem...,432.txt
...,...,...,...
109,50921.0,https://insights.blackcoffer.com/coronavirus-i...,50921.txt
110,51382.8,https://insights.blackcoffer.com/coronavirus-i...,51382.txt
111,51844.6,https://insights.blackcoffer.com/what-are-the-...,51844.txt
112,52306.4,https://insights.blackcoffer.com/marketing-dri...,52306.txt


In [20]:
data.drop(['URL_ID'],axis=1,inplace=True)

In [21]:
urls=[]
for url in data.URL:
    urls.append(url)
file_names=[]
for file in data.file_name:
    file_names.append(file)

In [22]:
results=[]

In [23]:
# class_name='td-post-content tagdiv-type'
# Iterate through the URLs and file names, call the function, and append the result to the list
for url, file_name in zip(urls, file_names):
    result = get_text(file_name,url)
    results.append(result)
# Create a DataFrame from the results list
df = pd.DataFrame(results)
# Print the DataFrame
print(df)


Data has been saved to 123.txt file.
Data has been saved to 321.txt file.
Data has been saved to 2345.txt file.
Data has been saved to 4321.txt file.
Data has been saved to 432.txt file.
Data has been saved to 2893.txt file.
Data has been saved to 3355.txt file.
Data has been saved to 3817.txt file.
Data has been saved to 4279.txt file.
Data has been saved to 4741.txt file.
Data has been saved to 5202.txt file.
Data has been saved to 5664.txt file.
Data has been saved to 6126.txt file.
Data has been saved to 6588.txt file.
Data has been saved to 7050.txt file.
Data has been saved to 7511.txt file.
Data has been saved to 7973.txt file.
Data has been saved to 8435.txt file.
Data has been saved to 8897.txt file.
Data has been saved to 9359.txt file.
Data has been saved to 9820.txt file.
Data has been saved to 10282.txt file.
Data has been saved to 10744.txt file.
Data has been saved to 11206.txt file.
Data has been saved to 11668.txt file.
Data has been saved to 12129.txt file.
Data has b

In [25]:
df

Unnamed: 0,URL,FileName,positive_words,negative_words,positive_score,negative_score,Polarity_Score,total_words_a_c,Subjectivity_Score,Avg_sent_len,complex_word_count,Percentage_of_Complex_words,Fog_Index,Average_Number_of_Words_Per_Sentence
0,https://insights.blackcoffer.com/rise-of-telem...,123.txt,3050,4817,1051,-34,1.066863,1073,0.947810,178.833333,243,0.226468,71.623920,178.833333
1,https://insights.blackcoffer.com/rise-of-e-hea...,321.txt,2453,4792,450,-9,1.040816,456,0.967105,456.000000,146,0.320175,182.528070,456.000000
2,https://insights.blackcoffer.com/rise-of-e-hea...,2345.txt,2740,4799,739,-15,1.041436,750,0.965333,750.000000,181,0.241333,300.096533,750.000000
3,https://insights.blackcoffer.com/rise-of-telem...,4321.txt,2843,4800,846,-17,1.041013,853,0.971864,426.500000,213,0.249707,170.699883,426.500000
4,https://insights.blackcoffer.com/rise-of-telem...,432.txt,2843,4800,846,-17,1.041013,853,0.971864,426.500000,213,0.249707,170.699883,426.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,https://insights.blackcoffer.com/coronavirus-i...,50921.txt,2440,4789,437,-6,1.027842,440,0.979545,440.000000,100,0.227273,176.090909,440.000000
110,https://insights.blackcoffer.com/coronavirus-i...,51382.txt,3153,4807,1152,-24,1.042553,1171,0.963279,1171.000000,215,0.183604,468.473442,1171.000000
111,https://insights.blackcoffer.com/what-are-the-...,51844.txt,3077,4819,1082,-36,1.068834,1107,0.944896,1107.000000,274,0.247516,442.899006,1107.000000
112,https://insights.blackcoffer.com/marketing-dri...,52306.txt,2875,4801,876,-18,1.041958,887,0.967306,887.000000,211,0.237880,354.895152,887.000000


In [26]:
df.columns

Index(['URL', 'FileName', 'positive_words', 'negative_words', 'positive_score',
       'negative_score', 'Polarity_Score', 'total_words_a_c',
       'Subjectivity_Score', 'Avg_sent_len', 'complex_word_count',
       'Percentage_of_Complex_words', 'Fog_Index',
       'Average_Number_of_Words_Per_Sentence'],
      dtype='object')

In [27]:
df[['FileName','URL', 'positive_words', 'negative_words', 'positive_score',
       'negative_score', 'Polarity_Score', 'total_words_a_c',
       'Subjectivity_Score', 'Avg_sent_len', 'complex_word_count',
       'Percentage_of_Complex_words', 'Fog_Index',
       'Average_Number_of_Words_Per_Sentence']]

Unnamed: 0,FileName,URL,positive_words,negative_words,positive_score,negative_score,Polarity_Score,total_words_a_c,Subjectivity_Score,Avg_sent_len,complex_word_count,Percentage_of_Complex_words,Fog_Index,Average_Number_of_Words_Per_Sentence
0,123.txt,https://insights.blackcoffer.com/rise-of-telem...,3050,4817,1051,-34,1.066863,1073,0.947810,178.833333,243,0.226468,71.623920,178.833333
1,321.txt,https://insights.blackcoffer.com/rise-of-e-hea...,2453,4792,450,-9,1.040816,456,0.967105,456.000000,146,0.320175,182.528070,456.000000
2,2345.txt,https://insights.blackcoffer.com/rise-of-e-hea...,2740,4799,739,-15,1.041436,750,0.965333,750.000000,181,0.241333,300.096533,750.000000
3,4321.txt,https://insights.blackcoffer.com/rise-of-telem...,2843,4800,846,-17,1.041013,853,0.971864,426.500000,213,0.249707,170.699883,426.500000
4,432.txt,https://insights.blackcoffer.com/rise-of-telem...,2843,4800,846,-17,1.041013,853,0.971864,426.500000,213,0.249707,170.699883,426.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,50921.txt,https://insights.blackcoffer.com/coronavirus-i...,2440,4789,437,-6,1.027842,440,0.979545,440.000000,100,0.227273,176.090909,440.000000
110,51382.txt,https://insights.blackcoffer.com/coronavirus-i...,3153,4807,1152,-24,1.042553,1171,0.963279,1171.000000,215,0.183604,468.473442,1171.000000
111,51844.txt,https://insights.blackcoffer.com/what-are-the-...,3077,4819,1082,-36,1.068834,1107,0.944896,1107.000000,274,0.247516,442.899006,1107.000000
112,52306.txt,https://insights.blackcoffer.com/marketing-dri...,2875,4801,876,-18,1.041958,887,0.967306,887.000000,211,0.237880,354.895152,887.000000


### Output file to be stored

In [3]:
df.to_csv('output_results_file_csv.csv')