In [1]:
!pip install vaderSentiment

Defaulting to user installation because normal site-packages is not writeable
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
     ---------------------------------------- 0.0/126.0 kB ? eta -:--:--
     ------------------- ------------------- 61.4/126.0 kB 1.6 MB/s eta 0:00:01
     -------------------------------------- 126.0/126.0 kB 1.5 MB/s eta 0:00:00
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [2]:
import pandas as pd
import nltk
from nltk import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import requests
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Fetch the NLTK data packages this notebook needs; NLTK reports
# "already up-to-date" when a package is present locally.
# 'punkt': Punkt tokenizer data for NLTK's tokenizers (word_tokenize / nltk.sent_tokenize are used below).
nltk.download('punkt')
# 'stopwords': corpus read via stopwords.words('english') in compute_stopword_count.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SHUBHAM\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SHUBHAM\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def extract_article_text(url, timeout=30):
    """Download ``url`` and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``
            argument. Defaults to 30 so a single unresponsive host cannot
            stall the whole batch.

    Returns:
        The text of every ``<p>`` element, one per line, with leading and
        trailing whitespace stripped. An empty string when the page has no
        ``<p>`` elements.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` (connection failure, timeout, ...).
    """
    # Without an explicit timeout, requests may wait indefinitely for a reply.
    response = requests.get(url, timeout=timeout)
    # NOTE(review): the HTTP status is not checked, so an error page (404/500)
    # is parsed like a normal article - confirm whether that is intended.
    soup = BeautifulSoup(response.text, 'html.parser')

    # Drop <script>/<style> elements so their contents never reach the text.
    for tag in soup(['script', 'style']):
        tag.extract()

    # join() builds the result in one pass instead of repeated string "+=".
    paragraphs = (paragraph.get_text() for paragraph in soup.find_all('p'))
    return "\n".join(paragraphs).strip()

In [4]:
def compute_word_count(text):
    """Return how many tokens ``word_tokenize`` produces for ``text``.

    No filtering is applied: every token the tokenizer emits is counted.
    """
    return len(word_tokenize(text))

def compute_unique_word_count(text):
    """Return the number of distinct tokens ``word_tokenize`` finds in ``text``.

    Tokens are compared exactly as emitted, so the count is case-sensitive.
    """
    distinct_tokens = {token for token in word_tokenize(text)}
    return len(distinct_tokens)

def compute_stopword_count(text):
    """Count the tokens of ``text`` that are English stopwords.

    Each token is lowercased before it is looked up in NLTK's English
    stopword list, so the match ignores the token's case.
    """
    tokens = word_tokenize(text)
    english_stopwords = set(stopwords.words('english'))
    matches = [token for token in tokens if token.lower() in english_stopwords]
    return len(matches)

In [5]:
def compute_most_common_words(text, n=10):
    """Return the ``n`` most frequent tokens of ``text`` as one string.

    Args:
        text: Text to tokenize with ``word_tokenize``.
        n: How many of the most frequent tokens to report (default 10).

    Returns:
        The tokens, most frequent first, joined with ``', '``.
    """
    frequencies = FreqDist(word_tokenize(text))
    top_tokens = (token for token, _count in frequencies.most_common(n))
    return ', '.join(top_tokens)

def compute_average_word_length(text):
    """Return the mean character length of the tokens in ``text``.

    Args:
        text: Text to tokenize with ``word_tokenize``.

    Returns:
        Average ``len(token)`` over all tokens as a float, or ``0.0`` when the
        text yields no tokens (previously this raised ``ZeroDivisionError``).
    """
    tokens = word_tokenize(text)
    # An empty article (e.g. a page without <p> elements) has no tokens.
    if not tokens:
        return 0.0
    return sum(len(token) for token in tokens) / len(tokens)

In [6]:
def compute_average_sentence_length(text):
    """Return the mean number of tokens per sentence in ``text``.

    Sentences come from ``nltk.sent_tokenize`` and each one is tokenized with
    ``word_tokenize``.

    Args:
        text: Text to analyse.

    Returns:
        Average token count per sentence as a float, or ``0.0`` when no
        sentence is found (previously this raised ``ZeroDivisionError``).
    """
    sentences = nltk.sent_tokenize(text)
    # Empty text produces no sentences; avoid dividing by zero.
    if not sentences:
        return 0.0
    total_tokens = sum(len(word_tokenize(sentence)) for sentence in sentences)
    return total_tokens / len(sentences)

def compute_sentiment_scores(text):
    """Return the VADER ``pos`` and ``neg`` scores for ``text``.

    A new ``SentimentIntensityAnalyzer`` is created on every call.

    Returns:
        A ``(positive, negative)`` tuple taken from the ``'pos'`` and
        ``'neg'`` entries of ``polarity_scores``.
    """
    scores = SentimentIntensityAnalyzer().polarity_scores(text)
    positive = scores['pos']
    negative = scores['neg']
    return positive, negative

In [7]:
def compute_subjectivity_score(text):
    """Return the ``'compound'`` entry of VADER's ``polarity_scores`` for ``text``.

    A new ``SentimentIntensityAnalyzer`` is created on every call.

    NOTE(review): despite the function name, the value returned is VADER's
    compound score, which looks like an overall polarity measure rather than
    a subjectivity measure - confirm which metric the report needs.
    """
    scores = SentimentIntensityAnalyzer().polarity_scores(text)
    return scores['compound']

def compute_percentage_complex_words(text, min_length=3):
    """Return the percentage of tokens in ``text`` that count as complex.

    A token is treated as complex when it has at least ``min_length``
    characters. The default of 3 reproduces the original ``len(word) > 2``
    criterion.

    Args:
        text: Text to tokenize with ``word_tokenize``.
        min_length: Minimum character count for a token to be complex.

    Returns:
        A float between 0 and 100, or ``0.0`` when the text yields no tokens
        (previously this raised ``ZeroDivisionError``).
    """
    tokens = word_tokenize(text)
    # Empty text has no tokens; avoid dividing by zero.
    if not tokens:
        return 0.0
    complex_total = sum(1 for token in tokens if len(token) >= min_length)
    return (complex_total / len(tokens)) * 100

In [8]:
def compute_fog_index(avg_sentence_length, percentage_complex_words):
    """Return ``0.4 * (avg_sentence_length + percentage_complex_words)``.

    Args:
        avg_sentence_length: Average number of tokens per sentence.
        percentage_complex_words: Share of complex tokens, as a percentage.
    """
    combined = avg_sentence_length + percentage_complex_words
    return 0.4 * combined

def compute_complex_word_count(text, min_length=3):
    """Return the number of tokens in ``text`` that count as complex.

    A token is treated as complex when it has at least ``min_length``
    characters. The default of 3 reproduces the original ``len(word) > 2``
    criterion; the threshold is a parameter so the criterion can be tuned
    without editing the function.

    Args:
        text: Text to tokenize with ``word_tokenize``.
        min_length: Minimum character count for a token to be complex.

    Returns:
        The count of complex tokens as an int (0 for empty text).
    """
    return sum(1 for token in word_tokenize(text) if len(token) >= min_length)

In [9]:
def count_syllables(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    Replaces the earlier placeholder that returned 1 for every token. The
    estimate works as follows:

    * only the ASCII letters of the lowercased token are considered;
    * each run of consecutive vowels (``a e i o u y``) counts as one syllable;
    * a final ``e`` that follows a consonant is treated as silent ("make"),
      except in a consonant + ``le`` ending ("table");
    * any token containing letters counts as at least one syllable.

    Args:
        word: A single token.

    Returns:
        The estimated syllable count as an int; 0 for tokens without ASCII
        letters (punctuation, numbers).
    """
    letters = re.sub(r'[^a-z]', '', word.lower())
    if not letters:
        return 0

    syllables = len(re.findall(r'[aeiouy]+', letters))

    # Silent trailing "e" after a consonant, but keep the "-ble"/"-ple" style ending.
    silent_e = re.search(r'[^aeiouy]e$', letters) and not re.search(r'[^aeiouy]le$', letters)
    if silent_e:
        syllables -= 1

    return max(syllables, 1)

def compute_syllable_per_word(text):
    """Return the average ``count_syllables`` value per token of ``text``.

    Args:
        text: Text to tokenize with ``word_tokenize``.

    Returns:
        Total syllables divided by the number of tokens as a float, or
        ``0.0`` when the text yields no tokens (previously this raised
        ``ZeroDivisionError``).
    """
    tokens = word_tokenize(text)
    # Empty text has no tokens; avoid dividing by zero.
    if not tokens:
        return 0.0
    total_syllables = sum(count_syllables(token) for token in tokens)
    return total_syllables / len(tokens)

def compute_personal_pronouns(text):
    """Count the personal pronouns among the tokens of ``text``.

    Tokens are lowercased before the lookup, so the pronoun set is stored in
    lowercase. The earlier list held an uppercase ``"I"``, which a lowercased
    token could never match, so "I" was never counted.

    Args:
        text: Text to tokenize with ``word_tokenize``.

    Returns:
        The number of tokens that are personal pronouns, ignoring case.
    """
    personal_pronouns = frozenset([
        "i", "you", "he", "she", "we", "they",
        "me", "him", "her", "us", "them",
    ])
    tokens = word_tokenize(text)
    return sum(1 for token in tokens if token.lower() in personal_pronouns)

In [11]:
# Column order of output.xlsx; every row appended to output_data follows it.
output_columns = [
    'URL_ID', 'Word_Count', 'Unique_Word_Count', 'Stopword_Count', 'Most_Common_Words', 'Average_Word_Length',
    'Average_Sentence_Length', 'Positive_Score', 'Negative_Score', 'Polarity_Score', 'Subjectivity_Score',
    'Percentage_Complex_Words', 'FOG_Index', 'Complex_Word_Count', 'Syllable_Per_Word', 'Personal_Pronouns'
]

# Read the list of article URLs (the sheet must provide a 'URL' column).
input_file = "input.xlsx"
df = pd.read_excel(input_file)

# Fetch every article and compute one row of metrics per URL.
output_data = []
for url in df['URL']:
    # The identifier is the URL with every run of non-word characters removed.
    metrics = {'URL_ID': re.sub(r'\W+', '', url)}

    article_text = extract_article_text(url)

    metrics['Word_Count'] = compute_word_count(article_text)
    metrics['Unique_Word_Count'] = compute_unique_word_count(article_text)
    metrics['Stopword_Count'] = compute_stopword_count(article_text)
    metrics['Most_Common_Words'] = compute_most_common_words(article_text)
    metrics['Average_Word_Length'] = compute_average_word_length(article_text)
    metrics['Average_Sentence_Length'] = compute_average_sentence_length(article_text)

    positive_score, negative_score = compute_sentiment_scores(article_text)
    metrics['Positive_Score'] = positive_score
    metrics['Negative_Score'] = negative_score
    # Polarity is the positive share minus the negative share.
    metrics['Polarity_Score'] = positive_score - negative_score
    metrics['Subjectivity_Score'] = compute_subjectivity_score(article_text)

    metrics['Percentage_Complex_Words'] = compute_percentage_complex_words(article_text)
    metrics['FOG_Index'] = compute_fog_index(
        metrics['Average_Sentence_Length'], metrics['Percentage_Complex_Words']
    )
    metrics['Complex_Word_Count'] = compute_complex_word_count(article_text)
    metrics['Syllable_Per_Word'] = compute_syllable_per_word(article_text)
    metrics['Personal_Pronouns'] = compute_personal_pronouns(article_text)

    # Store the row as a list laid out in output_columns order.
    output_data.append([metrics[column] for column in output_columns])

# Build the result table once from all rows and write it to output.xlsx.
output_df = pd.DataFrame(output_data, columns=output_columns)

output_df.to_excel('output.xlsx', index=False)