In [1]:
import os
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from textstat import flesch_reading_ease, flesch_kincaid_grade

# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('cmudict')

# Load stop words
stop_words = set(stopwords.words('english'))

# Function to clean text
def clean_text(text):
    tokens = word_tokenize(text)
    cleaned_tokens = [word.lower() for word in tokens if word.isalpha() and word.lower() not in stop_words]
    return cleaned_tokens

# Function to count syllables in a word
def syllable_count(word):
    d = nltk.corpus.cmudict.dict()
    if word.lower() in d:
        return max([len(list(y for y in x if y[-1].isdigit())) for x in d[word.lower()]])
    else:
        # Simple fallback for words not found in CMU Pronouncing Dictionary
        return max(1, len(word) // 3)

# Function to calculate sentiment scores
def calculate_sentiment_scores(tokens, positive_words, negative_words):
    positive_score = sum(1 for word in tokens if word in positive_words)
    negative_score = sum(1 for word in tokens if word in negative_words)
    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (len(tokens) + 0.000001)
    return positive_score, negative_score, polarity_score, subjectivity_score

# Load positive and negative words
positive_words_file = "C:\\Users\\pooja\\Downloads\\positive-words.txt"
negative_words_file = "C:\\Users\\pooja\\Downloads\\negative-words.txt"

positive_words = set()
negative_words = set()

# Read positive words from file
with open(positive_words_file, 'r') as file:
    for line in file:
        word = line.strip()
        positive_words.add(word)

# Read negative words from file
with open(negative_words_file, 'r') as file:
    for line in file:
        word = line.strip()
        negative_words.add(word)

# Replace "extracted_articles" with the path to your directory containing extracted articles
directory_path = "C:\\Users\\pooja\\Downloads\\extracted_articles"

# Process each text file
for filename in os.listdir(directory_path):
    with open(os.path.join(directory_path, filename), 'r', encoding='utf-8') as file:
        text = file.read()
        cleaned_tokens = clean_text(text)
        positive_score, negative_score, polarity_score, subjectivity_score = calculate_sentiment_scores(cleaned_tokens, positive_words, negative_words)
        
        # Additional features
        sentences = sent_tokenize(text)
        word_count = len(cleaned_tokens)
        avg_sentence_length = word_count / len(sentences)
        complex_word_count = sum(1 for word in cleaned_tokens if len(word) > 2)  # Considering words with more than 2 characters as complex
        percentage_complex_words = (complex_word_count / word_count) * 100
        
        # Calculate FOG Index
        fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
        
        print(f"File: {filename}")
        print(f"Average Sentence Length: {avg_sentence_length}")
        print(f"FOG Index: {fog_index}")
        print(f"Word Count: {word_count}")
        print("------------------")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pooja\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pooja\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\pooja\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


File: blackassign0001.txt
Average Sentence Length: 10.833333333333334
FOG Index: 43.96410256410257
Word Count: 325
------------------
File: blackassign0002.txt
Average Sentence Length: 11.78048780487805
FOG Index: 44.46374791698228
Word Count: 966
------------------
File: blackassign0003.txt
Average Sentence Length: 12.918032786885245
FOG Index: 45.01492885079472
Word Count: 788
------------------
File: blackassign0004.txt
Average Sentence Length: 14.178571428571429
FOG Index: 45.419539402662835
Word Count: 794
------------------
File: blackassign0005.txt
Average Sentence Length: 12.25
FOG Index: 44.751576994434146
Word Count: 539
------------------
File: blackassign0006.txt
Average Sentence Length: 13.46236559139785
FOG Index: 44.96961077330036
Word Count: 1252
------------------
File: blackassign0007.txt
Average Sentence Length: 10.619047619047619
FOG Index: 43.82908392056375
Word Count: 669
------------------
File: blackassign0008.txt
Average Sentence Length: 11.76923076923077
FOG I

In [2]:
import os
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('cmudict')

# Load stop words
stop_words = set(stopwords.words('english'))

# Function to clean text
def clean_text(text):
    tokens = word_tokenize(text)
    cleaned_tokens = [word.lower() for word in tokens if word.isalpha() and word.lower() not in stop_words]
    return cleaned_tokens

# Function to count syllables in a word
def syllable_count(word):
    d = nltk.corpus.cmudict.dict()
    if word.lower() in d:
        return max([len(list(y for y in x if y[-1].isdigit())) for x in d[word.lower()]])
    else:
        # Simple fallback for words not found in CMU Pronouncing Dictionary
        return max(1, len(word) // 3)

# Replace "extracted_articles" with the path to your directory containing extracted articles
directory_path = "C:\\Users\\pooja\\Downloads\\extracted_articles"

# Process each text file
for filename in os.listdir(directory_path):
    with open(os.path.join(directory_path, filename), 'r', encoding='utf-8') as file:
        text = file.read()
        cleaned_tokens = clean_text(text)
        
        # Additional features
        sentences = sent_tokenize(text)
        word_count = len(cleaned_tokens)
        avg_sentence_length = word_count / len(sentences)
        complex_word_count = sum(1 for word in cleaned_tokens if len(word) > 2)  # Considering words with more than 2 characters as complex
        percentage_complex_words = (complex_word_count / word_count) * 100
        
        print(f"File: {filename}")
        print(f"Percentage of Complex Words: {percentage_complex_words}")
        print(f"Avg Number of Words per Sentence: {avg_sentence_length}")
        print(f"Complex Word Count: {complex_word_count}")
        print("------------------")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pooja\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pooja\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\pooja\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


File: blackassign0001.txt
Percentage of Complex Words: 99.07692307692308
Avg Number of Words per Sentence: 10.833333333333334
Complex Word Count: 322
------------------
File: blackassign0002.txt
Percentage of Complex Words: 99.37888198757764
Avg Number of Words per Sentence: 11.78048780487805
Complex Word Count: 960
------------------
File: blackassign0003.txt
Percentage of Complex Words: 99.61928934010153
Avg Number of Words per Sentence: 12.918032786885245
Complex Word Count: 785
------------------
File: blackassign0004.txt
Percentage of Complex Words: 99.37027707808565
Avg Number of Words per Sentence: 14.178571428571429
Complex Word Count: 789
------------------
File: blackassign0005.txt
Percentage of Complex Words: 99.62894248608535
Avg Number of Words per Sentence: 12.25
Complex Word Count: 537
------------------
File: blackassign0006.txt
Percentage of Complex Words: 98.96166134185303
Avg Number of Words per Sentence: 13.46236559139785
Complex Word Count: 1239
------------------
