In [1]:
import os
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import cmudict
import re
import pandas as pd

def load_stopwords(stopwords_folder):
    stop_words = set()
    for file in os.listdir(stopwords_folder):
        with open(os.path.join(stopwords_folder, file), 'r') as f:
            words = f.read().splitlines()
            stop_words.update(words)
    return stop_words

def clean_text(text, stopwords):
    words = word_tokenize(text)
    cleaned_words = [word.lower() for word in words if word.lower() not in stopwords]
    return cleaned_words

def create_dictionary(master_dict, stopwords):
    pos_words = []
    neg_words = []
    
    with open(os.path.join(master_dict, 'positive-words.txt'), 'r') as pos_file:
        pos_words = [word.strip() for word in pos_file if word.strip() not in stopwords]

    with open(os.path.join(master_dict, 'negative-words.txt'), 'r') as neg_file:
        neg_words = [word.strip() for word in neg_file if word.strip() not in stopwords]

    return pos_words, neg_words

def calculate_scores(cleaned_text, pos_words, neg_words):
    positive_score = sum(1 for word in cleaned_text if word in pos_words)
    negative_score = sum(1 for word in cleaned_text if word in neg_words)
    
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score )+ 0.000001)
    subjectivity_score = (positive_score + negative_score) / (len(cleaned_text) + 0.000001)
    
    return positive_score, negative_score, polarity_score, subjectivity_score

def calculate_readability(text):
    sentences = sent_tokenize(text)
    words = word_tokenize(text)
    
    # Average Sentence Length
    avg_sentence_length = len(words) / len(sentences)
    
    # Complex Word Count
    d = cmudict.dict()
    complex_words = [word for word in words if word.lower() in d and len(d[word.lower()]) > 2]
    
    # Percentage of Complex words
    percentage_complex_words = len(complex_words) / len(words)
    
    # Fog Index
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    
    # Average Number of Words Per Sentence
    avg_words_per_sentence = len(words) / len(sentences)
    
    return avg_sentence_length, percentage_complex_words, fog_index, avg_words_per_sentence, len(complex_words)

"""
def count_syllables(word):
    d = cmudict.dict()
    word = word.lower()
    if word in d:
        return [len(list(y for y in x if y[-1].isdigit())) for x in d[word]][0]
    # Handling words ending with "es" and "ed"
    if word.endswith('es') or word.endswith('ed'):
        return count_syllables(word[:-2])
    # Words that are not found in the CMU dictionary
    return max(1, len(re.findall(r'[aeiouy]+', word)))

def calculate_syllable_count(cleaned_text):
    syllable_count = sum(count_syllables(word) for word in cleaned_text)
    return syllable_count
"""
def count_personal_pronouns(text):
    personal_pronouns = re.findall(r'\b(?:I|we|my|ours|us)\b', text, flags=re.IGNORECASE)
    return len(personal_pronouns)

def calculate_average_word_length(cleaned_text):
    total_chars = sum(len(word) for word in cleaned_text)
    avg_word_length = total_chars / len(cleaned_text)
    return avg_word_length

# Example usage
stopwords_folder = "StopWords"
master_dict_folder = "MasterDictionary" 
text_file = "txt/123.txt" 

with open(text_file, 'r') as file:
    text = file.read()

stopwords = load_stopwords(stopwords_folder)
cleaned_text = clean_text(text, stopwords)
positive_words, negative_words = create_dictionary(master_dict_folder, stopwords)
pos_score, neg_score, polarity, subjectivity = calculate_scores(cleaned_text, positive_words, negative_words)

avg_sentence_length, percentage_complex_words, fog_index, avg_words_per_sentence, complex_word_count = calculate_readability(text)
#syllable_count = calculate_syllable_count(cleaned_text)
personal_pronouns_count = count_personal_pronouns(text)
average_word_length = calculate_average_word_length(cleaned_text)

print("Positive Score:", pos_score)
print("Negative Score:", neg_score)
print("Polarity Score:", polarity)
print("Subjectivity Score:", subjectivity)

print("\nReadability Analysis:")
print(f"Average Sentence Length: {avg_sentence_length}")
print(f"Percentage of Complex Words: {percentage_complex_words}")
print(f"Fog Index: {fog_index}")
print(f"Average Number of Words Per Sentence: {avg_words_per_sentence}")
print(f"Complex Word Count: {complex_word_count}")

print("\nAdditional Analysis:")
#print(f"Total Syllable Count: {syllable_count}")
print("Total Syllable Count: Skipped")
print(f"Personal Pronouns Count: {personal_pronouns_count}")
print(f"Average Word Length: {average_word_length}")

Positive Score: 79
Negative Score: 24
Polarity Score: 0.5339805773399944
Subjectivity Score: 0.09932497579621506

Readability Analysis:
Average Sentence Length: 23.15
Percentage of Complex Words: 0.13930885529157666
Fog Index: 9.315723542116631
Average Number of Words Per Sentence: 23.15
Complex Word Count: 258

Additional Analysis:
Total Syllable Count: Skipped
Personal Pronouns Count: 2
Average Word Length: 6.668273866923819


In [3]:
import os
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import cmudict
import re
import pandas as pd

def load_stopwords(stopwords_folder):
    stop_words = set()
    for file in os.listdir(stopwords_folder):
        with open(os.path.join(stopwords_folder, file), 'r') as f:
            words = f.read().splitlines()
            stop_words.update(words)
    return stop_words

def clean_text(text, stopwords):
    words = word_tokenize(text)
    cleaned_words = [word.lower() for word in words if word.lower() not in stopwords]
    return cleaned_words

def create_dictionary(master_dict, stopwords):
    pos_words = []
    neg_words = []
    
    with open(os.path.join(master_dict, 'positive-words.txt'), 'r') as pos_file:
        pos_words = [word.strip() for word in pos_file if word.strip() not in stopwords]

    with open(os.path.join(master_dict, 'negative-words.txt'), 'r') as neg_file:
        neg_words = [word.strip() for word in neg_file if word.strip() not in stopwords]

    return pos_words, neg_words

def calculate_scores(cleaned_text, pos_words, neg_words):
    positive_score = sum(1 for word in cleaned_text if word in pos_words)
    negative_score = sum(1 for word in cleaned_text if word in neg_words)
    
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score )+ 0.000001)
    subjectivity_score = (positive_score + negative_score) / (len(cleaned_text) + 0.000001)
    
    return positive_score, negative_score, polarity_score, subjectivity_score

def calculate_readability(text):
    sentences = sent_tokenize(text)
    words = word_tokenize(text)
    
    # Average Sentence Length
    avg_sentence_length = len(words) / len(sentences)
    
    # Complex Word Count
    d = cmudict.dict()
    complex_words = [word for word in words if word.lower() in d and len(d[word.lower()]) > 2]
    
    # Percentage of Complex words
    percentage_complex_words = len(complex_words) / len(words)
    
    # Fog Index
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    
    # Average Number of Words Per Sentence
    avg_words_per_sentence = len(words) / len(sentences)
    
    return avg_sentence_length, percentage_complex_words, fog_index, avg_words_per_sentence, len(complex_words)

"""
def count_syllables(word):
    d = cmudict.dict()
    word = word.lower()
    if word in d:
        return [len(list(y for y in x if y[-1].isdigit())) for x in d[word]][0]
    # Handling words ending with "es" and "ed"
    if word.endswith('es') or word.endswith('ed'):
        return count_syllables(word[:-2])
    # Words that are not found in the CMU dictionary
    return max(1, len(re.findall(r'[aeiouy]+', word)))

def calculate_syllable_count(cleaned_text):
    syllable_count = sum(count_syllables(word) for word in cleaned_text)
    return syllable_count
"""
def count_personal_pronouns(text):
    personal_pronouns = re.findall(r'\b(?:I|we|my|ours|us)\b', text, flags=re.IGNORECASE)
    return len(personal_pronouns)

def calculate_average_word_length(cleaned_text):
    total_chars = sum(len(word) for word in cleaned_text)
    avg_word_length = total_chars / len(cleaned_text)
    return avg_word_length

# Example usage
stopwords_folder = "StopWords"
master_dict_folder = "MasterDictionary" 
text_file = "txt/123.txt" 

with open(text_file, 'r') as file:
    text = file.read()

stopwords = load_stopwords(stopwords_folder)
cleaned_text = clean_text(text, stopwords)
positive_words, negative_words = create_dictionary(master_dict_folder, stopwords)
pos_score, neg_score, polarity, subjectivity = calculate_scores(cleaned_text, positive_words, negative_words)

avg_sentence_length, percentage_complex_words, fog_index, avg_words_per_sentence, complex_word_count = calculate_readability(text)
#syllable_count = calculate_syllable_count(cleaned_text)
personal_pronouns_count = count_personal_pronouns(text)
average_word_length = calculate_average_word_length(cleaned_text)

print("Positive Score:", pos_score)
print("Negative Score:", neg_score)
print("Polarity Score:", polarity)
print("Subjectivity Score:", subjectivity)

print("\nReadability Analysis:")
print(f"Average Sentence Length: {avg_sentence_length}")
print(f"Percentage of Complex Words: {percentage_complex_words}")
print(f"Fog Index: {fog_index}")
print(f"Average Number of Words Per Sentence: {avg_words_per_sentence}")
print(f"Complex Word Count: {complex_word_count}")

print("\nAdditional Analysis:")
#print(f"Total Syllable Count: {syllable_count}")
print("Total Syllable Count: Skipped")
print(f"Personal Pronouns Count: {personal_pronouns_count}")
print(f"Average Word Length: {average_word_length}")

Positive Score: 79
Negative Score: 24
Polarity Score: 0.5339805773399944
Subjectivity Score: 0.09932497579621506

Readability Analysis:
Average Sentence Length: 23.15
Percentage of Complex Words: 0.13930885529157666
Fog Index: 9.315723542116631
Average Number of Words Per Sentence: 23.15
Complex Word Count: 258

Additional Analysis:
Total Syllable Count: Skipped
Personal Pronouns Count: 2
Average Word Length: 6.668273866923819


In [11]:
d = {"POSITIVE SCORE":[], "NEGATIVE SCORE":[], "POLARITY SCORE":[], "SUBJECTIVITY SCORE":[], "AVG SENTENCE LENGTH":[], "PERCENTAGE OF COMPLEX WORDS":[], "FOG INDEX":[], "AVG NUMBER OF WORDS PER SENTENCE":[], "COMPLEX WORD COUNT":[], "WORD COUNT":[], "SYLLABLE PER WORD":[], "PERSONAL PRONOUNS":[], "AVG WORD LENGTH":[]}

d['POSITIVE SCORE'].append(1)
#d

values_count = len(d['POSITIVE SCORE'])
keys = d.keys()
for key in keys:
    while len(d[key]) < values_count:
        d[key].append(None) 
        
coffe_df = pd.DataFrame.from_dict(d)

In [12]:
coffe_df

Unnamed: 0,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,1,,,,,,,,,,,,
