In [1]:
import os
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import cmudict,stopwords
import re
import pandas as pd
# Download the NLTK resources if you haven't already
#nltk.download('stopwords')
#nltk.download('punkt')


def load_stopwords(stopwords_folder):
    stop_words = set()
    for file in os.listdir(stopwords_folder):
        with open(os.path.join(stopwords_folder, file), 'r') as f:
            words = f.read().splitlines()
            stop_words.update(words)
    return stop_words

def clean_text(text, stopwords):
    words = word_tokenize(text)
    cleaned_words = [word.lower() for word in words if word.lower() not in stopwords]
    return cleaned_words

def create_dictionary(master_dict, stopwords):
    pos_words = []
    neg_words = []
    
    with open(os.path.join(master_dict, 'positive-words.txt'), 'r') as pos_file:
        pos_words = [word.strip() for word in pos_file if word.strip() not in stopwords]

    with open(os.path.join(master_dict, 'negative-words.txt'), 'r') as neg_file:
        neg_words = [word.strip() for word in neg_file if word.strip() not in stopwords]

    return pos_words, neg_words

def calculate_scores(cleaned_text, pos_words, neg_words):
    positive_score = sum(1 for word in cleaned_text if word in pos_words)
    negative_score = sum(1 for word in cleaned_text if word in neg_words)
    
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score )+ 0.000001)
    subjectivity_score = (positive_score + negative_score) / (len(cleaned_text) + 0.000001)
    
    return positive_score, negative_score, polarity_score, subjectivity_score

def calculate_readability(text):
    sentences = sent_tokenize(text)
    words = word_tokenize(text)
    
    # Average Sentence Length
    avg_sentence_length = len(words) / len(sentences)
    
    # Complex Word Count
    d = cmudict.dict()
    complex_words = [word for word in words if word.lower() in d and len(d[word.lower()]) > 2]
    
    # Percentage of Complex words
    percentage_complex_words = len(complex_words) / len(words)
    
    # Fog Index
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    
    # Average Number of Words Per Sentence
    avg_words_per_sentence = len(words) / len(sentences)
    
    return avg_sentence_length, percentage_complex_words, fog_index, avg_words_per_sentence, len(complex_words)

def count_words(text):
    # Tokenize the text into words
    words = nltk.word_tokenize(text)

    # Remove punctuation
    words = [word.lower() for word in words if word.isalpha()]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Calculate word count
    word_count = len(words)

    # Calculate syllable count per word
    def count_syllables(word):
        vowels = 'aeiouy'
        count = 0
        word = word.lower()
        if word.endswith(('es', 'ed')):
            return 0  # exceptions for words ending with "es" or "ed"
        if word[0] in vowels:
            count += 1
        for index in range(1, len(word)):
            if word[index] in vowels and word[index - 1] not in vowels:
                count += 1
        if word.endswith('e'):
            count -= 1  # excluding words ending with 'e'
        if count == 0:
            count = 1  # each word should have at least one syllable
        return count

    syllable_count_per_word = [count_syllables(word) for word in words]

    return word_count, syllable_count_per_word

def count_personal_pronouns(text):
    personal_pronouns = re.findall(r'\b(?:I|we|my|ours|and|us)\b', text, flags=re.IGNORECASE)
    return len(personal_pronouns)

def calculate_average_word_length(cleaned_text):
    total_chars = sum(len(word) for word in cleaned_text)
    avg_word_length = total_chars / len(cleaned_text)
    return avg_word_length

# Example usage
def process_text_files_in_folder(folder_path):
    stopwords_folder = "StopWords"
    master_dict_folder = "MasterDictionary"

    stopwords = load_stopwords(stopwords_folder)
    positive_words, negative_words = create_dictionary(master_dict_folder, stopwords)

    results = []

    for file_name in os.listdir(folder_path):
        if file_name.endswith('.txt'):
            with open(os.path.join(folder_path, file_name), 'r', encoding='utf-8') as file:
                text = file.read()

            cleaned_text = clean_text(text, stopwords)
            pos_score, neg_score, polarity, subjectivity = calculate_scores(cleaned_text, positive_words, negative_words)

            avg_sentence_length, percentage_complex_words, fog_index, avg_words_per_sentence, complex_word_count = calculate_readability(text)
            personal_pronouns_count = count_personal_pronouns(text)
            word_count, syllable_count_per_word = count_words(text)
            avg_syllable_count = sum(syllable_count_per_word) / word_count if word_count > 0 else 0
            average_word_length = calculate_average_word_length(cleaned_text)

            # Store the results in a dictionary for each file
            result = {
                'URL_ID': file_name[: -len(".txt")],
                'POSITIVE SCORE': pos_score,
                'NEGATIVE SCORE': neg_score,
                'POLARITY SCORE': polarity,
                'SUBJECTIVITY SCORE': subjectivity,
                'AVG SENTENCE LENGTH': avg_sentence_length,
                'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
                'FOG INDEX': fog_index,
                'AVG NUMBER OF WORDS PER SENTENCE': avg_words_per_sentence,
                'COMPLEX WORD COUNT': complex_word_count,
                'WORD COUNT': word_count,
                'SYLLABLE PER WORD':avg_syllable_count,
                'PERSONAL PRONOUNS': personal_pronouns_count,
                'AVG WORD LENGTH': average_word_length
            }
            results.append(result)

    return results

In [2]:
# Example usage for processing a folder of text files
folder_path = "txt"
results = process_text_files_in_folder(folder_path)

# Convert the results to a DataFrame for better visualization
df = pd.DataFrame(results)
print(df)

      URL_ID  POSITIVE SCORE  NEGATIVE SCORE  POLARITY SCORE  \
0    10282.6              61              25        0.418605   
1    10744.4              46              23        0.333333   
2    11206.2              27              12        0.384615   
3      11668              27              12        0.384615   
4    12129.8              38              13        0.490196   
..       ...             ...             ...             ...   
109   7973.6              35              26        0.147541   
110   8435.4              35              26        0.147541   
111   8897.2              60              38        0.224490   
112     9359              66              38        0.269231   
113   9820.8              70              28        0.428571   

     SUBJECTIVITY SCORE  AVG SENTENCE LENGTH  PERCENTAGE OF COMPLEX WORDS  \
0              0.081285            24.850000                     0.093561   
1              0.088348            23.107692                     0.098535   


In [3]:
df.to_excel("coffer.xlsx", header=True, index=False)