In [82]:
import os
import pandas as pd
import textstat
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize, sent_tokenize
from textblob import TextBlob
import nltk
import re
import openpyxl


# define the input and output paths
input_dir = 'Input/'
output_file = 'output.xlsx'

# define the stop words
stop_words = set(nltk.corpus.stopwords.words('english'))

# add custom stopwords from files
stop_words_files = [
    "StopWords_Names.txt",
    "StopWords_Geographic.txt",
    "StopWords_GenericLong.txt",
    "StopWords_Generic.txt",
    "StopWords_DatesandNumbers.txt",
    "StopWords_Currencies.txt",
    "StopWords_Auditor.txt"
]

for filename in stop_words_files:
    with open(f"StopWords/{filename}", "r") as f:
        stop_words_custom = f.read().splitlines()
        stop_words.update(stop_words_custom)


# create a function to clean the text using stop words
def clean_text(article_text):
    words = word_tokenize(article_text)
    cleaned_words = [word for word in words if word.lower() not in stop_words and re.match(r'^[a-zA-Z]+', word)]
    return cleaned_words


# create the dictionary of positive and negative words
pos_words = set()
neg_words = set()

# read the positive and negative word files
with open("MasterDictionary/positive_words.txt", "r") as f:
    for line in f:
        word = line.strip().lower()
        if word not in stop_words:
            pos_words.add(word)

with open("MasterDictionary/negative_words.txt", "r") as f:
    for line in f:
        word = line.strip().lower()
        if word not in stop_words:
            neg_words.add(word)

# remove positive and negative words that are in stop words
pos_words = pos_words - stop_words
neg_words = neg_words - stop_words

# create dictionary of positive and negative words
word_dict = {'positive': pos_words, 'negative': neg_words}

# create an empty dataframe to store the results
columns = ['File Name', 'Positive Score', 'Negative Score', 'Polarity Score',
           'Subjectivity Score', 'Average Sentence Length', 'Percentage of Complex Words',
           'FOG Index', 'Average Number of Words per Sentence', 'Complex Word Count',
           'Word Count', 'Syllable Count per Word', 'Personal Pronouns', 'Average Word Length']
df_results = pd.DataFrame(columns=columns)

# get a list of all text files in the input directory
file_list = [os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith('.txt')]

# iterate through each file and extract the relevant data
# iterate through each file and extract the relevant data
for file_path in file_list:
    # read the input file
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    # clean the text
    cleaned_words = clean_text(text)

    # calculate positive and negative scores
    pos_score = sum(1 for word in cleaned_words if word in word_dict['positive'])
    neg_score = sum(1 for word in cleaned_words if word in word_dict['negative'])

    # extract derived variables
    polarity_score = (pos_score - neg_score) / ((pos_score + neg_score) + 0.000001)
    subjectivity_score = (pos_score + neg_score) / (len(cleaned_words) + 0.000001)
    words = word_tokenize(text)
    sentences = sent_tokenize(text)
    avg_sentence_length = sum(len(sentence.split()) for sentence in sentences) / len(sentences)
    word_count = len(words)
    complex_word_count = sum(1 for word in words if textstat.syllable_count(word) > 2)
    complex_word_percentage = complex_word_count / word_count * 100
    fog_index = 0.4 * (avg_sentence_length + complex_word_percentage)
    avg_words_per_sentence = word_count / len(sentences)
    syllable_count = sum(textstat.syllable_count(word) for word in words)
    syllable_count_per_word = syllable_count / word_count

    # extract personal pronouns
    personal_pronouns = ['i', 'me', 'my', 'mine', 'we', 'us', 'our', 'ours']
    personal_pronoun_count = sum(1 for word in words if word.lower() in personal_pronouns)

    # calculate average word length
    word_length_list = [len(word) for word in words]
    avg_word_length = sum(word_length_list) / word_count
    
   
    print()
    print("Sentiment Analysis:")
    print(f"Positive Score: {pos_score}")
    print(f"Negative Score: {neg_score}")
    print(f"Polarity Score: {polarity_score}")
    print(f"Subjectivity Score: {subjectivity_score}")
    print()
    print("Readability Analysis:")
    print(f"Average Sentence Length: {avg_sentence_length}")
    print(f"Percentage of Complex words: {perc_complex_words}")
    print(f"Fog Index: {fog_index}")
    print(f"Average Number of Words Per Sentence: {num_words/num_sentences}")
    print(f"Complex Word Count: {num_complex_words}")
    print(f"Word Count: {num_words}")
    print(f"Syllable Count Per Word: {num_syllables/num_words}")
    print(f"Personal Pronouns: {personal_pronoun_count}")
    print(f"Average Word Length: {avg_word_length}")

    # add the results to the dataframe
    filename = os.path.basename(file_path)
    df_results.loc[len(df_results)] = [filename, pos_score, neg_score, polarity_score, subjectivity_score, avg_sentence_length, complex_word_percentage, fog_index, avg_words_per_sentence, complex_word_count, word_count, syllable_count_per_word, personal_pronoun_count, avg_word_length]
    
print("Analysis complete. Output saved to output.xlsx file.")

# write the results to the output file
df_results.to_excel(output_file, index=False)





Sentiment Analysis:
Positive Score: 21
Negative Score: 33
Polarity Score: -0.22222221810699597
Subjectivity Score: 0.07133421390840923

Readability Analysis:
Average Sentence Length: 22.733333333333334
Percentage of Complex words: 0.5779735682819384
Fog Index: 15.121430395913155
Average Number of Words Per Sentence: 31.833333333333332
Complex Word Count: 969
Word Count: 1337
Syllable Count Per Word: 2.418100224382947
Personal Pronouns: 5
Average Word Length: 4.773946360153257

Sentiment Analysis:
Positive Score: 7
Negative Score: 3
Polarity Score: 0.39999996000000404
Subjectivity Score: 0.0510204079029571

Readability Analysis:
Average Sentence Length: 26.428571428571427
Percentage of Complex words: 0.5779735682819384
Fog Index: 15.583476764199657
Average Number of Words Per Sentence: 31.833333333333332
Complex Word Count: 969
Word Count: 1337
Syllable Count Per Word: 2.418100224382947
Personal Pronouns: 5
Average Word Length: 4.703614457831326

Sentiment Analysis:
Positive Score: 31



Sentiment Analysis:
Positive Score: 36
Negative Score: 61
Polarity Score: -0.25773195610585614
Subjectivity Score: 0.13896848117626293

Readability Analysis:
Average Sentence Length: 26.191176470588236
Percentage of Complex words: 0.5779735682819384
Fog Index: 14.445301757066463
Average Number of Words Per Sentence: 31.833333333333332
Complex Word Count: 969
Word Count: 1337
Syllable Count Per Word: 2.418100224382947
Personal Pronouns: 57
Average Word Length: 4.330909090909091

Sentiment Analysis:
Positive Score: 35
Negative Score: 65
Polarity Score: -0.299999997
Subjectivity Score: 0.10964912268678824

Readability Analysis:
Average Sentence Length: 20.933333333333334
Percentage of Complex words: 0.5779735682819384
Fog Index: 13.568138528138528
Average Number of Words Per Sentence: 31.833333333333332
Complex Word Count: 969
Word Count: 1337
Syllable Count Per Word: 2.418100224382947
Personal Pronouns: 16
Average Word Length: 4.547773654916512

Sentiment Analysis:
Positive Score: 40
Ne


Sentiment Analysis:
Positive Score: 8
Negative Score: 25
Polarity Score: -0.5151514995408637
Subjectivity Score: 0.07819905194739561

Readability Analysis:
Average Sentence Length: 20.41025641025641
Percentage of Complex words: 0.5779735682819384
Fog Index: 14.120574499955945
Average Number of Words Per Sentence: 31.833333333333332
Complex Word Count: 969
Word Count: 1337
Syllable Count Per Word: 2.418100224382947
Personal Pronouns: 3
Average Word Length: 4.859106529209622

Sentiment Analysis:
Positive Score: 3
Negative Score: 0
Polarity Score: 0.9999996666667778
Subjectivity Score: 0.023809523620559336

Readability Analysis:
Average Sentence Length: 32.166666666666664
Percentage of Complex words: 0.5779735682819384
Fog Index: 19.25936073059361
Average Number of Words Per Sentence: 31.833333333333332
Complex Word Count: 969
Word Count: 1337
Syllable Count Per Word: 2.418100224382947
Personal Pronouns: 2
Average Word Length: 5.178082191780822

Sentiment Analysis:
Positive Score: 14
Neg


Sentiment Analysis:
Positive Score: 38
Negative Score: 19
Polarity Score: 0.33333332748538025
Subjectivity Score: 0.07894736831170725

Readability Analysis:
Average Sentence Length: 21.666666666666668
Percentage of Complex words: 0.5779735682819384
Fog Index: 15.66931918656057
Average Number of Words Per Sentence: 31.833333333333332
Complex Word Count: 969
Word Count: 1337
Syllable Count Per Word: 2.418100224382947
Personal Pronouns: 13
Average Word Length: 5.061007957559681

Sentiment Analysis:
Positive Score: 30
Negative Score: 24
Polarity Score: 0.11111110905349798
Subjectivity Score: 0.07541899430807403

Readability Analysis:
Average Sentence Length: 16.24418604651163
Percentage of Complex words: 0.5779735682819384
Fog Index: 12.123448891962768
Average Number of Words Per Sentence: 31.833333333333332
Complex Word Count: 969
Word Count: 1337
Syllable Count Per Word: 2.418100224382947
Personal Pronouns: 9
Average Word Length: 4.563816604708798

Sentiment Analysis:
Positive Score: 65


Sentiment Analysis:
Positive Score: 37
Negative Score: 25
Polarity Score: 0.19354838397502608
Subjectivity Score: 0.07898089161913262

Readability Analysis:
Average Sentence Length: 18.850574712643677
Percentage of Complex words: 0.5779735682819384
Fog Index: 12.197327173733708
Average Number of Words Per Sentence: 31.833333333333332
Complex Word Count: 969
Word Count: 1337
Syllable Count Per Word: 2.418100224382947
Personal Pronouns: 35
Average Word Length: 4.440723019670388

Sentiment Analysis:
Positive Score: 27
Negative Score: 7
Polarity Score: 0.5882352768166096
Subjectivity Score: 0.07623318368557583

Readability Analysis:
Average Sentence Length: 15.597014925373134
Percentage of Complex words: 0.5779735682819384
Fog Index: 10.245597141626334
Average Number of Words Per Sentence: 31.833333333333332
Complex Word Count: 969
Word Count: 1337
Syllable Count Per Word: 2.418100224382947
Personal Pronouns: 3
Average Word Length: 4.313242784380305

Sentiment Analysis:
Positive Score: 42


Sentiment Analysis:
Positive Score: 46
Negative Score: 14
Polarity Score: 0.5333333244444446
Subjectivity Score: 0.06688963203245303

Readability Analysis:
Average Sentence Length: 19.291666666666668
Percentage of Complex words: 0.5779735682819384
Fog Index: 12.94308176100629
Average Number of Words Per Sentence: 31.833333333333332
Complex Word Count: 969
Word Count: 1337
Syllable Count Per Word: 2.418100224382947
Personal Pronouns: 8
Average Word Length: 4.545754716981132

Sentiment Analysis:
Positive Score: 31
Negative Score: 30
Polarity Score: 0.016393442354205864
Subjectivity Score: 0.09682539667170573

Readability Analysis:
Average Sentence Length: 24.352941176470587
Percentage of Complex words: 0.5779735682819384
Fog Index: 14.798647734956052
Average Number of Words Per Sentence: 31.833333333333332
Complex Word Count: 969
Word Count: 1337
Syllable Count Per Word: 2.418100224382947
Personal Pronouns: 9
Average Word Length: 4.673132183908046

Sentiment Analysis:
Positive Score: 18