# Data Analysis 

In [26]:
import pandas as pd
import os 
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

In [27]:
result_df = pd.read_excel('extracteddata.xlsx')

In [28]:
# Fill missing values with an empty string
result_df['Title'] = result_df['Title'].fillna('')
result_df['Text'] = result_df['Text'].fillna('')

# Now combine 'Title' and 'Text' into a new column 'Content'
result_df['Content'] = result_df['Title'] + ' ' + result_df['Text']

# Drop the original 'Title' and 'Text' columns
result_df = result_df.drop(columns=['Title', 'Text'])

In [29]:
result_df.head()

Unnamed: 0,URL_ID,Content
0,blackassign0001,Rising IT cities and its impact on the economy...
1,blackassign0002,Rising IT Cities and Their Impact on the Econo...
2,blackassign0003,"Internet Demand’s Evolution, Communication Imp..."
3,blackassign0004,Rise of Cybercrime and its Effect in upcoming ...
4,blackassign0005,OTT platform and its impact on the entertainme...


# Sentimental Analysis

In [30]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Cleaning using custom Stop Words Lists

In [31]:
def load_custom_stopwords(folder_path=r'C:\Users\Dell\TestAssignment\StopWords'):
    stopwords = set()
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            with open(os.path.join(folder_path, filename), 'r') as file:
                stopwords.update(line.strip() for line in file)
    return stopwords

In [32]:
def clean_text_with_custom_stopwords(text, stopwords):
    # Tokenize the text
    words = word_tokenize(text)
    
    # Remove stop words
    cleaned_words = [word for word in words if word.lower() not in stopwords]
    
    # Join the cleaned words back into a single string
    cleaned_text = ' '.join(cleaned_words)
    
    return cleaned_text

In [33]:
# Load the custom stop words
custom_stopwords = load_custom_stopwords()

# Apply the cleaning function to each row in the DataFrame
result_df['Content_Without_Stopwords'] = result_df['Content'].apply(lambda x: clean_text_with_custom_stopwords(x, custom_stopwords))

In [34]:
result_df.head()

Unnamed: 0,URL_ID,Content,Content_Without_Stopwords
0,blackassign0001,Rising IT cities and its impact on the economy...,"Rising cities impact economy , environment , i..."
1,blackassign0002,Rising IT Cities and Their Impact on the Econo...,"Rising Cities Impact Economy , Environment , I..."
2,blackassign0003,"Internet Demand’s Evolution, Communication Imp...","Internet Demand ’ Evolution , Communication Im..."
3,blackassign0004,Rise of Cybercrime and its Effect in upcoming ...,"Rise Cybercrime Effect upcoming Future live , ..."
4,blackassign0005,OTT platform and its impact on the entertainme...,OTT platform impact entertainment industry Fut...


## Creating a dictionary of Positive and Negative words

In [35]:
def load_master_dictionary(positive_file=r"C:\Users\Dell\TestAssignment\positive-words.txt", negative_file=r"C:\Users\Dell\TestAssignment\negative-words.txt"):
    # Function body remains the same

    def load_dictionary(file_path):
        with open(file_path, 'r') as file:
            words = file.read().splitlines()
        return set(words)
    
    positive_words = load_dictionary(positive_file)
    negative_words = load_dictionary(negative_file)
    
    return positive_words, negative_words

In [36]:
def filter_stopwords(positive_words, negative_words):
    # Load the custom stop words
    custom_stopwords = load_custom_stopwords()
    
    # Filter out the custom stop words from the positive and negative words dictionaries
    filtered_positive_words = positive_words - custom_stopwords
    filtered_negative_words = negative_words - custom_stopwords
    
    return filtered_positive_words, filtered_negative_words

In [37]:
# Load the Master Dictionary
positive_words, negative_words = load_master_dictionary()

# Filter out stopwords
filtered_positive_words, filtered_negative_words = filter_stopwords(positive_words, negative_words)

# Now, filtered_positive_words and filtered_negative_words are your dictionaries of positive and negative words, excluding stopwords.

## Extracting Derived variables

In [38]:
def calculate_derived_variables(text, positive_words, negative_words):
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Calculate Positive and Negative Scores
    positive_score = sum(1 for word in tokens if word.lower() in positive_words)
    negative_score = sum(-1 for word in tokens if word.lower() in negative_words)
    
    # Multiply the negative score by -1 to ensure it is a positive number
    negative_score = -1 * negative_score
    
    # Calculate Polarity Score
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    
    # Calculate Subjectivity Score
    subjectivity_score = (positive_score + negative_score) / (len(tokens) + 0.000001)
    
    return positive_score, negative_score, polarity_score, subjectivity_score


In [39]:
# Assuming positive_words and negative_words are already defined and loaded
result_df['POSITIVE SCORE'], result_df['NEGATIVE SCORE'], result_df['POLARITY SCORE'], result_df['SUBJECTIVITY SCORE'] = zip(*result_df['Content_Without_Stopwords'].apply(lambda x: calculate_derived_variables(x, positive_words, negative_words)))

In [40]:
result_df.head()

Unnamed: 0,URL_ID,Content,Content_Without_Stopwords,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE
0,blackassign0001,Rising IT cities and its impact on the economy...,"Rising cities impact economy , environment , i...",33,5,0.736842,0.055474
1,blackassign0002,Rising IT Cities and Their Impact on the Econo...,"Rising Cities Impact Economy , Environment , I...",60,29,0.348315,0.089357
2,blackassign0003,"Internet Demand’s Evolution, Communication Imp...","Internet Demand ’ Evolution , Communication Im...",38,24,0.225806,0.081152
3,blackassign0004,Rise of Cybercrime and its Effect in upcoming ...,"Rise Cybercrime Effect upcoming Future live , ...",37,72,-0.321101,0.144947
4,blackassign0005,OTT platform and its impact on the entertainme...,OTT platform impact entertainment industry Fut...,21,8,0.448276,0.066514


# Analysis of Readability and Complex Word Count

In [41]:
def count_syllables(word):
    count = 0
    vowels = "aeiouy"
    word = word.lower()
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1
    if count == 0:
        count += 1
    return count

def calculate_complex_word_count_and_metrics(text):
    # Tokenize words
    words = word_tokenize(text)
    
    # Count complex words
    complex_word_count = sum(1 for word in words if count_syllables(word) > 2)
    
    # Calculate Average Sentence Length
    sentences = sent_tokenize(text)
    avg_sentence_length = len(words) / len(sentences)
    
    # Calculate Percentage of Complex words
    percentage_of_complex_words = (complex_word_count / len(words)) * 100
    
    # Calculate Fog Index
    fog_index = 0.4 * (avg_sentence_length + percentage_of_complex_words)
    
    return complex_word_count, percentage_of_complex_words, fog_index, avg_sentence_length

In [42]:
result_df['AVE SENTENCE LENGTH'], result_df['PERCENTAGE OF COMPLEX WORDS'], result_df['FOG INDEX'], result_df['COMPLEX WORD COUNT'] = zip(*result_df['Content_Without_Stopwords'].apply(calculate_complex_word_count_and_metrics))

In [43]:
result_df.head()

Unnamed: 0,URL_ID,Content,Content_Without_Stopwords,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVE SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,COMPLEX WORD COUNT
0,blackassign0001,Rising IT cities and its impact on the economy...,"Rising cities impact economy , environment , i...",33,5,0.736842,0.055474,165,24.087591,14.201703,11.416667
1,blackassign0002,Rising IT Cities and Their Impact on the Econo...,"Rising Cities Impact Economy , Environment , I...",60,29,0.348315,0.089357,336,33.73494,19.718976,15.5625
2,blackassign0003,"Internet Demand’s Evolution, Communication Imp...","Internet Demand ’ Evolution , Communication Im...",38,24,0.225806,0.081152,334,43.717277,24.130389,16.608696
3,blackassign0004,Rise of Cybercrime and its Effect in upcoming ...,"Rise Cybercrime Effect upcoming Future live , ...",37,72,-0.321101,0.144947,311,41.356383,24.458343,19.789474
4,blackassign0005,OTT platform and its impact on the entertainme...,OTT platform impact entertainment industry Fut...,21,8,0.448276,0.066514,151,34.633028,20.081782,15.571429


# Average Number of Words Per Sentence

In [44]:
def calculate_average_words_per_sentence(text):
    # Tokenize sentences
    sentences = sent_tokenize(text)
    
    # Tokenize words
    words = word_tokenize(text)
    
    # Calculate Average Number of Words Per Sentence
    avg_words_per_sentence = len(words) / len(sentences)
    
    return avg_words_per_sentence

In [45]:
result_df['AVG NUMBER OF WORDS PER SENTENCE'] = result_df['Content_Without_Stopwords'].apply(calculate_average_words_per_sentence)

In [46]:
result_df.head()

Unnamed: 0,URL_ID,Content,Content_Without_Stopwords,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVE SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,COMPLEX WORD COUNT,AVG NUMBER OF WORDS PER SENTENCE
0,blackassign0001,Rising IT cities and its impact on the economy...,"Rising cities impact economy , environment , i...",33,5,0.736842,0.055474,165,24.087591,14.201703,11.416667,11.416667
1,blackassign0002,Rising IT Cities and Their Impact on the Econo...,"Rising Cities Impact Economy , Environment , I...",60,29,0.348315,0.089357,336,33.73494,19.718976,15.5625,15.5625
2,blackassign0003,"Internet Demand’s Evolution, Communication Imp...","Internet Demand ’ Evolution , Communication Im...",38,24,0.225806,0.081152,334,43.717277,24.130389,16.608696,16.608696
3,blackassign0004,Rise of Cybercrime and its Effect in upcoming ...,"Rise Cybercrime Effect upcoming Future live , ...",37,72,-0.321101,0.144947,311,41.356383,24.458343,19.789474,19.789474
4,blackassign0005,OTT platform and its impact on the entertainme...,OTT platform impact entertainment industry Fut...,21,8,0.448276,0.066514,151,34.633028,20.081782,15.571429,15.571429


# Word Count

In [47]:
import string

def calculate_word_count(text, custom_stopwords):
    # Tokenize words
    words = word_tokenize(text)
    
    # Remove punctuation and convert to lowercase
    words = [''.join(c for c in word if c not in string.punctuation) for word in words]
    words = [word.lower() for word in words]
    
    # Remove custom stopwords
    words = [word for word in words if word not in custom_stopwords]
    
    # Count words
    word_count = len(words)
    
    return word_count

In [48]:
# Load the custom stop words
custom_stopwords = load_custom_stopwords()

# Apply the cleaning function to each row in the DataFrame
result_df['WORD COUNT'] = result_df['Content_Without_Stopwords'].apply(lambda x: calculate_word_count(x, custom_stopwords))

In [49]:
result_df.head()

Unnamed: 0,URL_ID,Content,Content_Without_Stopwords,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVE SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,COMPLEX WORD COUNT,AVG NUMBER OF WORDS PER SENTENCE,WORD COUNT
0,blackassign0001,Rising IT cities and its impact on the economy...,"Rising cities impact economy , environment , i...",33,5,0.736842,0.055474,165,24.087591,14.201703,11.416667,11.416667,684
1,blackassign0002,Rising IT Cities and Their Impact on the Econo...,"Rising Cities Impact Economy , Environment , I...",60,29,0.348315,0.089357,336,33.73494,19.718976,15.5625,15.5625,996
2,blackassign0003,"Internet Demand’s Evolution, Communication Imp...","Internet Demand ’ Evolution , Communication Im...",38,24,0.225806,0.081152,334,43.717277,24.130389,16.608696,16.608696,764
3,blackassign0004,Rise of Cybercrime and its Effect in upcoming ...,"Rise Cybercrime Effect upcoming Future live , ...",37,72,-0.321101,0.144947,311,41.356383,24.458343,19.789474,19.789474,752
4,blackassign0005,OTT platform and its impact on the entertainme...,OTT platform impact entertainment industry Fut...,21,8,0.448276,0.066514,151,34.633028,20.081782,15.571429,15.571429,436


# Syllable Count Per Word

In [50]:
def count_syllables(word):
    count = 0
    vowels = "aeiouy"
    word = word.lower()
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith(("es", "ed")):
        count -= 1
    if count == 0:
        count += 1
    return count

def calculate_syllable_count_per_word(text):
    # Tokenize words
    words = word_tokenize(text)
    
    # Calculate syllable count for each word
    syllable_counts = [count_syllables(word) for word in words]
    
    return syllable_counts

In [51]:
result_df['SYLLABLE PER WORD'] = result_df['Content_Without_Stopwords'].apply(calculate_syllable_count_per_word)

In [52]:
result_df.head()

Unnamed: 0,URL_ID,Content,Content_Without_Stopwords,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVE SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,COMPLEX WORD COUNT,AVG NUMBER OF WORDS PER SENTENCE,WORD COUNT,SYLLABLE PER WORD
0,blackassign0001,Rising IT cities and its impact on the economy...,"Rising cities impact economy , environment , i...",33,5,0.736842,0.055474,165,24.087591,14.201703,11.416667,11.416667,684,"[2, 1, 2, 4, 1, 4, 1, 5, 1, 2, 2, 1, 1, 1, 2, ..."
1,blackassign0002,Rising IT Cities and Their Impact on the Econo...,"Rising Cities Impact Economy , Environment , I...",60,29,0.348315,0.089357,336,33.73494,19.718976,15.5625,15.5625,996,"[2, 1, 2, 4, 1, 4, 1, 5, 1, 2, 2, 3, 3, 1, 3, ..."
2,blackassign0003,"Internet Demand’s Evolution, Communication Imp...","Internet Demand ’ Evolution , Communication Im...",38,24,0.225806,0.081152,334,43.717277,24.130389,16.608696,16.608696,764,"[3, 2, 1, 4, 1, 5, 2, 1, 1, 1, 5, 2, 5, 1, 2, ..."
3,blackassign0004,Rise of Cybercrime and its Effect in upcoming ...,"Rise Cybercrime Effect upcoming Future live , ...",37,72,-0.321101,0.144947,311,41.356383,24.458343,19.789474,19.789474,752,"[2, 4, 2, 3, 3, 2, 1, 1, 1, 5, 5, 1, 2, 2, 4, ..."
4,blackassign0005,OTT platform and its impact on the entertainme...,OTT platform impact entertainment industry Fut...,21,8,0.448276,0.066514,151,34.633028,20.081782,15.571429,15.571429,436,"[1, 2, 2, 4, 3, 3, 1, 1, 1, 1, 2, 2, 4, 1, 4, ..."


# Personal Pronouns

In [53]:
import re

def calculate_personal_pronouns(text):
    # Define the regex pattern for personal pronouns
    pattern = r'\b(I|we|my|ours|us)\b'
    
    # Ensure "US" is not matched
    pattern = r"(?<!\bUS\b)" + pattern
    
    # Find all matches in the text
    matches = re.findall(pattern, text, re.IGNORECASE)
    
    count = len(matches)
    
    return count

In [54]:
result_df['PERSONAL PRONOUNS'] = result_df['Content_Without_Stopwords'].apply(calculate_personal_pronouns)

In [55]:
result_df.head()

Unnamed: 0,URL_ID,Content,Content_Without_Stopwords,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVE SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,COMPLEX WORD COUNT,AVG NUMBER OF WORDS PER SENTENCE,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS
0,blackassign0001,Rising IT cities and its impact on the economy...,"Rising cities impact economy , environment , i...",33,5,0.736842,0.055474,165,24.087591,14.201703,11.416667,11.416667,684,"[2, 1, 2, 4, 1, 4, 1, 5, 1, 2, 2, 1, 1, 1, 2, ...",1
1,blackassign0002,Rising IT Cities and Their Impact on the Econo...,"Rising Cities Impact Economy , Environment , I...",60,29,0.348315,0.089357,336,33.73494,19.718976,15.5625,15.5625,996,"[2, 1, 2, 4, 1, 4, 1, 5, 1, 2, 2, 3, 3, 1, 3, ...",0
2,blackassign0003,"Internet Demand’s Evolution, Communication Imp...","Internet Demand ’ Evolution , Communication Im...",38,24,0.225806,0.081152,334,43.717277,24.130389,16.608696,16.608696,764,"[3, 2, 1, 4, 1, 5, 2, 1, 1, 1, 5, 2, 5, 1, 2, ...",0
3,blackassign0004,Rise of Cybercrime and its Effect in upcoming ...,"Rise Cybercrime Effect upcoming Future live , ...",37,72,-0.321101,0.144947,311,41.356383,24.458343,19.789474,19.789474,752,"[2, 4, 2, 3, 3, 2, 1, 1, 1, 5, 5, 1, 2, 2, 4, ...",0
4,blackassign0005,OTT platform and its impact on the entertainme...,OTT platform impact entertainment industry Fut...,21,8,0.448276,0.066514,151,34.633028,20.081782,15.571429,15.571429,436,"[1, 2, 2, 4, 3, 3, 1, 1, 1, 1, 2, 2, 4, 1, 4, ...",1


In [56]:
def calculate_average_word_length(text):
    # Tokenize words
    words = word_tokenize(text)
    
    # Calculate the sum of the total number of characters in each word
    total_characters = sum(len(word) for word in words)
    
    # Calculate the total number of words
    total_words = len(words)
    
    # Calculate the average word length
    average_word_length = total_characters / total_words if total_words > 0 else 0
    
    return average_word_length

In [57]:
result_df['AVG WORD LENGTH'] = result_df['Content_Without_Stopwords'].apply(calculate_average_word_length)

In [58]:
result_df.head()

Unnamed: 0,URL_ID,Content,Content_Without_Stopwords,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVE SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,COMPLEX WORD COUNT,AVG NUMBER OF WORDS PER SENTENCE,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,blackassign0001,Rising IT cities and its impact on the economy...,"Rising cities impact economy , environment , i...",33,5,0.736842,0.055474,165,24.087591,14.201703,11.416667,11.416667,684,"[2, 1, 2, 4, 1, 4, 1, 5, 1, 2, 2, 1, 1, 1, 2, ...",1,5.732847
1,blackassign0002,Rising IT Cities and Their Impact on the Econo...,"Rising Cities Impact Economy , Environment , I...",60,29,0.348315,0.089357,336,33.73494,19.718976,15.5625,15.5625,996,"[2, 1, 2, 4, 1, 4, 1, 5, 1, 2, 2, 3, 3, 1, 3, ...",0,6.233936
2,blackassign0003,"Internet Demand’s Evolution, Communication Imp...","Internet Demand ’ Evolution , Communication Im...",38,24,0.225806,0.081152,334,43.717277,24.130389,16.608696,16.608696,764,"[3, 2, 1, 4, 1, 5, 2, 1, 1, 1, 5, 2, 5, 1, 2, ...",0,7.085079
3,blackassign0004,Rise of Cybercrime and its Effect in upcoming ...,"Rise Cybercrime Effect upcoming Future live , ...",37,72,-0.321101,0.144947,311,41.356383,24.458343,19.789474,19.789474,752,"[2, 4, 2, 3, 3, 2, 1, 1, 1, 5, 5, 1, 2, 2, 4, ...",0,6.844415
4,blackassign0005,OTT platform and its impact on the entertainme...,OTT platform impact entertainment industry Fut...,21,8,0.448276,0.066514,151,34.633028,20.081782,15.571429,15.571429,436,"[1, 2, 2, 4, 3, 3, 1, 1, 1, 1, 2, 2, 4, 1, 4, ...",1,6.715596


In [59]:
import pandas as pd

# Assuming result_df is your DataFrame
result_df = result_df.drop(columns=['Content', 'Content_Without_Stopwords'])
result_df = result_df.reset_index(drop=True)

In [60]:
# Load the Excel file
input_df = pd.read_excel('Input.xlsx')

result_df['URL'] = input_df.iloc[:, 1]

In [61]:
url_column = result_df.pop('URL')
result_df.insert(1, 'URL', url_column)

In [62]:
cwc_column = result_df.pop('COMPLEX WORD COUNT')
result_df.insert(10, 'COMPLEX WORD COUNT', cwc_column)

In [63]:
result_df.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVE SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,33,5,0.736842,0.055474,165,24.087591,14.201703,11.416667,11.416667,684,"[2, 1, 2, 4, 1, 4, 1, 5, 1, 2, 2, 1, 1, 1, 2, ...",1,5.732847
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,60,29,0.348315,0.089357,336,33.73494,19.718976,15.5625,15.5625,996,"[2, 1, 2, 4, 1, 4, 1, 5, 1, 2, 2, 3, 3, 1, 3, ...",0,6.233936
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,38,24,0.225806,0.081152,334,43.717277,24.130389,16.608696,16.608696,764,"[3, 2, 1, 4, 1, 5, 2, 1, 1, 1, 5, 2, 5, 1, 2, ...",0,7.085079
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,37,72,-0.321101,0.144947,311,41.356383,24.458343,19.789474,19.789474,752,"[2, 4, 2, 3, 3, 2, 1, 1, 1, 5, 5, 1, 2, 2, 4, ...",0,6.844415
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,21,8,0.448276,0.066514,151,34.633028,20.081782,15.571429,15.571429,436,"[1, 2, 2, 4, 3, 3, 1, 1, 1, 1, 2, 2, 4, 1, 4, ...",1,6.715596


In [64]:
result_df.to_excel('OutputData.xlsx', index=False)