In [1]:
import tensorflow as tf
# Report how many GPUs TensorFlow can see; 0 means everything runs on the CPU.
# Uses the stable tf.config API rather than the deprecated `experimental` alias.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))


Num GPUs Available:  0


In [None]:
import pandas as pd
# Load the input sheet; later cells read its URL and URL_ID columns.
data = pd.read_csv('input.csv')
# Preview the first rows to confirm the file parsed as expected.
data.head()

In [None]:
import requests
from bs4 import BeautifulSoup

def url_to_transcript(url, timeout=30):
    '''Download an article page and return its title and body text.

    Parameters
    ----------
    url : str
        Address of the article page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    str or None
        The stripped text of the first <h1> (when present) followed by a blank
        line, then the stripped text of every <p> inside the
        div with class "td-post-content", one paragraph per line.
        None when the page cannot be fetched or that div is missing; in both
        cases a message is printed.
    '''
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
        # Treat HTTP error pages (404, 500, ...) as failures instead of
        # scraping whatever the error page happens to contain.
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Error: could not fetch {url}: {exc}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    parent_element = soup.find('div', class_='td-post-content')
    if not parent_element:
        print("Error: Parent element containing main content not found.")
        return None

    # Find the article title
    title_tag = soup.find('h1')
    title = title_tag.text.strip() if title_tag else None

    lines = []
    if title:
        # The extra newline leaves a blank line between title and body.
        lines.append(title + '\n')
    lines.extend(p.text.strip() for p in parent_element.find_all('p'))

    # Every line, including the last one, is newline-terminated.
    return ''.join(line + '\n' for line in lines)


In [None]:
# Scrape every URL, keeping exactly one entry per row of `data` (None when the
# scrape failed). Dropping failed rows here would shift every later transcript
# onto the wrong URL_ID when this list is zipped against data.URL_ID for saving;
# the saving loop already skips falsy entries.
transcripts = [url_to_transcript(u) for u in data.URL]

In [None]:
import os
import pickle

# Create the output folder from Python so the cell is portable across shells
# and can be re-run without a "directory already exists" error.
os.makedirs("transcripts", exist_ok=True)

# zip pairs entries positionally, so `transcripts` must hold one entry per row
# of `data`, in the same order, for each file to be saved under the right URL_ID.
for url_id, transcript in zip(data.URL_ID, transcripts):
    if transcript:
        # NOTE: the payload is a pickle even though the file is named .txt;
        # it has to be read back with pickle.load, not as plain text.
        with open(f"transcripts/{url_id}.txt", "wb") as file:
            pickle.dump(transcript, file)


## Cleaning

In [None]:
import pickle

# Map each URL_ID to the transcript unpickled from its file on disk.
loaded_transcripts = {}

for url_id in data.URL_ID:
    file_path = f"transcripts/{url_id}.txt"
    try:
        with open(file_path, "rb") as file:
            loaded_transcripts[url_id] = pickle.load(file)
    except FileNotFoundError:
        # IDs with no file on disk are reported and left out of the dict.
        print(f"File {file_path} not found.")

# Now loaded_transcripts dictionary will contain URL IDs as keys and loaded transcripts as values


In [None]:
import pandas as pd
# One row per loaded transcript, ordered by URL_ID with a fresh 0..n-1 index.
transcript_rows = list(loaded_transcripts.items())
data_df = (
    pd.DataFrame(transcript_rows, columns=['URL_ID', 'transcript'])
    .sort_values(by='URL_ID')
    .reset_index(drop=True)
)
data_df.head()


In [None]:
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.

    The steps run in that order. A bracketed span is only removed when both
    brackets are on the same line ('.' does not match a newline); otherwise
    just the bracket characters go, as part of punctuation removal.
    Whitespace is left untouched, so removed words leave their surrounding
    spaces behind.
    '''
    text = text.lower()
    # Raw strings keep the regex escapes (\[, \w, \d) from being read as
    # invalid Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Alias used by the cell that applies this cleaning step to the DataFrame.
round1 = clean_text_round1

In [None]:
# Overwrite the transcript column with its round-1 cleaned text, then preview it.
data_df['transcript'] = data_df['transcript'].apply(round1)
data_df.head()

In [None]:
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.

    Removes curly quotes and the ellipsis character, and turns every newline
    into a single space.
    '''
    text = re.sub('[‘’“”…]', '', text)
    # Replace (rather than delete) newlines: deleting them fuses the last word
    # of one line with the first word of the next into a single bogus token.
    text = text.replace('\n', ' ')
    return text

# Alias used by the cell that applies this cleaning step to the DataFrame.
round2 = clean_text_round2

In [None]:
# Overwrite the transcript column with its round-2 cleaned text, then preview it.
data_df['transcript'] = data_df['transcript'].apply(round2)
data_df.head()

In [None]:
# Save the cleaned transcript DataFrame to disk as a pickle. This snapshot is
# taken before stop-word removal, which happens in a later cell.
data_df.to_pickle("corpus.pkl")

## StopWords

In [None]:
import os

stopwords_directory = "StopWords"

# Gather every line of every stop-word file into one de-duplicated collection.
stopwords = set()

for filename in os.listdir(stopwords_directory):
    filepath = os.path.join(stopwords_directory, filename)
    # os.listdir also returns sub-directories (for example a notebook
    # checkpoint folder); trying to open one of those would raise.
    if not os.path.isfile(filepath):
        continue
    with open(filepath, "r", encoding="ISO-8859-1") as file:
        stopwords.update(file.read().splitlines())

# Kept as a list, as before, for any code that relies on list behaviour.
stopwords = list(stopwords)
len(stopwords)

In [None]:
def remove_stopwords(transcript_text, stopword_list=None):
    '''Return the text with every stop word removed.

    Parameters
    ----------
    transcript_text : str
        Text to filter; it is split on whitespace.
    stopword_list : iterable of str, optional
        Words to drop. Defaults to the notebook-level `stopwords` collection.

    Returns
    -------
    str
        The remaining words joined by single spaces, in their original order
        and original casing.
    '''
    if stopword_list is None:
        stopword_list = stopwords

    # Each word is lowercased before the lookup, so the entries must be
    # lowercased too or any entry containing a capital could never match.
    # A set also makes each membership test constant-time instead of a scan.
    lookup = {entry.lower() for entry in stopword_list}

    kept_words = [word for word in transcript_text.split() if word.lower() not in lookup]
    return ' '.join(kept_words)

# Replace each transcript in the DataFrame with its stop-word-free version.
# This overwrites the column in place, so the text from the previous cleaning
# rounds is no longer available in data_df afterwards.
data_df['transcript'] = data_df['transcript'].apply(remove_stopwords)


In [None]:
# Preview the first rows after stop-word removal.
data_df.head()

## Text analysis (sentiment and readability metrics)

In [None]:
import os
import pandas as pd
import nltk
from textblob import TextBlob
from nltk.corpus import cmudict

# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('cmudict')


def _load_word_set(path):
    '''Return the lines of the file at `path` as a set, closing the file afterwards.'''
    with open(path, encoding='ISO-8859-1') as handle:
        return set(handle.read().splitlines())


# Load positive and negative word lists
positive_words = _load_word_set('MasterDictionary/positive-words.txt')
negative_words = _load_word_set('MasterDictionary/negative-words.txt')


# Create an empty DataFrame to store the output data; the column order matches
# URL_ID followed by the tuple returned by compute_text_variables.
output_data = pd.DataFrame(columns=["URL_ID","POSITIVE SCORE", "NEGATIVE SCORE", "POLARITY SCORE",
                                     "SUBJECTIVITY SCORE", "AVG SENTENCE LENGTH", "PERCENTAGE OF COMPLEX WORDS",
                                     "FOG INDEX", "AVG NUMBER OF WORDS PER SENTENCE", "COMPLEX WORD COUNT",
                                     "WORD COUNT", "SYLLABLE PER WORD", "PERSONAL PRONOUNS", "AVG WORD LENGTH"])

# CMU pronunciation table, loaded the first time syllable_count needs it.
_CMU_PRONUNCIATIONS = None


# Function to calculate syllables in a word
def syllable_count(word):
    '''Return the number of syllables in `word`.

    Words found in the CMU pronouncing dictionary (looked up in lowercase)
    get the number of phonemes ending in a digit in their first listed
    pronunciation, as an int. Any other word gets the approximation
    max(1, len(word) / 3), which may be a float.
    '''
    global _CMU_PRONUNCIATIONS
    if _CMU_PRONUNCIATIONS is None:
        # cmudict.dict() builds the whole table; doing that on every call
        # (this function runs for every token) makes the analysis very slow,
        # so build it once and reuse it.
        _CMU_PRONUNCIATIONS = cmudict.dict()

    pronunciations = _CMU_PRONUNCIATIONS.get(word.lower())
    if not pronunciations:
        # If word not found in CMU dictionary, approximate syllable count based on length
        return max(1, len(word) / 3)

    # Only the first pronunciation is used when several are listed.
    return sum(1 for phoneme in pronunciations[0] if phoneme[-1].isdigit())

# Function to perform textual analysis and compute variables
def compute_text_variables(article_text):
    '''Compute sentiment and readability metrics for one article.

    Parameters
    ----------
    article_text : str
        Text to analyse.

    Returns
    -------
    tuple
        13 values in this order: positive score, negative score, polarity
        score, subjectivity score, average sentence length, percentage of
        complex words, fog index, average words per sentence, complex word
        count, word count, average syllables per word, personal pronoun
        count, average word length. Text that yields no tokens returns zeros
        for every value instead of raising ZeroDivisionError.

    Notes
    -----
    NOTE(review): the notebook strips punctuation and stop words before
    calling this, so sentence splitting and the pronoun count probably see
    very little signal - confirm that this is the intended input.
    NOTE(review): WP and WP$ are wh-pronoun tags (who, whose) in the Penn
    Treebank tag set - confirm they should count as personal pronouns.
    '''
    # Tokenize text into words and sentences
    words = nltk.word_tokenize(article_text)
    sentences = nltk.sent_tokenize(article_text)
    total_words = len(words)
    sentence_count = len(sentences)

    # Nothing to measure: every ratio below would divide by zero.
    if total_words == 0 or sentence_count == 0:
        return (0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0.0, 0, 0.0)

    # Compute positive and negative scores
    positive_score = sum(1 for word in words if word in positive_words)
    negative_score = sum(1 for word in words if word in negative_words)

    # The small constant keeps both ratios defined when no sentiment words occur.
    denominator = (positive_score + negative_score) + 0.000001
    polarity_score = (positive_score - negative_score) / denominator
    subjectivity_score = (positive_score + negative_score) / (total_words + 0.000001)

    # Look each word's syllable count up once and reuse it for both metrics.
    syllables_per_word = [syllable_count(word) for word in words]

    avg_sentence_length = sum(len(nltk.word_tokenize(sentence)) for sentence in sentences) / sentence_count
    # A word counts as complex when it has more than two syllables.
    complex_word_count = sum(1 for count in syllables_per_word if count > 2)
    percentage_complex_words = (complex_word_count / total_words) * 100
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    avg_words_per_sentence = total_words / sentence_count
    avg_syllables_per_word = sum(syllables_per_word) / total_words
    personal_pronouns = sum(1 for word, pos in nltk.pos_tag(words) if pos in ['PRP', 'PRP$', 'WP', 'WP$'])
    avg_word_length = sum(len(word) for word in words) / total_words

    # Return computed variables
    return (positive_score, negative_score, polarity_score, subjectivity_score, avg_sentence_length,
            percentage_complex_words, fog_index, avg_words_per_sentence, complex_word_count, total_words,
            avg_syllables_per_word, personal_pronouns, avg_word_length)


In [None]:
# Compute the metrics for every transcript. Rows are collected in a plain list
# and the frame is built once at the end, which avoids growing a DataFrame one
# row at a time.
metric_rows = []
for index, row in data_df.iterrows():
    # URL_ID first, then the 13 metrics, matching the output column order.
    text_variables = [row['URL_ID'], *compute_text_variables(row['transcript'])]
    metric_rows.append(text_variables)

    # Print the text variables for debugging or monitoring
    print(text_variables)

# Reuse the column names declared for output_data and keep data_df's index.
output_data = pd.DataFrame(metric_rows, columns=output_data.columns, index=data_df.index)


In [None]:
# Display the computed metrics table (one row per analysed transcript).
output_data

In [None]:
# Attach the metrics to the input sheet by URL_ID. pd.merge defaults to an
# inner join, so input rows with no computed metrics are dropped from the result.
final_output_data = pd.merge(data, output_data, on="URL_ID")

In [None]:
# Write the merged table to an Excel workbook, without the DataFrame index column.
final_output_data.to_excel("Output_Data_Structure.xlsx", index=False)