In [106]:
import PyPDF2
import pandas as pd
from textblob import TextBlob
import plotly.graph_objects as go
# suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [107]:
# create plotly gauge chart for sentiment
def sentiment_gauge(sentiment_polarity, file):
    """Display a Plotly gauge for a single sentiment polarity value.

    Args:
        sentiment_polarity: Number shown on the gauge and used as the
            position of the red threshold marker. The gauge axis spans
            -1 to 1.
        file: Label interpolated into the chart title.

    Returns:
        None. The figure is rendered via ``fig.show()``.
    """
    # Background bands: red below -0.5, grey in the middle, green above 0.5.
    colour_bands = [
        {'range': [-1, -0.5], 'color': "red"},
        {'range': [-0.5, 0.5], 'color': "lightgrey"},
        {'range': [0.5, 1], 'color': "lightgreen"},
    ]

    # Red marker line drawn at the polarity value itself.
    marker = {
        'line': {'color': "red", 'width': 4},
        'thickness': 0.75,
        'value': sentiment_polarity,
    }

    gauge_spec = {
        'axis': {'range': [-1, 1]},
        'steps': colour_bands,
        'threshold': marker,
    }

    indicator = go.Indicator(
        mode="gauge+number",
        value=sentiment_polarity,
        domain={'x': [0, 1], 'y': [0, 1]},
        title={'text': f"Sentiment Polarity of {file}"},
        gauge=gauge_spec,
    )

    fig = go.Figure(indicator)
    fig.show()



In [108]:

def sentimentPDF(filePath: str):
    """Score the sentiment of a PDF's text and show it on a gauge.

    The text of every page is extracted with PyPDF2, concatenated, and
    analysed with TextBlob. Polarity and subjectivity are printed, and the
    polarity is plotted with ``sentiment_gauge``.

    Args:
        filePath: Path to the PDF file. The part after the last '/' is used
            as the label in the gauge title.

    Returns:
        The TextBlob sentiment polarity of the whole document.

    Raises:
        OSError: If the file cannot be opened (propagated from ``open``).
    """
    # e.g. ECSO_CFS_Ireland_2021  Construction Sector Performance and Capacity
    fileName = filePath.split('/')[-1]

    # The reader pulls data from the open stream, so all extraction has to
    # happen inside the `with` block.
    with open(filePath, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Extract each page exactly once (extraction is the expensive step).
        # `or ""` keeps the join safe should a page yield no text object.
        page_texts = [page.extract_text() or "" for page in reader.pages]

    # No per-line DataFrame is built here: the earlier one was never returned
    # or read, and it relied on DataFrame.append, which pandas 2.0 removed.

    # Pages are joined without a separator, matching the previous
    # concatenation, so sentiment scores are unchanged.
    extracted_text = "".join(page_texts)

    # Polarity is in [-1, 1] and subjectivity in [0, 1]; read the sentiment
    # once and reuse both fields.
    sentiment = TextBlob(extracted_text).sentiment
    sentiment_polarity = sentiment.polarity
    sentiment_subjectivity = sentiment.subjectivity

    print("Sentiment Polarity:", sentiment_polarity)
    print("Sentiment Subjectivity:", sentiment_subjectivity)
    sentiment_gauge(sentiment_polarity, fileName)
    return sentiment_polarity


In [109]:
def sentimentPDFdata(files: list):
    """Run ``sentimentPDF`` on each path and tabulate the polarities.

    Args:
        files: PDF paths to analyse, processed in order. Duplicated paths are
            analysed once per occurrence.

    Returns:
        A DataFrame with one row per input path and two columns: 'file' (the
        part of the path after the last '/') and 'sentiment' (the polarity
        returned by ``sentimentPDF``). An empty input gives an empty frame
        with the same two columns.
    """
    # Collect plain records and build the frame once. DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row copies it each time.
    records = [
        {'file': file.split('/')[-1], 'sentiment': sentimentPDF(file)}
        for file in files
    ]
    # Explicit columns keep the schema stable even when `files` is empty.
    return pd.DataFrame(records, columns=['file', 'sentiment'])

In [111]:
# PDFs to score; the result table has one row per entry, in this order.
# NOTE(review): 'Construction Sector Performance and Capacity.pdf' is listed
# twice, so it is scored and plotted twice - confirm this is intended.
files = [
    'Data/sentiment/scsi-pwc-construction-market-monitor-2022.pdf',
    'Data/sentiment/Construction Sector Performance and Capacity.pdf',
    'Data/sentiment/ECSO_CFS_Ireland_2021.pdf',
    'Data/sentiment/Construction Sector Performance and Capacity.pdf',
]

sentimentTable = sentimentPDFdata(files)
sentimentTable.head()

Sentiment Polarity: 0.08525243699555626
Sentiment Subjectivity: 0.39143696687274665


Sentiment Polarity: 0.05488117535959469
Sentiment Subjectivity: 0.38416030747368135


Sentiment Polarity: 0.04281541141734401
Sentiment Subjectivity: 0.30557841524643214


Sentiment Polarity: 0.05488117535959469
Sentiment Subjectivity: 0.38416030747368135


Unnamed: 0,file,sentiment
0,scsi-pwc-construction-market-monitor-2022.pdf,0.085252
1,Construction Sector Performance and Capacity.pdf,0.054881
2,ECSO_CFS_Ireland_2021.pdf,0.042815
3,Construction Sector Performance and Capacity.pdf,0.054881


In [None]:
# Build the line-level frame explicitly: no visible cell assigns `df` at
# notebook scope, so the cleaning cells below would fail on a fresh kernel.
# NOTE(review): the source PDF is inferred from the .xlsx name this notebook
# writes after cleaning - confirm it is the intended document.
LINES_PDF_PATH = 'Data/sentiment/Construction Sector Performance and Capacity.pdf'

with open(LINES_PDF_PATH, 'rb') as pdf_file:
    # One row per text line, pages in document order; a page with no text
    # contributes a single empty line.
    df = pd.DataFrame({
        'line': [
            line
            for page in PyPDF2.PdfReader(pdf_file).pages
            for line in (page.extract_text() or '').split('\n')
        ]
    })

# Column dtypes, non-null counts and memory usage of the extracted lines.
df.info()


In [None]:
# Summary statistics of `df`, shown as the cell output.
# NOTE(review): `df` is expected to have a 'line' column (later cells index
# df['line']); it is not assigned at notebook scope in any visible cell.
df.describe()


In [None]:
# Export the repeated rows (every occurrence after the first) for inspection.
repeated_rows = df.duplicated()
df.loc[repeated_rows].to_excel('Data/sentiment/duplicates.xlsx')

# Then keep only the first occurrence of each row; `df` is modified in place.
df.drop_duplicates(inplace=True)


In [None]:
# Preview the first rows of `df` after the duplicate rows were dropped.
df.head()


In [None]:
# Count the rows whose 'line' text consists solely of whitespace characters.
# Note: completely empty strings are not matched by `str.isspace()`.
whitespace_only = df['line'].str.isspace()
df.loc[whitespace_only].count()


In [None]:
# Remove, in place, every row whose 'line' text is whitespace only,
# then preview what is left.
whitespace_rows = df.loc[df['line'].str.isspace()]
df.drop(index=whitespace_rows.index, inplace=True)
df.head()


In [None]:
# Count the rows whose 'line' text is made up entirely of numeric characters.
numeric_only = df['line'].str.isnumeric()
df.loc[numeric_only].count()


In [None]:
# Remove, in place, every row whose 'line' text is purely numeric.
numeric_rows = df.loc[df['line'].str.isnumeric()]
df.drop(index=numeric_rows.index, inplace=True)


In [None]:
# Write the current contents of `df` (including its index) to an Excel workbook.
df.to_excel('Data/sentiment/Construction Sector Performance and Capacity.xlsx')


In [None]:
# Standalone Plotly gauge for a single sentiment polarity value.
# NOTE(review): `sentiment_polarity` is not assigned at notebook scope in any
# visible cell (it is a local inside sentimentPDF), so this cell depends on
# leftover kernel state. It also repeats the body of sentiment_gauge - confirm
# whether the cell is still needed.
import plotly.graph_objects as go

# Background bands, red threshold marker at the polarity value, axis from -1 to 1.
gauge_spec = {
    'axis': {'range': [-1, 1]},
    'steps': [
        {'range': [-1, -0.5], 'color': "red"},
        {'range': [-0.5, 0.5], 'color': "lightgrey"},
        {'range': [0.5, 1], 'color': "lightgreen"},
    ],
    'threshold': {
        'line': {'color': "red", 'width': 4},
        'thickness': 0.75,
        'value': sentiment_polarity,
    },
}

fig = go.Figure(
    go.Indicator(
        mode="gauge+number",
        value=sentiment_polarity,
        domain={'x': [0, 1], 'y': [0, 1]},
        title={'text': "Sentiment Polarity"},
        gauge=gauge_spec,
    )
)

fig.show()
