 1. **Imports** https://www.kaggle.com/code/jpandeinge/nlp-analysis-of-pdf-documents/notebook


In [1]:
#!pip install PyPDF2
#!pip install textract
#!pip install autocorrect
#!pip install vaderSentiment
# pip install wordcloud

In [2]:
import PyPDF2
import warnings
import textract
from autocorrect import Speller
from nltk.tokenize import word_tokenize
import nltk
import re
import string
from nltk.corpus import stopwords, brown
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from autocorrect import spell
import pandas as pd
import plotly.graph_objects as go
from collections import defaultdict
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import matplotlib.pyplot as plt
import seaborn as sn
from collections import Counter
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [3]:
def to_lower(text):
    
    # Specll check the words
    spell = Speller(lang='en')

    texts = spell(text)

    return ' '.join([w.lower() for w in word_tokenize(text)])

In [4]:
def clean_text(lower_case):
    # split text phrases into words
    words = nltk.word_tokenize(lower_case)

    # Create a list of all the punctuations we wish to remove
    punctuations = ['.', ',', '/', '!', '?', ';',
                    ':', '(', ')', '[', ']', '-', '_', '%']

    # Remove all the special characters
    punctuations = re.sub(r'\W', ' ', str(lower_case))

    # Initialize the stopwords variable, which is a list of words ('and', 'the', 'i', 'yourself', 'is') that do not hold much values as key words
    stop_words = stopwords.words('english')

    # Getting rid of all the words that contain numbers in them
    w_num = re.sub('\w*\d\w*', '', lower_case).strip()

    # remove all single characters
    lower_case = re.sub(r'\s+[a-zA-Z]\s+', ' ', lower_case)

    # Substituting multiple spaces with single space
    lower_case = re.sub(r'\s+', ' ', lower_case, flags=re.I)

    # Removing prefixed 'b'
    lower_case = re.sub(r'^b\s+', '', lower_case)

    # Removing non-english characters
    lower_case = re.sub(r'^b\s+', '', lower_case)

    # Return keywords which are not in stop words
    keywords = [
        word for word in words if not word in stop_words and word in punctuations and word in w_num]

    return keywords


In [5]:
from wordcloud import WordCloud, STOPWORDS
import imageio
import matplotlib.pyplot as plt
import nltk
def wordImage(words):
    # Image used in which our world cloud output will be
    img1 = imageio.imread(
        r"C:\Users\sclifford\OneDrive - Gallarus Industry Solutions\MscDataAnalytics\MScDataAnalyticsCA2\Data\sentiment\itula.jpeg")
    hcmask1 = img1

    # Get 100 words based on the
    words_except_stop_dist = nltk.FreqDist(w for w in words[:100])
    wordcloud = WordCloud(stopwords=set(STOPWORDS), background_color='black',
                        mask=hcmask1, max_font_size=20).generate(" ".join(words_except_stop_dist))
    plt.imshow(wordcloud, interpolation='bilinear')
    fig = plt.gcf()
    fig.set_size_inches(10, 12)
    plt.axis('off')
    plt.title("Top most common 100 words ", fontsize=20)
    plt.tight_layout(pad=0)
    plt.savefig('Manifesto_top_100.jpeg')


In [6]:


def sentimentPDF(filePath: str):
    open_filename = open(filePath, 'rb')

    ind_manifesto = PyPDF2.PdfReader(open_filename)
    total_pages = len(ind_manifesto.pages)
    #total_pages
    count = 0
    text = ''

    # Lets loop through, to read each page from the pdf file
    while (count < total_pages):
        # Get the specified number of pages in the document
        mani_page = ind_manifesto.pages[count]
        # Process the next page
        count += 1
        # Extract the text from the page
        text += mani_page.extract_text()
        
    if text != '':
        text = text

    else:
        textract.process(open_filename, method='tesseract',
                        encoding='utf-8', langauge='eng')
            
    lower_case = to_lower(text)
    print(lower_case)  
    # Lemmatize the words
    wordnet_lemmatizer = WordNetLemmatizer()

    lemmatized_word = [wordnet_lemmatizer.lemmatize(
        word) for word in clean_text(lower_case)]

    # lets print out the output from our function above and see how the data looks like
    clean_data = ' '.join(lemmatized_word)
    print(clean_data)   
    df = pd.DataFrame([clean_data])
    df.columns = ['script']
    df.index = ['index']
    return df



In [7]:
# create plotly gauge chart for sentiment

def sentiment_gauge(blob, file):
    fig = go.Figure(go.Indicator(
        mode="gauge+number",
        value=blob.sentiment.polarity,
        domain={'x': [0, 1], 'y': [0, 1]},
        title={'text': f"Sentiment Polarity of {file}"},
        gauge={'axis': {'range': [-1, 1]},
            'steps': [
            {'range': [-1, -0.5], 'color': "red"},
            {'range': [-0.5, 0.5], 'color': "lightgrey"},
            {'range': [0.5, 1], 'color': "lightgreen"}],
            'threshold': {'line': {'color': "black", 'width': 4}, 'thickness': 0.75, 'value': blob.sentiment.polarity}}))

    fig.show()


In [8]:
import plotly.graph_objects as go

def sentimentBARChart(sentimentTable: pd.DataFrame):
    # Extract x-axis (categories) and y-axis (scores) data
    categories = sentimentTable['file']
    scores = sentimentTable['sentiment']

    # Define color array based on sentiment scores
    colors = ['green' if score >= 0 else 'red' for score in scores]
 
    # Create bar chart with custom colors
    fig = go.Figure(
        data=[go.Bar(x=categories, y=scores, marker=dict(color=colors))],
        layout=go.Layout(title='Sentiment Analysis', yaxis_title='Sentiment Score',yaxis=dict(range=[-1, 1]))
    )

    # Display the chart
    fig.show()



 Preprocess - Bag of Words model




In [9]:
#Preprocess - Bag of Words model
#  Counting the occurrences of tokens and building a sparse matrix of documents x tokens.
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

def preProcessBagOfWords(df: pd.DataFrame):
    corpus = df.script
    vect = CountVectorizer(stop_words='english')

    # Transforms the data into a bag of words
    data_vect = vect.fit_transform(corpus)
    feature_names = vect.get_feature_names_out()
    data_vect_feat = pd.DataFrame(data_vect.toarray(), columns=feature_names)
    data_vect_feat.index = df.index
    data = data_vect_feat.transpose()
    # Find the top 1000 words written in the manifesto
    top_dict = {}
    for c in data.columns:
        top = data[c].sort_values(ascending=False)
        top_dict[c] = list(zip(top.index, top.values))
    #for x in list(top_dict)[0:100]:
        #print("key {}, value {} ".format(x,  top_dict[x]))
    words = []
    for president in data:
        top = [word for (word, count) in top_dict[president]]
        for t in top:
            words.append(t)

    print(words[:10])
    blob = TextBlob(df.script[0])
    blob.sentiment  
    return blob


In [10]:
def sentimentPDFdata(files: list):
    # ccreate a df with 2 columns called file and sentiment
    df_list = pd.DataFrame(columns=['file', 'sentiment'])
    for file in files:
        fileName = file.split('/')[-1]
        blob_ = preProcessBagOfWords(sentimentPDF(file))
        sentiment_gauge(blob_, fileName)
        #add a row to df_list
        
        df_list = pd.concat([df_list, pd.DataFrame({'file': [fileName], 'sentiment': [blob_.sentiment.polarity]})], ignore_index=True)
    return df_list

In [11]:
filesIreland = ['Data/sentiment/homeBuilding/246610_5161c6c9-559c-4b56-93ad-fe894e7ee4e0.pdf','Data/sentiment/homeBuilding/Construction Sector Performance and Capacity.pdf', 'Data/sentiment/homeBuilding/PII Housing Review September 2022.pdf','Data/sentiment/homeBuilding/SCSI_ResidentialPropertyReport2022_Final.pdf']
#filesIreland = ['Data/sentiment/homeBuilding/PII Housing Review September 2022.pdf']
sentimentIreland_df = sentimentPDFdata(filesIreland)
sentimentBARChart(sentimentIreland_df)
sentimentIreland_df


housing for all q4 2022 statistics table of contents summary -housing supply chain page 3 completions pages 4-5 commencements pages 5-6 planning permissions granted pages 6-7 summary -construction sector costs/capacity page 8 construction purchasing managers ’ index page 9 cost of building and construction materials pages 9-10 construction sector wages summary -property and rental market property prices home purchase loan approvals home purchase loan drawdowns help to buy schemepage 10 page 12 page 13 page 14 page 15 new tenancies registered page 16 page 17 change in structure of rental market page 19standardised average rents in new tenancies pages 17-18construction sector employment page 11summary -housing supply chain just under 30,000 new homes were completed in 2022 , exceeding the housing for all target for the year by 21 % 2022 saw 26,957 housing units commence , similar to 2019 ’ s result and a 12 % drop from 2021 ’ s annual figure after a strong h1 , q3 2022 saw a 41 % fall in

1 project ireland 2040 build 2022 : construction sector performance and capacity gov.ie/2040 july 2022 2 table of contents summary : ................................ ................................ .................... 3 section 1 : overview ................................ ................................ .8 section 2 : regional activity ................................ .................... 9 section 3 : investment and output ................................ ..... 15 section 4 : costs ................................ ................................ ..... 21 section 5 : employment and enterprise ............................ 30 section 6 : skills and knowledge ................................ ......... 35 section 7 : productivity ................................ ......................... 43 section 8 : sustainability ................................ ........................ 47 section 9 : conclusions and next steps ........................... 49 3 summary : regional activity 1 . in re

housing market review september 2022| property industry ireland housing market review september 20222 pii vision a sustainable irish property industry which is creative , responsive , competitive and well integrated in meeting the socio-economic needs of all the stakeholders in the built environment . pii mission to be the trusted partner and provider of “ evidence based ” information , policies and strategies for the property industry at national level , to the oireachtas , government , local authorities and agencies , and for the benefit of the people of ireland.developers contractors funders architects legal professional servicesestate agentsplanners project managerssurveyors materials manufacturerspii sectors cso data show dwelling completions increased by nearly 50 % in the first half of 2022 when compared with 2021 . this level of new home completions , amounting to 13,316 in the first six months , shows the scale of the sector ’ s response to the housing need . the new home deli

residential property market monitor june 2022june 20223contents society of chartered sur veyors ireland 38 merrion square dublin 2 01 644 5500 info @ scsi.ie www.scsi.ie residential property market monitor june 20224 key highlights 6 economic metrics 7 the property market 10 seller activity 11 buyer activity 13 buyer affordability metric 14 rental activity 15 acknowledgements 4key highlights sales market increase expected over the next 12 months . property prices expec ted to continue to increase in 2022 , but the rate of pr operty price inflation will reduce compared to 2021 levels . residential property market monitor june 2022 property price expectations market factors , such as construction of new units , availability of units and rental market movements , continue to underpin the majority of agents ’ expectations ( 65 % in q1 2022 ) . increase expected over the next 3 months.+3 % +4 % residential property market monitor june 2022 5complexity of rental legislation/regulation contin

Unnamed: 0,file,sentiment
0,246610_5161c6c9-559c-4b56-93ad-fe894e7ee4e0.pdf,0.096901
1,Construction Sector Performance and Capacity.pdf,0.051202
2,PII Housing Review September 2022.pdf,0.072262
3,SCSI_ResidentialPropertyReport2022_Final.pdf,0.033498


In [12]:
filesEurope = ['Data/sentiment/homeBuilding/europe/Emerging Trends in Real Estate Europe 2023 Report.pdf','Data/sentiment/homeBuilding/europe/at-property-index-2022-final.pdf','Data/sentiment/homeBuilding/europe/dp171_en.pdf']
sentimentEurope_df = sentimentPDFdata(filesEurope)
sentimentBARChart(sentimentEurope_df)
sentimentEurope_df

you are here contents about the report business environmentchapter 1 chapter 5 environmental & social impactchapter 2 20 years of emerging trends europechapter 7 city prospectsappendix executive summarywelcome real estate & capital marketschapter 3 cities to watchchapter 6 sectors to watchchapter 4 contentsyou are here executive summary contents “ we are on the cusp of quite a significant slowdown , both in the real economy and in the underlying real estate markets . ” pan-european investment managerexecutive summary madrid , spainyou are here executive summary zurich , switzerlandthe outbreak of war in ukraine has cast a long shadow over europe , and real estate , like every other industry , will have to deal with the economic and political fallout for the foreseeable future . while the industry leaders canvassed for this 20th edition of emerging t rends in real estate® europe report little direct impact on their property portfolios from russia ’ s invasion of ukraine , the war ’ s co

['price', 'dwelling', 'market', 'property', 'year', 'housing', 'increase', 'index', 'country', 'average']


['housing', 'price', 'house', 'household', 'affordability', 'tax', 'country', 'income', 'mortgage', 'area']


Unnamed: 0,file,sentiment
0,Emerging Trends in Real Estate Europe 2023 Rep...,0.098914
1,at-property-index-2022-final.pdf,0.072333
2,dp171_en.pdf,0.078818


In [13]:
buildCostfilesIreland = ['Data/sentiment/buildingCost/Ireland/2022-05-24_opening-statement-kevin-james-vice-president-society-of-chartered-surveyors-ireland_en.pdf','Data/sentiment/buildingCost/Ireland/256082_afbe94c3-ebf1-4201-9a4a-a6ac9cddc69a.pdf', 'Data/sentiment/buildingCost/Ireland/Rising-construction-costs-and-the-residential-real-estate-market-in-Ireland.pdf']
buildCostsentimentIreland_df = sentimentPDFdata(buildCostfilesIreland)
sentimentBARChart(buildCostsentimentIreland_df)
buildCostsentimentIreland_df

page | 1 society of chartered surveyors ireland opening statement to joint oireachtas committee on housing , local government & heritage on construction costs in housing ( land , design , labour & materials ) introduction the society of chartered surveyors ireland ( sc si ) welcomes the opportunity to share our insights on construction costs to the joint committee on housing , local government and heritage on construction costs and we thank the chairman and committee for the invitation . as the leading professional body for chartered surveying professionals working in the property , construction and land sectors across ireland , we undertake research on a wide range of economic , industry and practice -related issues in the public interest . chartered surveyor members of the scsi work across the built environment , in both the public and private sectors , providing advice to clients across the entire lifecycle of a build , from green field site through to design , mapping , budgets , p

residential construction cost study report may 2023 a shared government and construction sector group i nitiative www.gov.ie i ii table of contents minister ’ s foreword .................................................................................................... 1 glossary of abbreviations .......................................................................................... 1 glossary of terminology ............................................................................................ 5 1.0 executive summary ........................................................................................... 9 2.0 introduction ...................................................................................................... 23 3.0 review of existing literature .............................................................................. 31 4.0 methodology ....................................................................................................... 39 5.0 case st

rising construction costs and the residential real estate market in ireland filippo arigoni , gerard kennedy & neill killeen vol . 2022 , no . 12 rising construction costs and the residential real estate market in ireland filippo arigoni , gerard kennedy & neill killeen1 central bank of ireland october 2022 abstract construction costs are a key factor to consider when analysing the residential real estate market in ireland given their impact on housing supply . this note examines longer -term trends in construction cos ts in ireland and sho ws that these costs have increased steadily over the last twenty -five years and faster than general inflation over the same period . the increase in gross construction costs over this time was driven primarily by cost inflation in two periods , namely the early 2000s and the increases observed since march 2020 . some of these patterns have been accentuated by changes to the tax regime over the period 1998 -2008 . in the last two years , constructio

Unnamed: 0,file,sentiment
0,2022-05-24_opening-statement-kevin-james-vice-...,0.073203
1,256082_afbe94c3-ebf1-4201-9a4a-a6ac9cddc69a.pdf,0.051398
2,Rising-construction-costs-and-the-residential-...,0.043804


In [14]:
buildCostfilesEurope = ['Data/sentiment/buildingCost/Europe/22-cs11-_Construction_Building_Materials_-_Commentary_October_2022.pdf','Data/sentiment/buildingCost/Europe/ING-Think-eu-construction-outlook-optimism-among-contractors-despite-increasing-building-material-shortage.pdf', 'Data/sentiment/buildingCost/Europe/International Construction Costs 2022-2.pdf']
buildCostsentimentEurope_df = sentimentPDFdata(buildCostfilesEurope)
sentimentBARChart(buildCostsentimentEurope_df)
buildCostsentimentEurope_df

responsible statistician : pio francesco medolla email : materialstats @ beis.gov.uk media enquiries : 020 7215 1000 public enquiries : +44 ( 0 ) 207215 2820 next publication : 7 december 2022 monthly statistics of building materials and components commentary , october 2022 coverage : uk and great britain geographical area : country , region and county 2 november 2022 national statistics h eadline findings •the material price index for ‘ all work ’ increased by 16.7 % in september 2022 compared to the same month the previous year . •there was a 2.6 % increase in brick deliver ies in september 2022 c ompared to september 2021 , according to the seasonally adjus ted figures . •there was a 9.7 % decrease in concrete block deliveries in september 2022 compared to september 2021 , according to the seasonally adjusted figures . c hart 1 : construction material price indices , uk index , 2015 = 100 monthly statistics of building materials and components – october 2022 2 contents introduction 

eu construction outlook : contractors ’ optimism rising despite building material shortages we expect further growth in the eu construction sector this year and next , despite a continuing shortage of some building materials . the eu construction confidence indicator was positive at the start of 2022 . building volumes should be boosted by fresh investment from the eu recovery fund new homes being constructed in stellendam in the netherlands content further increasing material shortages- higher sales ' prices- eu construction confidence indicator positive- issuance of residential building permits stable- europe at a glance- low growth infrastructure sector- huge differences in the long run- more growth expected for eu construction sector- further increasing material shortages the outlook for construction in the european union has changed . back in september last year , we noted in our eu construction outlook that supply chain economic and financial analysis 9 february 2022 articledisru

1 international construction costs 2022 international construction costs 2022the year of inflation2 international construction costs 2022 construction , however , has again proved itself extremely adaptable during the last year in responding to the difficult circumstances . we have seen sustained delivery of housing and infrastructure across most global markets , better use of data , and increasing investment in technological solutions such as modern methods of construction , all of which can improve efficiencies and aid the drive to net-zero . this adaptability will prove vital as businesses ready themselves for the uncertain and inflationary environment ahead . this theme is central to our 2022 international construction costs index , which highlights dramatic price fluctuations in many regions around the globe . while there is healthy confidence in the future of the construction sector internationally , differing government covid-19 strategies have resulted in varying paces of recov

Unnamed: 0,file,sentiment
0,22-cs11-_Construction_Building_Materials_-_Com...,0.067373
1,ING-Think-eu-construction-outlook-optimism-amo...,0.062333
2,International Construction Costs 2022-2.pdf,0.096486
