# Section 3 - WebScraping : EU Construction Industry

webscraping references:
https://towardsdatascience.com/web-scraping-news-articles-in-python-9dd605799558
https://medium.com/@wisjnujudho/how-to-scrape-google-news-top-stories-bs4-nopagination-80b882a214e5
https://www.geeksforgeeks.org/web-scraping-financial-news-using-python/

## Section 3.1 Importing Libraries

In [1]:
#importing required libraries
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from nltk.tokenize import word_tokenize

In [2]:
#https://www.youtube.com/watch?v=71hlQDBbbLM&list=PLz-0BiySzeQUuXoD4_mT3lex6HNi7f-YH&index=2
# webdriver_service = Service('C:\webdrivers\chromedriver.exe')
# driver = webdriver.Chrome(options=options, service=webdriver_service)
# driver.get(url)

# wait = WebDriverWait(driver, 15)
# element=driver.find_element_by_xpath('//*[@id="didomi-notice-agree-button"]')
# element.click()
# time.sleep(2)
# #driver.quit()

## Section 3.2 Webscraping from Independent

In [3]:
#defining the URl - query search on independent for topics realted to construction industry and housing in Ireland
url = "https://www.independent.ie/search?keyword=construction+industry+housing+ireland&daterange=all&datestart=&dateend="

In [4]:
#pulling the request
r1 = requests.get(url)
r1.status_code
coverpage=r1.content

#creating soup, using the html parser
soup1 = BeautifulSoup(coverpage, 'html.parser')

#identifying the news items from the HTML CSS
headlines_news = soup1.find_all('li', class_='indo-d382f4d9_marginbottom5 indo-ddd60156_paddingbottom5 indo-ddd60156_paddingleft0 indo-ddd60156_paddingright0 indo-ddd60156_paddingtop0 indo-9d6383c7_listitem indo-9d6383c7_listitemdivider')  #vhange this
len(headlines_news)

10

In [5]:
number_of_articles = len(headlines_news)

In [6]:
#creating empty lists
news_contents = []
list_links = []
list_titles = []

for n in np.arange(0, number_of_articles):
            
    #retreiving the link for the article
    link = headlines_news[n].find('a')['href']
    list_links.append(link)
    
    #retriving the title of the article
    title = headlines_news[n].find('a').get_text()
    list_titles.append(title)
    
    #reading in the content that can be accessed without subscription
    article = requests.get(f'https://www.independent.ie{link}') #adding the inital link to the article link
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html.parser')
    body = soup_article.find_all('div', class_='indo-1d70522a_marginbottom5 indo-1d70522a_marginleft0 indo-1d70522a_marginright0 indo-26838ee5_paddingleft0 indo-26838ee5_paddingright0 indo-7199e23_root indo-804a7bc3_root')
    x = body[0].find_all('p')
    
    #getting the text from the articles
    list_paragraphs = []
    for p in np.arange(0, len(x)):
        paragraph = x[p].get_text()
        list_paragraphs.append(paragraph)
        final_article = " ".join(list_paragraphs)
        
    news_contents.append(final_article)

In [7]:
#creating dataframes from the list data and merging into data frames
list_titles_df=pd.DataFrame(list_titles, columns=['Title'])
news_contents_df=pd.DataFrame(news_contents, columns=['Content'])
final_data=pd.merge(list_titles_df, news_contents_df, how='outer', left_index=True, right_index=True)
final_data

Unnamed: 0,Title,Content
0,Homes Home Truths: Why a third of all new home...,What if a full one third of all the new homes ...
1,Irish Business Tom McEnaney: Builders of the w...,The critical issue impacting the number of hou...
2,Comment Eoin O’Malley: Ireland needs more buil...,The Government was excoriated in the Dáil last...
3,Business Public Private Partnership’s bad rap ...,Ray Wilson knows public private partnership (P...
4,Irish News Housing Minister Darragh O’Brien on...,"If self-confidence built homes, Darragh O’Brie..."
5,Personal Finance One solution to the housing c...,Declan Dunne rarely focuses on blame or on the...
6,Irish Business More than half of developers ha...,Over half of housing developers have postponed...
7,Personal Finance Richard Curran: I warned of a...,They say the past is a different country. And ...
8,World Business Spectre of 2008 crisis hovers o...,The collapse of Silicon Valley Bank (SVB) and ...
9,"Irish News An extra 50,000 homes could be deli...","An extra 50,000 homes can be delivered in the ..."


In [8]:
#exporting to a csv
final_data.to_csv('webscraped_data.csv', index=False)

# Section 5 - Dashboard : EU Construction Industry

In [9]:
#pip install jupyter-dash

In [10]:
#importing libraries
import plotly
import chart_studio.dashboard_objs
import IPython.display
from IPython.display import Image
import plotly.graph_objs as go
import chart_studio.plotly as py
import numpy as np
import plotly.express as px
import wfdb
from jupyter_dash import JupyterDash
from dash import Dash, html, dcc, callback, Output, Input, dash_table
from dash.dependencies import Input, Output
import pandas as pd
from dash import html
import dash
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import base64
from scipy.stats import spearmanr
import seaborn as sns
import dash_bootstrap_components as dbc

In [11]:
#importing data
data_all = pd.read_csv('merged_data_2020_ire.csv')
data_news = pd.read_csv('webscraped_data.csv')
combined_data = pd.read_csv('combined_data.csv')

In [12]:
data_news = data_news['Title'].str[:-19]
data_news=pd.DataFrame(data_news)
data_news

Unnamed: 0,Title
0,Homes Home Truths: Why a third of all new home...
1,Irish Business Tom McEnaney: Builders of the w...
2,Comment Eoin O’Malley: Ireland needs more buil...
3,Business Public Private Partnership’s bad rap ...
4,Irish News Housing Minister Darragh O’Brien on...
5,Personal Finance One solution to the housing c...
6,Irish Business More than half of developers ha...
7,Personal Finance Richard Curran: I warned of a...
8,World Business Spectre of 2008 crisis hovers o...
9,"Irish News An extra 50,000 homes could be deli..."


In [13]:
#convert the year-quarter parameter into a date time format
data_all['Quarter']=data_all['Year-Quarter'].str[6:]
#converting year and quarter to strings
data_all['Year'] = data_all['Year'].astype(str)
#creating new dataframe so the prep doesn't get changed
data_dash=data_all
#replacing the year=quarter values with datetime year-quarter
data_dash['Year-Quarter']=pd.to_datetime(data_dash['Year'] + '-' + data_dash['Quarter'], format='%Y-%m')
data_dash = data_dash.drop('Missing', axis=1)
data_dash = data_dash.sort_values(by=['Year-Quarter', 'Country']).reset_index()
#checking dtype
data_dash.head()

Unnamed: 0,index,Country,Year-Quarter,Production Volume Index,Production Volume Flag,Output Price Index in Construction,Output Price Flag in Construction,Employment Index,Hours Worked Index,Gross Wage and Salary Index,...,Useful Floor Area m^2 Flag Residental for Communities,Useful Floor Area m^2 Flag Residential,Useful Floor Area m^2 Flag Residential except for Communities,Useful Floor Area m^2 Flag Two-Plus Dwellings,House Price Index,House Price Flag,GDP,GDP Flag,Year,Quarter
0,72,AT,2000-01-01,51.6,,68.9,s,89.9,82.8,54.0,...,,,,,,,6330.0,,2000,1
1,206,BE,2000-01-01,98.7,,74.7,,79.5,96.0,57.1,...,,,,,,,6030.0,,2000,1
2,298,BG,2000-01-01,31.0,,,,75.0,70.6,18.3,...,,,,,,,360.0,,2000,1
3,391,CH,2000-01-01,66.0,,83.7,e,83.2,,,...,,,,,,,9840.0,,2000,1
4,484,CY,2000-01-01,174.4,,63.8,s,109.7,127.0,88.8,...,,,,,,,3620.0,,2000,1


In [22]:
# Initialize the app
app = JupyterDash(__name__)

# App layout
app.layout = html.Div([
    html.Div(children='EU Construction Industry', style={'font-size': '48px', 'text-align': 'center', 'margin-bottom': '30px'}),
    html.H1(children='Construction Index Values Over the Years by Country', style={'font-size': '25px', 'text-align': 'center'}),
    html.H2(children='Select your Y-Axis Parameter to View a Range of Index Values related to the Construction Industry', style={'font-size': '20px', 'text-align': 'center'}),
    html.Div([
        html.Label('Select Y-Axis:', style={'font-size': '18px'}),
        dcc.Dropdown(
            id='Y-Axis',
            options=[{'label': column, 'value': column} for column in data_dash.columns
                     if np.issubdtype(data_dash[column].dtype, np.number)],
            value='Production Volume Index'
        ),
    ], style={'width': '300px','margin-bottom': '5px', 'justify-content': 'center'}),
    dcc.Graph(id='graph'),
    html.H1('Headlines for the Construction Industry from the Independent News', style={'text-align': 'center', 'font-size': '25px', 'margin-top': '30px'}),
    html.H1('Top 10 Headline Titles', style={'text-align': 'center', 'font-size': '20px'}),
    dash_table.DataTable(
        id='table',
        data=data_news.to_dict('records'),
        style_table={'margin': 'auto'},
        style_data={'text-align': 'center', 'font-size': '14px'},
        page_size=10,
        style_header={'display': 'none'}
    ),
    html.H1(children='Correlations of the Countries Construction Index Parameters', style={'font-size': '25px', 'text-align': 'left','margin-top': '30px'}),
    html.H1(children='Select your Country to View Correlation Matrix, Hover over the Squares to get the Values - Correlations performed as per Spearman', style={'font-size': '20px', 'text-align': 'left'}),
    html.Div([
        html.Label('Select Country:', style={'font-size': '18px'}),
        dcc.Dropdown(
            id='Country',
            options=[{'label': Country, 'value': Country} for Country in data_dash['Country'].unique()],
            value='AT',
        )
    ], style={'width': '300px', 'margin-bottom': '20px'}),
    dcc.Graph(id='heatmap'),
    
    html.H1(children='Forecasting Data', style={'font-size': '25px', 'text-align': 'center'}),
    html.H2(children='Forecasting Data for Ireland Production Volume Index using the Time Series ARIMA Model', style={'font-size': '20px', 'text-align': 'center'}),
    dcc.Graph(
        id='forecast',
        figure=px.line(combined_data, y=['Production Volume Index_IE', 'Prediction'], labels={'index': 'Year-Quarter', 'value': 'Production Volume Index'},
                       title='Actual vs Predicted Values for Production Volume Index using Time Series Forecasting').update_traces(mode='lines+markers')
    )
])



# Define the callback function
@app.callback(
    dash.dependencies.Output('graph', 'figure'),
    [dash.dependencies.Input('Y-Axis', 'value')],
)
def update_graph(selected_metric):
    # Create the figure
    fig = px.line(data_dash, x='Year-Quarter', y=selected_metric, color='Country',
                  title='Production Volume by Country')
    fig.update_layout(title_x=0.5, height=800, width=2500)

    return fig

# Callback for updating the correlation heatmap
@app.callback(
    dash.dependencies.Output('heatmap', 'figure'),
    [dash.dependencies.Input('Country', 'value')]
)
def update_heatmap(selected_countries):
    filtered_data = data_dash[data_dash['Country'] == selected_countries].drop(['Country', 'index'], axis=1)
    corr = filtered_data.corr(method='spearman')

    fig = go.Figure(data=go.Heatmap(z=corr.values,
                                   x=corr.columns,
                                   y=corr.index,
                                   colorscale='bluered'))
    fig.update_layout(title='Spearman Correlation Heatmap', title_x=0.5, height=1000, width=1000)

    return fig

# Run the app
if __name__ == '__main__':
    app.run_server(debug=False)

Dash is running on http://127.0.0.1:8050/



 * Running on http://127.0.0.1:8050/ (Press CTRL+C to quit)
127.0.0.1 - - [16/Jun/2023 03:11:20] "GET /_alive_fee6ad93-0fed-4c61-a755-95f0129eb646 HTTP/1.1" 200 -


Dash app running on http://127.0.0.1:8050/


127.0.0.1 - - [16/Jun/2023 03:11:22] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [16/Jun/2023 03:11:22] "GET /_dash-layout HTTP/1.1" 200 -
127.0.0.1 - - [16/Jun/2023 03:11:22] "GET /_dash-dependencies HTTP/1.1" 200 -
127.0.0.1 - - [16/Jun/2023 03:11:22] "GET /_dash-component-suites/dash/dcc/async-dropdown.js HTTP/1.1" 200 -
127.0.0.1 - - [16/Jun/2023 03:11:22] "GET /_dash-component-suites/dash/dcc/async-graph.js HTTP/1.1" 200 -
127.0.0.1 - - [16/Jun/2023 03:11:22] "GET /_dash-component-suites/dash/dash_table/async-highlight.js HTTP/1.1" 200 -
127.0.0.1 - - [16/Jun/2023 03:11:22] "GET /_dash-component-suites/dash/dash_table/async-table.js HTTP/1.1" 200 -
127.0.0.1 - - [16/Jun/2023 03:11:22] "GET /_dash-component-suites/dash/dcc/async-plotlyjs.js HTTP/1.1" 200 -
127.0.0.1 - - [16/Jun/2023 03:11:22] "GET /_favicon.ico?v=2.10.2 HTTP/1.1" 200 -
127.0.0.1 - - [16/Jun/2023 03:11:22] "POST /_dash-update-component HTTP/1.1" 200 -
127.0.0.1 - - [16/Jun/2023 03:11:22] "POST /_dash-update-component HTTP/

In [32]:
#defining function to calculate the memory usage for each dataframe
def memory_usage(dataframes):
    for i, df in enumerate(dataframes):
        mem_usage = df.memory_usage(deep=True).sum() / (1024 ** 2)
        print(f"Memory usage of DataFrame {i+1}: {mem_usage:.2f} MB")

127.0.0.1 - - [16/Jun/2023 04:00:54] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [16/Jun/2023 04:00:54] "GET /_dash-dependencies HTTP/1.1" 200 -
127.0.0.1 - - [16/Jun/2023 04:00:54] "GET /_dash-layout HTTP/1.1" 200 -
127.0.0.1 - - [16/Jun/2023 04:00:54] "GET /_favicon.ico?v=2.10.2 HTTP/1.1" 200 -
127.0.0.1 - - [16/Jun/2023 04:00:54] "GET /_dash-component-suites/dash/dcc/async-dropdown.js HTTP/1.1" 200 -
127.0.0.1 - - [16/Jun/2023 04:00:54] "GET /_dash-component-suites/dash/dcc/async-graph.js HTTP/1.1" 200 -
127.0.0.1 - - [16/Jun/2023 04:00:54] "GET /_dash-component-suites/dash/dash_table/async-highlight.js HTTP/1.1" 200 -
127.0.0.1 - - [16/Jun/2023 04:00:54] "GET /_dash-component-suites/dash/dcc/async-plotlyjs.js HTTP/1.1" 200 -
127.0.0.1 - - [16/Jun/2023 04:00:54] "GET /_dash-component-suites/dash/dash_table/async-table.js HTTP/1.1" 200 -
127.0.0.1 - - [16/Jun/2023 04:00:55] "POST /_dash-update-component HTTP/1.1" 200 -
127.0.0.1 - - [16/Jun/2023 04:00:55] "POST /_dash-update-component HTTP/

In [31]:
dataframes = [list_titles_df,news_contents_df,final_data,data_news,data_all,combined_data]
memory_usage(dataframes)

Memory usage of DataFrame 1: 0.00 MB
Memory usage of DataFrame 2: 0.00 MB
Memory usage of DataFrame 3: 0.01 MB
Memory usage of DataFrame 4: 0.00 MB
Memory usage of DataFrame 5: 3.07 MB
Memory usage of DataFrame 6: 0.00 MB
