In [81]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import datetime as dt
import os
import pycountry
import statsmodels.api as sm
import matplotlib.pyplot as plt
import sys

## Short-term business statistics

In [82]:
# Production in Construction
Produ = glob.glob('Production in Construction/*.csv')
print(Produ)

# Building Permits
Building = glob.glob('Building Permit/*.csv')
print(Building)

# Construciton Producer
Cons_Prod = glob.glob('Construction Producer/*.csv')
print(Cons_Prod)

# Labor
Labor= glob.glob('Labor/*.csv')
print(Labor)

# STS
STS = glob.glob('Structural/*.csv')
print(STS)

# GFCF
GFCF = glob.glob('GFCF/*.csv')
print(GFCF)

['Production in Construction\\civil_a.csv', 'Production in Construction\\cons_a.csv']
['Building Permit\\Building Permit Annual.csv']
['Construction Producer\\GVA.csv']
['Labor\\LaborInput Annual Employment.csv', 'Labor\\Volume of Hours.csv', 'Labor\\Wages and Salaries.csv']
['Structural\\Annual Detailed Enterprise Production Value in Euros.csv', 'Structural\\Cosntruction by employment size class.csv', 'Structural\\Multiyear enterprise.csv', 'Structural\\Turnover statistics.csv']
['GFCF\\GFCF Construction.csv', 'GFCF\\GFCF Dwelling.csv']


# Production in Construction

Will be attempting to use same functions for similar eurostat datasets, as it is more effiecient

In [83]:
# Production in Construction Extraction and Information

# Function to print stats for each file
def data_summary(file):

    df = pd.read_csv(file)
    print(f'The columns in each data {files}: \n {df.columns}\n')
    print(f'The shape of {files} is\n {df.shape}\n')
    print(f'The Info of the Data {files}:\n {df.info()}\n')
    print(f'The Summary of the Data {files}: \n {df.describe()}\n')
    print(f'Null Value sum present in {files}:\n {df.isnull().sum()}\n')
    print(f'Duplicated of {files} is\n {df.duplicated().sum()}\n')
    print(f'Unique Value Sum in {files}:\n{df.nunique()}\n')

# Function for getting full country name using pycountry
def get_country_name(country_code):
      if country_code == 'UK':
             return 'United Kingdom'
      elif country_code == 'EL':
            return 'Greece'

      try:
             country = pycountry.countries.get(alpha_2=country_code).name
             return country
      except AttributeError:
             return "Invalid country code"
      except LookupError:
             return "Invalid country code"

# Clean and transform a dataset
def clean_and_transform(file,drop,val=None):

    df = pd.read_csv(file)

    # Drop the columns that are not needed
    df.drop(drop, axis=1, inplace=True)

    # Convert Time Period to datetime format
    df['TIME_PERIOD'] = pd.to_datetime(df['TIME_PERIOD'], format='%Y')
    df['TIME_PERIOD'] = df['TIME_PERIOD'].dt.year

    # Rename the columns
    if val != None:
        df = df.rename(columns={'TIME_PERIOD':'Year', 'OBS_VALUE': val})

    else:
        df = df.rename(columns={'TIME_PERIOD':'Year'})

    cleaned_df = df

    # Convert the geo to Country Name
    cleaned_df['Country'] = cleaned_df.apply(lambda row: get_country_name(row.geo), axis=1)
    cleaned_df.drop('geo', axis=1, inplace=True)

    return cleaned_df

# Transformation
concatenated_df = pd.DataFrame()
joined_df = None

cols_drop = ['DATAFLOW', 'LAST UPDATE', 'freq', 'indic_bt', 'nace_r2', 's_adj','OBS_FLAG']

# merging all the data files into a single df
for files in Produ:

    data_summary(files)

    cleaned_df = clean_and_transform(files,val='Production Value',drop=cols_drop)
    
    if joined_df is None:
        joined_df = cleaned_df  # Set the first cleaned DataFrame as joined_df
    else:
        joined_df = joined_df.merge(cleaned_df) # Merge the cleaned DataFrame with joined_df

joined_df.to_csv('Transformed Data/Production in Construction.csv', index=False)

The columns in each data Production in Construction\civil_a.csv: 
 Index(['DATAFLOW', 'LAST UPDATE', 'freq', 'indic_bt', 'nace_r2', 's_adj',
       'unit', 'geo', 'TIME_PERIOD', 'OBS_VALUE 1', 'OBS_FLAG'],
      dtype='object')

The shape of Production in Construction\civil_a.csv is
 (318, 11)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318 entries, 0 to 317
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   DATAFLOW     318 non-null    object 
 1   LAST UPDATE  318 non-null    object 
 2   freq         318 non-null    object 
 3   indic_bt     318 non-null    object 
 4   nace_r2      318 non-null    object 
 5   s_adj        318 non-null    object 
 6   unit         318 non-null    object 
 7   geo          318 non-null    object 
 8   TIME_PERIOD  318 non-null    int64  
 9   OBS_VALUE 1  292 non-null    float64
 10  OBS_FLAG     69 non-null     object 
dtypes: float64(1), int64(1), object(9)
memory usage:

For the Civil Engineering sector, the short-term business statistics (STS) are available for the following indicators:

The indicator is in 2015-100 indices, and the frequency of the Time Period is Annually.
The important columns for anlaysis are the following:

i. geo: Region indicator

ii. TIME_PERIOD: Time period

iii. OBS_VALUE: Observation value

iv. unit: Unit of measure

For the Construction sector, the short-term business statistics are available for the following indicators:
The indicator is in 2015-100 indices, and the frequency of the Time Period is Annually.
The important columns for anlaysis are the following:

i. geo: Region indicator   

ii. TIME_PERIOD: Time period

iii. OBS_VALUE: Observation value

iv. unit: Unit of measure

There are 25 unique countries in Civil Engineering and 27 in Construction.
minumun year period is 2010, although we have
that there are 10 missing values for OBS_VALUE in the Civil Engineering, none in Construction.



# Building Permit

In [84]:
# Building Permit Extraction and Information

concatenated_df = pd.DataFrame()
joined_df = None

cols_drop = ['DATAFLOW', 'LAST UPDATE', 'freq', 'indic_bt', 'cpa2_1', 's_adj','OBS_FLAG']

# Transformation
for file in Building:

    data_summary(files)

    cleaned_df = clean_and_transform(file,val='Metered Squared Value',drop=cols_drop)

    if joined_df is None:
        joined_df = cleaned_df  # Set the first cleaned DataFrame as joined_df
    else:
        joined_df = joined_df.merge(cleaned_df) # Merge the cleaned DataFrame with joined_df


cleaned_df.to_csv('Transformed Data/Building Permit.csv', index=False)

The columns in each data Production in Construction\cons_a.csv: 
 Index(['DATAFLOW', 'LAST UPDATE', 'freq', 'indic_bt', 'nace_r2', 's_adj',
       'unit', 'geo', 'TIME_PERIOD', 'OBS_VALUE', 'OBS_FLAG'],
      dtype='object')

The shape of Production in Construction\cons_a.csv is
 (348, 11)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 348 entries, 0 to 347
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   DATAFLOW     348 non-null    object 
 1   LAST UPDATE  348 non-null    object 
 2   freq         348 non-null    object 
 3   indic_bt     348 non-null    object 
 4   nace_r2      348 non-null    object 
 5   s_adj        348 non-null    object 
 6   unit         348 non-null    object 
 7   geo          348 non-null    object 
 8   TIME_PERIOD  348 non-null    int64  
 9   OBS_VALUE    348 non-null    float64
 10  OBS_FLAG     50 non-null     object 
dtypes: float64(1), int64(1), object(9)
memory usage: 30.

## VALUE ADDED BY CONSTRUCTION in MILLION EURO

In [85]:
# Construction Producer Price Index Extraction and Information

concatenated_df = pd.DataFrame()
joined_df = None

cols_drop = ['DATAFLOW', 'LAST UPDATE', 'freq', 'nace_r2', 'na_item','OBS_FLAG']

# Transformation
for file in Cons_Prod:

    data_summary(files)

    cleaned_df = clean_and_transform(file,val='Construction Producer Price',drop=cols_drop)

    if joined_df is None:
        joined_df = cleaned_df  # Set the first cleaned DataFrame as joined_df
    else:
        joined_df = joined_df.merge(cleaned_df) # Merge the cleaned DataFrame with joined_df

cleaned_df.to_csv('Transformed Data/GVA.csv', index=False)

The columns in each data Production in Construction\cons_a.csv: 
 Index(['DATAFLOW', 'LAST UPDATE', 'freq', 'indic_bt', 'nace_r2', 's_adj',
       'unit', 'geo', 'TIME_PERIOD', 'OBS_VALUE', 'OBS_FLAG'],
      dtype='object')

The shape of Production in Construction\cons_a.csv is
 (348, 11)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 348 entries, 0 to 347
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   DATAFLOW     348 non-null    object 
 1   LAST UPDATE  348 non-null    object 
 2   freq         348 non-null    object 
 3   indic_bt     348 non-null    object 
 4   nace_r2      348 non-null    object 
 5   s_adj        348 non-null    object 
 6   unit         348 non-null    object 
 7   geo          348 non-null    object 
 8   TIME_PERIOD  348 non-null    int64  
 9   OBS_VALUE    348 non-null    float64
 10  OBS_FLAG     50 non-null     object 
dtypes: float64(1), int64(1), object(9)
memory usage: 30.

## Labor

In [86]:
#Labor Index Extraction and Information

concatenated_df = pd.DataFrame()
joined_df = None

cols_drop = ['DATAFLOW', 'LAST UPDATE', 'freq', 'indic_bt', 'nace_r2', 's_adj','OBS_FLAG']

# Transformation for multiple (3) data files

for file in Labor:

    data_summary(files)

    cleaned_df = clean_and_transform(file,val=None,drop=cols_drop)

    name = None

    if 'Employment' in file:
        name = "Labor_Employment.csv"
    elif 'Hours' in file:
        name = "Labor_Hours.csv"
    elif 'Wages' in file:
        name = "Labor_Wages.csv"
    else:
        print(f'Error with unexpected file :{file}')
        sys.exit(1)

    cleaned_df.to_csv(f'Transformed Data/{name}', index=False)

The columns in each data Production in Construction\cons_a.csv: 
 Index(['DATAFLOW', 'LAST UPDATE', 'freq', 'indic_bt', 'nace_r2', 's_adj',
       'unit', 'geo', 'TIME_PERIOD', 'OBS_VALUE', 'OBS_FLAG'],
      dtype='object')

The shape of Production in Construction\cons_a.csv is
 (348, 11)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 348 entries, 0 to 347
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   DATAFLOW     348 non-null    object 
 1   LAST UPDATE  348 non-null    object 
 2   freq         348 non-null    object 
 3   indic_bt     348 non-null    object 
 4   nace_r2      348 non-null    object 
 5   s_adj        348 non-null    object 
 6   unit         348 non-null    object 
 7   geo          348 non-null    object 
 8   TIME_PERIOD  348 non-null    int64  
 9   OBS_VALUE    348 non-null    float64
 10  OBS_FLAG     50 non-null     object 
dtypes: float64(1), int64(1), object(9)
memory usage: 30.