In [1]:
# Import libraries
import geopandas as gpd
import pandas as pd
import psycopg2
import glob

# inspect data in the folder

In [2]:
# Read all the files in 'data' folder and save them as a list
files = sorted(glob.glob('../data/*')) # in ascending order
print(f'How many files in total: {len(files)}')

# Select only the monthly traffic state files
monthly_files = [file for file in files if file[-9:] == 'TRAMS.csv']
print(f'How many monthly traffic state files: {len(monthly_files)}')

monthly_files

How many files in total: 23
How many monthly traffic state files: 22


['../data/2019_01_Gener_TRAMS_TRAMS.csv',
 '../data/2019_02_Febrer_TRAMS_TRAMS.csv',
 '../data/2019_03_Marc_TRAMS_TRAMS.csv',
 '../data/2019_04_Abril_TRAMS_TRAMS.csv',
 '../data/2019_05_Maig_TRAMS_TRAMS.csv',
 '../data/2019_06_Juny_TRAMS_TRAMS.csv',
 '../data/2019_07_Juliol_TRAMS_TRAMS.csv',
 '../data/2019_08_Agost_TRAMS_TRAMS.csv',
 '../data/2019_09_Setembre_TRAMS_TRAMS.csv',
 '../data/2019_10_Octubre_TRAMS_TRAMS.csv',
 '../data/2019_11_Novembre_TRAMS_TRAMS.csv',
 '../data/2019_12_Desembre_TRAMS_TRAMS.csv',
 '../data/2020_01_Gener_TRAMS_TRAMS.csv',
 '../data/2020_02_Febrer_TRAMS_TRAMS.csv',
 '../data/2020_03_Marc_TRAMS_TRAMS.csv',
 '../data/2020_04_Abril_TRAMS_TRAMS.csv',
 '../data/2020_05_Maig_TRAMS_TRAMS.csv',
 '../data/2020_06_Juny_TRAMS_TRAMS.csv',
 '../data/2020_07_Juliol_TRAMS_TRAMS.csv',
 '../data/2020_08_Agost_TRAMS_TRAMS.csv',
 '../data/2020_09_Setembre_TRAMS_TRAMS.csv',
 '../data/2020_10_Octubre_TRAMS_TRAMS.csv']

In [3]:
# A data file containing geometry info for each traffic section of BCN
bcn_geom = [file for file in files if file not in monthly_files]
bcn_geom

['../data/transit_relacio_trams_format_long.csv']

In [4]:
# Sneak peek of each data I - monthly traffic data
traffic_example = pd.read_csv(monthly_files[0])

print(traffic_example.shape)
traffic_example.head()

(951405, 4)


Unnamed: 0,idTram,data,estatActual,estatPrevist
0,1,20190101000551,0,0
1,2,20190101000551,0,0
2,3,20190101000551,0,0
3,4,20190101000551,0,0
4,5,20190101000551,0,0


In [5]:
traffic_example.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 951405 entries, 0 to 951404
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   idTram        951405 non-null  int64
 1   data          951405 non-null  int64
 2   estatActual   951405 non-null  int64
 3   estatPrevist  951405 non-null  int64
dtypes: int64(4)
memory usage: 29.0 MB


In [6]:
# Sneak peek of each data II - traffic section geometries
geom_example = pd.read_csv(bcn_geom[0])

print(geom_example.shape)
geom_example.head()

(2784, 5)


Unnamed: 0,Tram,Tram_Components,Descripció,Longitud,Latitud
0,1,1,Diagonal (Ronda de Dalt a Doctor Marañón),2.11203535639414,41.384191
1,1,2,Diagonal (Ronda de Dalt a Doctor Marañón),2.101502862881051,41.381631
2,2,1,Diagonal (Doctor Marañón a Ronda de Dalt),2.111944376806616,41.384467
3,2,2,Diagonal (Doctor Marañón a Ronda de Dalt),2.101594089443895,41.381868
4,3,1,Diagonal (Doctor Marañón a Pl. Pius XII),2.112093343037027,41.384229


In [7]:
geom_example.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2784 entries, 0 to 2783
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Tram             2784 non-null   int64  
 1   Tram_Components  2784 non-null   int64  
 2   Descripció       2784 non-null   object 
 3   Longitud         2784 non-null   object 
 4   Latitud          2779 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 108.9+ KB


In [8]:
# How many geoms per 'Tram (= street section)'
geom_example.groupby('Tram').Tram_Components.count().unique()

array([ 2,  3, 10, 15, 12,  6,  9,  4, 14, 22, 17,  8, 11,  7,  5, 13, 20,
       19, 16, 18, 24, 26, 23, 33, 37, 21, 38])

* All data in csv format
* Geometry composed of 2-38 points

# Connect to the database

In [9]:
# Define a function connect to the database 'bcn_traffic'
# In case the connecting attempt fails, print the error message
def connect_to_db():
    try:
        #conn for connection
        conn =  psycopg2.connect(dbname='bcn_traffic',
                                 user='bcn',
                                 password='bcn',
                                 host='postgis',
                                 port='5432')

    except psycopg2.DatabaseError:
        print ("I am unable to connect the database")
    return conn

# Create tables parsed by month (1 table for each month)

In [10]:
# Create names for each table using monthly_files list

# Select year_month part only
table_names = [name[8:][:-16] for name in monthly_files]

# Modify table names
table_names = [name[8:]+name[:4] for name in table_names]

table_names[:3] #year_month

['Gener2019', 'Febrer2019', 'Marc2019']

In [54]:
# Function to create a table called 'taxi_x' inside the PostGIS database
def create_table(tablename):
    # connect to the db
    conn = connect_to_db()
    
    # Prepare a query to create table for NY taxi data
    q_create_table = f"""
                    drop table if exists {tablename};
                    create table {tablename}
                    (
                        idTram int,
                        data int,
                        estatActual int,
                        estatPrevist int
                    )
                    """

    try:
        cur = conn.cursor()  # initiate cursor (communication with db)
        #print('Connected')
        cur.execute(q_create_table)  # execute the query
        #print('Query executed')
        conn.commit()
        print(f'{tablename} created')

    except psycopg2.DatabaseError: # print error if fails
        print ("Failed to create the table")

    # Close the communication & connection with the postgis
    finally:
        cur.close()
        conn.close ()

In [55]:
# Create tables for monthly traffic observation data
for name in table_names:
    create_table(name)

Gener2019 created
Febrer2019 created
Marc2019 created
Abril2019 created
Maig2019 created
Juny2019 created
Juliol2019 created
Agost2019 created
Setembre2019 created
Octubre2019 created
Novembre2019 created
Desembre2019 created
Gener2020 created
Febrer2020 created
Marc2020 created
Abril2020 created
Maig2020 created
Juny2020 created
Juliol2020 created
Agost2020 created
Setembre2020 created
Octubre2020 created


# Read the files, transform the data and load it to the db

In [56]:
# Function to fill the previously created tables
def fill_table_with_data(filepath, tablename):
    # connect to the db
    conn = connect_to_db()
    
    copy_sql = f"""
               COPY {tablename}
               FROM {filepath}
               DELIMITER ','
               CSV HEADER;
               """

    try:
        cur = conn.cursor()
        total = 0  # count how many rows are inserted
        
        with open(filepath, 'r') as file:
            #next(file) # skip the first line
            
            #cur.copy_from(file, tablename, sep=',') # copy the csv file
            cur.copy_expert(sql=copy_sql, file=file)
            print('Copy worked')
            
            conn.commit() # commit the changes to the db
            print('Data inserted to the table')

    # Print error message if query fails
    except psycopg2.DatabaseError:
        print ("Failed to copy data to the table")

    finally:
        cur.close()
        conn.close ()
        #print('Connection closed')

In [57]:
fill_table_with_data(monthly_files[0], table_names[0])

Failed to copy data to the table
