In [1]:
import os
import pandas as pd
from tqdm import tqdm

In [2]:
# STEP 1: Read in the dataset and skip the initial header lines

df = pd.read_csv('../../../../../spiced_week_06_data/ECA_blend_tg/TG_STAID000001.txt', skiprows=19)

In [3]:
# Display the column names, how could we clean them?

df.head()

Unnamed: 0,STAID,SOUID,DATE,TG,Q_TG
0,1,35381,18600101,21,1
1,1,35381,18600102,46,1
2,1,35381,18600103,31,1
3,1,35381,18600104,37,1
4,1,35381,18600105,31,1


In [25]:
# STEP 2: Clean the column names

df.columns = df.columns.str.lower().str.strip()

In [26]:
df

Unnamed: 0,staid,souid,date,tg,q_tg
0,1,35381,18600101,21,1
1,1,35381,18600102,46,1
2,1,35381,18600103,31,1
3,1,35381,18600104,37,1
4,1,35381,18600105,31,1
...,...,...,...,...,...
59681,1,35381,20230527,-9999,9
59682,1,35381,20230528,-9999,9
59683,1,35381,20230529,-9999,9
59684,1,35381,20230530,-9999,9


In [28]:
# STEP 3: Cast the `date` column into the `datetime` format

pd.to_datetime(df['date'], format = '%Y%m%d')

0       1860-01-01
1       1860-01-02
2       1860-01-03
3       1860-01-04
4       1860-01-05
           ...    
59681   2023-05-27
59682   2023-05-28
59683   2023-05-29
59684   2023-05-30
59685   2023-05-31
Name: date, Length: 59686, dtype: datetime64[ns]

In [29]:
df['date'] = pd.to_datetime(df['date'], format = '%Y%m%d')

In [32]:
# What is the shape of the dataframe? 

df.shape

(59686, 5)

In [35]:
# STEP 4: Only select valid observations (the ones where `q_tg==0`)

mask = df['q_tg'] == 0
df = df[mask]
df

Unnamed: 0,staid,souid,date,tg,q_tg
313,1,35381,1860-11-09,0,0
334,1,35381,1860-11-30,0,0
346,1,35381,1860-12-12,0,0
1052,1,35381,1862-11-18,0,0
1517,1,35381,1864-02-26,0,0
...,...,...,...,...,...
53655,1,35381,2006-11-26,80,0
53656,1,35381,2006-11-27,70,0
53657,1,35381,2006-11-28,71,0
53658,1,35381,2006-11-29,63,0


In [37]:
# What is the shape of the dataframe after the filtering?

df.shape

(45203, 5)

In [44]:
# STEP 5: Drop the columns `souid` and `q_tg`

#df.pop('souid')
#df.pop('q_tg')
df.head()

Unnamed: 0,staid,date,tg
313,1,1860-11-09,0
334,1,1860-11-30,0
346,1,1860-12-12,0
1052,1,1862-11-18,0
1517,1,1864-02-26,0


In [41]:
# What is the shape of the dataframe now?

df.shape

(45203, 3)

In [47]:
# Take a look at the first 5 rows and all of the columns

In [4]:
# Now wrap all five steps that transform the data into a function:

def parse_file(filename):
    df = pd.read_csv(f'../../../../../spiced_week_06_data/ECA_blend_tg/{filename}', skiprows=19)
    df.columns = df.columns.str.lower().str.strip()
    df['date'] = pd.to_datetime(df['date'], format = '%Y%m%d')
    df = df[df['q_tg'] == 0]
    df.pop('souid')
    df.pop('q_tg')
    return df

In [5]:
# test the function on a file

testdf = parse_file('TG_STAID000001.txt').tail()
#testdf.dtypes
testdf.tail()

Unnamed: 0,staid,date,tg
53655,1,2006-11-26,80
53656,1,2006-11-27,70
53657,1,2006-11-28,71
53658,1,2006-11-29,63
53659,1,2006-11-30,84


## Objective 2: Automation

Loop over all files read in the data and append the data frame it to a single' text file `mean_temperature.csv`: 

```python
from tqdm import tqdm

with ___("./data/mean_temperature.csv", mode="w", newline='') as file:
    for ___ in tqdm(os.listdir(___)):
        if 'TG_STAID' ___ filename:
            df = ___(filename)
            ___.to_csv(file, index=False, header=False)
```

- Use the `tqdm` method to generate a progress bar while looping over the files
- Only process files that contain `TG_STAID` in their filename. 

>### NOTE:
>This step can take around 15 minutes. It also creates a csv file that is over `2GB` large!
>### Test it with a small number data-files first! 
>### Apply slicing in here: `os.listdir(path)[0:10]` 

In [6]:
# recap os.listdir(directory)

os.listdir('../../../../../spiced_week_06_data/ECA_blend_tg')[2:10]

# which directory can we use in the for-loop?

['mean_temperature_small.csv',
 'sources.txt',
 'stations.txt',
 'TG_STAID000001.txt',
 'TG_STAID000002.txt',
 'TG_STAID000003.txt',
 'TG_STAID000004.txt',
 'TG_STAID000005.txt']

In [70]:
# create a small version first with 2-10 datasets as "mean_temperature_small.csv"  
# use os.listdir('../data/ECA_blend_tg/')[0:10] 
# we will also use the small csv file for the upload test !

from tqdm import tqdm

with open("../../../../../spiced_week_06_data/ECA_blend_tg/mean_temperature_small.csv", mode="w", newline='') as file:
    for filename in tqdm(os.listdir('../../../../spiced_week_06_data/ECA_blend_tg')[2:10]):
        if 'TG_STAID' in filename:
            df = parse_file(filename)
            df.to_csv(file, index=False, header=False)


100%|██████████████████████████████████| 8/8 [00:01<00:00,  7.52it/s]


In [74]:
# read in the CSV, check to see if the data looks correct.

df = pd.read_csv('../../../../../spiced_week_06_data/ECA_blend_tg/mean_temperature_small.csv', names = ['staid', 'date', 'tg'])
df

Unnamed: 0,staid,date,tg
0,1,1860-11-09,0
1,1,1860-11-30,0
2,1,1860-12-12,0
3,1,1862-11-18,0
4,1,1864-02-26,0
...,...,...,...
265990,6,2008-09-26,100
265991,6,2008-09-27,117
265992,6,2008-09-28,88
265993,6,2008-09-29,49


In [72]:
# change the file name to "mean_temperature.csv" and run your code with ALL files (5-10 min)


# create a small version first with 2-10 datasets as "mean_temperature_small.csv"  
# use os.listdir('../data/ECA_blend_tg/')[0:10] 
# we will also use the small csv file for the upload test !

from tqdm import tqdm

with open("../../../../../spiced_week_06_data/ECA_blend_tg/mean_temperature.csv", mode="w", newline='') as file:
    for filename in tqdm(os.listdir('../../../../../spiced_week_06_data/ECA_blend_tg')):
        if 'TG_STAID' in filename:
            df = parse_file(filename)
            df.to_csv(file, index=False, header=False)

100%|████████████████████████████| 7070/7070 [09:30<00:00, 12.39it/s]


In [3]:
# let's check the file. import the mean_temperature.csv (it needs time to load)

df_big = pd.read_csv('../../../../../spiced_week_06_data/ECA_blend_tg/mean_temperature.csv', names = ['staid', 'date', 'tg'])
df_big

MemoryError: Unable to allocate 1.76 GiB for an array with shape (117849311,) and data type complex128

In [None]:
# how many rows? 



In [None]:
# how many unique stations?



> ### Our big CSV file is ready for the upload! 

## Objective 3: SQL

As the file is so big we process it outside of python and with `psql`. The `\copy` 
command is one of the fastest way of bulk loading large amounts of data into a database.

### 1. Create `temperature.sql` file with a script that contains:

- A table definition for the table `mean_temperature`
- A foreign key constraint for the column `staid`**(skip if you don't have station table in your climate DB)**
- A `\copy` statement that imports the data

Use this SQL script as a reference:

```postgresql

SELECT transaction_timestamp();

BEGIN;

___ ___ IF EXISTS mean_temperature CASCADE;

CREATE TABLE ___ (
    staid INT,
    date ___,
    ___ ___
);

\COPY ___ FROM ___  WITH (HEADER false, FORMAT csv);

COMMIT;

SELECT transaction_timestamp();
```

**BIG DATA**:
As this step depends heavily on the speed of your network and the processing power of your database, it can take up to several hours to complete.

> **NOTE:** The `BEGIN` and `COMMIT` statements that are wrapped around the actual queries 
setup a transaction. It bundles all statements into a single all-or-nothing operation.

## TRY to use the SQL with THE SMALL VERSION FIRST ! 
## ``mean_temperature_small.csv``

> ### Now we have an .sql file with a script which creates a table and populates it with data from our .csv file

### 2. connect to the climate DB with the psql client via terminal

- assuming you created the `climate` database before
- open a terminal and enter your RDBMS connection via `psql` and add `-d` with the database name and `-f` with the sql file which shall be executed  

> `psql -U postgres -h <placeholder_for_your_IP> -p 5432 -d climate -f temperature.sql`


### 3. verify that the mean_temperature table is now populated with data
Hint (several options)

## Last step: Foreign Keys

assuming that the stations table has a primary key on staid. Add a foreign key to the mean_temperature table using SQL query:

```sql
ALTER TABLE mean_temperature
ADD FOREIGN KEY (staid) REFERENCES stations(staid);
```
