# Data Preparation — Creating Master Files
The code imports the separate yearly temperature readings and creates three master dataframes: 
- Historic data (1983-2012)
- Recent data (2013-2022)
- Latest incomplete data (January 1, 2023-)

## Importing libraries

In [1]:
import os
import pandas as pd
import glob

## Importing files and creating master dataframes

### Historic temperatures (1983-2012)

#### Joining up all files into single dataframe

In [2]:
### creates empty list to write into
dfs = []

### reads all files from the folder
files = glob.glob("raw_data/yearly_data/historic/*.csv")

### loop runs through all the files
for file in files:
    ### skips first 10 rows and imports all the files
    df = pd.read_csv(file, skiprows = 10)
    ### appends file to the list
    dfs.append(df)

### joins up the files vertically    
data_hist = pd.concat(dfs, ignore_index = True)

### displays dataframe
data_hist

Unnamed: 0,Name,Longitude,Latitude,Elevation (ft),Date,ppt (inches),tmin (degrees F),tmean (degrees F),tmax (degrees F),tdmean (degrees F),vpdmin (hPa),vpdmax (hPa)
0,Alachua,-82.3576,29.6748,147.0,1990-01-01,0.24,51.7,65.8,79.8,60.9,0.16,18.00
1,Alachua,-82.3576,29.6748,147.0,1990-01-02,0.00,35.3,47.1,58.9,34.9,0.93,10.33
2,Alachua,-82.3576,29.6748,147.0,1990-01-03,0.00,40.0,52.6,65.3,44.0,1.15,11.13
3,Alachua,-82.3576,29.6748,147.0,1990-01-04,0.00,47.6,60.4,73.1,51.4,0.57,14.42
4,Alachua,-82.3576,29.6748,147.0,1990-01-05,0.00,52.0,64.4,76.8,60.3,0.47,13.53
...,...,...,...,...,...,...,...,...,...,...,...,...
734181,Washington,-85.6654,30.6106,121.0,1989-12-27,0.00,28.0,44.1,60.1,28.6,0.70,11.94
734182,Washington,-85.6654,30.6106,121.0,1989-12-28,0.00,29.3,47.3,65.3,36.7,0.31,14.45
734183,Washington,-85.6654,30.6106,121.0,1989-12-29,0.00,35.3,51.0,66.8,39.5,0.38,14.42
734184,Washington,-85.6654,30.6106,121.0,1989-12-30,0.00,41.8,54.6,67.5,48.5,0.26,10.85


#### Modifying dataframe

In [3]:
### renames columns
data_hist = data_hist.rename(columns = {"Name": "COUNTY", "Longitude": "LONG",
                                        "Latitude": "LAT", "Elevation (ft)": "ELEV", 
                                        "Date": "DATE", "ppt (inches)": "RAINFALL", 
                                        "tmin (degrees F)": "TMIN", "tmean (degrees F)": "TMEAN", 
                                        "tmax (degrees F)": "TMAX", "tdmean (degrees F)": "TDMEAN",
                                        "vpdmin (hPa)": "VPDMIN", "vpdmax (hPa)": "VPDMAX"})

### converts date to date-time format
data_hist["DATE"] = pd.to_datetime(data_hist["DATE"])

### converts and stores dates in YYYY and MM-YYYY formats
data_hist["MONTH_YEAR"] = pd.to_datetime(data_hist["DATE"]).dt.to_period("M")
data_hist["YEAR"] = pd.to_datetime(data_hist["DATE"]).dt.to_period("Y")

### converts and stores the dates as strings
data_hist["MONTH_YEAR_STR"] = data_hist["MONTH_YEAR"].astype(str)
data_hist["YEAR_STR"] = data_hist["YEAR"].astype(str)

### displays dataframe
data_hist

Unnamed: 0,COUNTY,LONG,LAT,ELEV,DATE,RAINFALL,TMIN,TMEAN,TMAX,TDMEAN,VPDMIN,VPDMAX,MONTH_YEAR,YEAR,MONTH_YEAR_STR,YEAR_STR
0,Alachua,-82.3576,29.6748,147.0,1990-01-01,0.24,51.7,65.8,79.8,60.9,0.16,18.00,1990-01,1990,1990-01,1990
1,Alachua,-82.3576,29.6748,147.0,1990-01-02,0.00,35.3,47.1,58.9,34.9,0.93,10.33,1990-01,1990,1990-01,1990
2,Alachua,-82.3576,29.6748,147.0,1990-01-03,0.00,40.0,52.6,65.3,44.0,1.15,11.13,1990-01,1990,1990-01,1990
3,Alachua,-82.3576,29.6748,147.0,1990-01-04,0.00,47.6,60.4,73.1,51.4,0.57,14.42,1990-01,1990,1990-01,1990
4,Alachua,-82.3576,29.6748,147.0,1990-01-05,0.00,52.0,64.4,76.8,60.3,0.47,13.53,1990-01,1990,1990-01,1990
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
734181,Washington,-85.6654,30.6106,121.0,1989-12-27,0.00,28.0,44.1,60.1,28.6,0.70,11.94,1989-12,1989,1989-12,1989
734182,Washington,-85.6654,30.6106,121.0,1989-12-28,0.00,29.3,47.3,65.3,36.7,0.31,14.45,1989-12,1989,1989-12,1989
734183,Washington,-85.6654,30.6106,121.0,1989-12-29,0.00,35.3,51.0,66.8,39.5,0.38,14.42,1989-12,1989,1989-12,1989
734184,Washington,-85.6654,30.6106,121.0,1989-12-30,0.00,41.8,54.6,67.5,48.5,0.26,10.85,1989-12,1989,1989-12,1989


### Recent temperatures (2013-2022)

#### Joining up all files into single dataframe

In [4]:
### creates empty list to write into
dfs = []

### reads all files from the folder
files = glob.glob("raw_data/yearly_data/recent/*.csv")

### loop runs through all the files
for file in files:
    ### skips first 10 rows and imports all the files
    df = pd.read_csv(file, skiprows = 10)
    ### appends file to the list
    dfs.append(df)

### joins up the files vertically   
data_rec = pd.concat(dfs, ignore_index = True)

### displays dataframe
data_rec

Unnamed: 0,Name,Longitude,Latitude,Elevation (ft),Date,ppt (inches),tmin (degrees F),tmean (degrees F),tmax (degrees F),tdmean (degrees F),vpdmin (hPa),vpdmax (hPa)
0,Alachua,-82.3576,29.6748,147.0,2020-01-01,0.00,40.0,52.8,65.5,39.8,0.22,14.04
1,Alachua,-82.3576,29.6748,147.0,2020-01-02,0.00,42.5,54.0,65.6,40.8,0.52,14.92
2,Alachua,-82.3576,29.6748,147.0,2020-01-03,0.00,46.8,62.1,77.5,57.1,0.42,17.98
3,Alachua,-82.3576,29.6748,147.0,2020-01-04,0.09,67.7,75.2,82.6,68.9,0.75,13.35
4,Alachua,-82.3576,29.6748,147.0,2020-01-05,0.57,40.7,56.8,73.0,50.4,0.62,9.00
...,...,...,...,...,...,...,...,...,...,...,...,...
244679,Washington,-85.6654,30.6106,121.0,2013-12-27,0.00,37.1,47.9,58.7,39.1,0.55,7.94
244680,Washington,-85.6654,30.6106,121.0,2013-12-28,0.00,44.1,51.7,59.2,38.4,1.55,9.30
244681,Washington,-85.6654,30.6106,121.0,2013-12-29,2.09,47.3,54.1,61.0,49.4,0.29,4.85
244682,Washington,-85.6654,30.6106,121.0,2013-12-30,0.02,37.0,46.8,56.6,43.7,0.21,4.11


#### Modifying dataframe

In [5]:
### renames columns
data_rec = data_rec.rename(columns = {"Name": "COUNTY", "Longitude": "LONG",
                                        "Latitude": "LAT", "Elevation (ft)": "ELEV", 
                                        "Date": "DATE", "ppt (inches)": "RAINFALL", 
                                        "tmin (degrees F)": "TMIN", "tmean (degrees F)": "TMEAN", 
                                        "tmax (degrees F)": "TMAX", "tdmean (degrees F)": "TDMEAN",
                                        "vpdmin (hPa)": "VPDMIN", "vpdmax (hPa)": "VPDMAX"})
### converts date to date-time format
data_rec["DATE"] = pd.to_datetime(data_rec["DATE"])

### converts and stores dates in YYYY and MM-YYYY formats
data_rec["MONTH_YEAR"] = pd.to_datetime(data_rec["DATE"]).dt.to_period("M")
data_rec["YEAR"] = pd.to_datetime(data_rec["DATE"]).dt.to_period("Y")

### converts and stores the dates as strings
data_rec["MONTH_YEAR_STR"] = data_rec["MONTH_YEAR"].astype(str)
data_rec["YEAR_STR"] = data_rec["YEAR"].astype(str)

### displays dataframe
data_rec

Unnamed: 0,COUNTY,LONG,LAT,ELEV,DATE,RAINFALL,TMIN,TMEAN,TMAX,TDMEAN,VPDMIN,VPDMAX,MONTH_YEAR,YEAR,MONTH_YEAR_STR,YEAR_STR
0,Alachua,-82.3576,29.6748,147.0,2020-01-01,0.00,40.0,52.8,65.5,39.8,0.22,14.04,2020-01,2020,2020-01,2020
1,Alachua,-82.3576,29.6748,147.0,2020-01-02,0.00,42.5,54.0,65.6,40.8,0.52,14.92,2020-01,2020,2020-01,2020
2,Alachua,-82.3576,29.6748,147.0,2020-01-03,0.00,46.8,62.1,77.5,57.1,0.42,17.98,2020-01,2020,2020-01,2020
3,Alachua,-82.3576,29.6748,147.0,2020-01-04,0.09,67.7,75.2,82.6,68.9,0.75,13.35,2020-01,2020,2020-01,2020
4,Alachua,-82.3576,29.6748,147.0,2020-01-05,0.57,40.7,56.8,73.0,50.4,0.62,9.00,2020-01,2020,2020-01,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244679,Washington,-85.6654,30.6106,121.0,2013-12-27,0.00,37.1,47.9,58.7,39.1,0.55,7.94,2013-12,2013,2013-12,2013
244680,Washington,-85.6654,30.6106,121.0,2013-12-28,0.00,44.1,51.7,59.2,38.4,1.55,9.30,2013-12,2013,2013-12,2013
244681,Washington,-85.6654,30.6106,121.0,2013-12-29,2.09,47.3,54.1,61.0,49.4,0.29,4.85,2013-12,2013,2013-12,2013
244682,Washington,-85.6654,30.6106,121.0,2013-12-30,0.02,37.0,46.8,56.6,43.7,0.21,4.11,2013-12,2013,2013-12,2013


### Latest temperatures (January 1, 2023-)

#### Importing data

In [6]:
data_lat = pd.read_csv("raw_data/yearly_data/2023_latest/2023.csv", skiprows = 10)

data_lat

Unnamed: 0,Name,Longitude,Latitude,Elevation (ft),Date,ppt (inches),tmin (degrees F),tmean (degrees F),tmax (degrees F),tdmean (degrees F),vpdmin (hPa),vpdmax (hPa)
0,Alachua,-82.3576,29.6748,147.0,2023-01-01,0.10,62.0,66.3,70.7,62.2,2.40,4.61
1,Alachua,-82.3576,29.6748,147.0,2023-01-02,0.00,56.8,66.1,75.4,60.5,2.42,8.18
2,Alachua,-82.3576,29.6748,147.0,2023-01-03,0.00,56.5,67.8,79.0,61.4,1.98,12.30
3,Alachua,-82.3576,29.6748,147.0,2023-01-04,0.00,59.9,70.0,80.1,64.0,1.79,13.07
4,Alachua,-82.3576,29.6748,147.0,2023-01-05,0.60,62.4,70.8,79.2,63.9,1.68,9.90
...,...,...,...,...,...,...,...,...,...,...,...,...
11050,Washington,-85.6654,30.6106,121.0,2023-06-10,0.15,67.1,79.7,92.2,70.6,0.89,26.09
11051,Washington,-85.6654,30.6106,121.0,2023-06-11,0.00,70.2,80.4,90.6,68.3,0.32,28.20
11052,Washington,-85.6654,30.6106,121.0,2023-06-12,0.00,72.8,81.4,90.1,72.2,0.71,24.22
11053,Washington,-85.6654,30.6106,121.0,2023-06-13,0.04,67.4,80.0,92.6,70.8,0.64,25.75


#### Modifying dataframe

In [7]:
### renames columns
data_lat = data_lat.rename(columns = {"Name": "COUNTY", "Longitude": "LONG",
                                        "Latitude": "LAT", "Elevation (ft)": "ELEV", 
                                        "Date": "DATE", "ppt (inches)": "RAINFALL", 
                                        "tmin (degrees F)": "TMIN", "tmean (degrees F)": "TMEAN", 
                                        "tmax (degrees F)": "TMAX", "tdmean (degrees F)": "TDMEAN",
                                        "vpdmin (hPa)": "VPDMIN", "vpdmax (hPa)": "VPDMAX"})
### converts date to date-time format
data_lat["DATE"] = pd.to_datetime(data_lat["DATE"])

### converts and stores dates in YYYY and MM-YYYY formats
data_lat["MONTH_YEAR"] = pd.to_datetime(data_lat["DATE"]).dt.to_period("M")
data_lat["YEAR"] = pd.to_datetime(data_lat["DATE"]).dt.to_period("Y")

### converts and stores the dates as strings
data_lat["MONTH_YEAR_STR"] = data_lat["MONTH_YEAR"].astype(str)
data_lat["YEAR_STR"] = data_lat["YEAR"].astype(str)

### displays dataframe
data_lat

Unnamed: 0,COUNTY,LONG,LAT,ELEV,DATE,RAINFALL,TMIN,TMEAN,TMAX,TDMEAN,VPDMIN,VPDMAX,MONTH_YEAR,YEAR,MONTH_YEAR_STR,YEAR_STR
0,Alachua,-82.3576,29.6748,147.0,2023-01-01,0.10,62.0,66.3,70.7,62.2,2.40,4.61,2023-01,2023,2023-01,2023
1,Alachua,-82.3576,29.6748,147.0,2023-01-02,0.00,56.8,66.1,75.4,60.5,2.42,8.18,2023-01,2023,2023-01,2023
2,Alachua,-82.3576,29.6748,147.0,2023-01-03,0.00,56.5,67.8,79.0,61.4,1.98,12.30,2023-01,2023,2023-01,2023
3,Alachua,-82.3576,29.6748,147.0,2023-01-04,0.00,59.9,70.0,80.1,64.0,1.79,13.07,2023-01,2023,2023-01,2023
4,Alachua,-82.3576,29.6748,147.0,2023-01-05,0.60,62.4,70.8,79.2,63.9,1.68,9.90,2023-01,2023,2023-01,2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11050,Washington,-85.6654,30.6106,121.0,2023-06-10,0.15,67.1,79.7,92.2,70.6,0.89,26.09,2023-06,2023,2023-06,2023
11051,Washington,-85.6654,30.6106,121.0,2023-06-11,0.00,70.2,80.4,90.6,68.3,0.32,28.20,2023-06,2023,2023-06,2023
11052,Washington,-85.6654,30.6106,121.0,2023-06-12,0.00,72.8,81.4,90.1,72.2,0.71,24.22,2023-06,2023,2023-06,2023
11053,Washington,-85.6654,30.6106,121.0,2023-06-13,0.04,67.4,80.0,92.6,70.8,0.64,25.75,2023-06,2023,2023-06,2023


## Exporting master dataframes

In [8]:
data_hist.to_csv("raw_data/master_hist.csv", index = False)
data_rec.to_csv("raw_data/master_rec.csv", index = False)
data_lat.to_csv("raw_data/master_lat.csv", index = False)