### Weather Dimension
Xiaoxi Jia

Install the Pandas library

In [7]:
!pip install pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Imports

In [8]:
import pandas as pd
import os

In [9]:
# Base folder of the project; the raw text files are read from its 'raw_files' sub-folder
parent_folder_path = '/content/drive/MyDrive/Colab Notebooks'
folder_path = os.path.join(parent_folder_path, 'raw_files')

# Column layout of every raw file: one 'code' column followed by twelve monthly values
months = [
  'january', 'february', 'march', 'april', 'may', 'june',
  'july', 'august', 'september', 'october', 'november', 'december',
]
header_row = ['code', *months]
# 'code' is parsed as a string (its digits are split apart later); each month is parsed as a float
column_dtypes = {'code': str, **dict.fromkeys(months, float)}

# Names of the columns extracted from 'code', in the order they appear inside it
code_elements = ['state_code', 'county_fips', 'element_code', 'year']
# Names of the aggregate columns, one per kind of raw file
aggregate_headers = [
  'average_precipitation',
  'maximum_temperature',
  'minimum_temperature',
  'average_temperature',
]

# Substrings looked up in a raw file's name to decide which aggregate it gets
precipitation = 'pcpncy'
maximum_temperature = 'tmaxcy'
minimum_temperature = 'tmincy'
average_temperature = 'tmpccy'

Convert text files into CSV files, deconstruct the code information, and add aggregate columns

In [10]:
# Rules for the aggregate column of each kind of raw file:
# (substring searched in the file name, name of the new column, row-wise reduction over the months).
# The rules are tried in order and only the first matching one is applied.
aggregate_rules = [
  (precipitation, aggregate_headers[0], pd.DataFrame.mean),
  (maximum_temperature, aggregate_headers[1], pd.DataFrame.max),
  (minimum_temperature, aggregate_headers[2], pd.DataFrame.min),
  (average_temperature, aggregate_headers[3], pd.DataFrame.mean),
]

# to_csv does not create missing directories, so make sure the output folder exists first
processed_folder_path = os.path.join(parent_folder_path, 'processed_files')
os.makedirs(processed_folder_path, exist_ok=True)

for file in os.listdir(folder_path):
  file_path = os.path.join(folder_path, file)
  # Skip sub-directories and anything else that is not a regular file
  if not os.path.isfile(file_path):
    continue
  output_file_name = file.replace(".txt", ".csv")
  output_file_path = os.path.join(processed_folder_path, output_file_name)
  # Parse the whitespace-separated text file and attach the header row
  # (raw string: '\s' is a regex class, not a Python string escape)
  df = pd.read_csv(file_path, sep=r'\s+', header=None, dtype=column_dtypes, names=header_row)
  # Split the 11-digit 'code' into 2 + 3 + 2 + 4 digits and store the parts under 'code_elements';
  # codes that do not match the pattern yield missing values in all four columns
  df[code_elements] = df['code'].str.extract(r'^(\d{2})(\d{3})(\d{2})(\d{4})$')
  for element_code, aggregate_column, reduce_months in aggregate_rules:
    if element_code in file:
      # Reduce the twelve monthly values of each row and store the result as a '7.2f' formatted string
      df[aggregate_column] = reduce_months(df[months], axis=1).apply(lambda x: format(x, '7.2f'))
      break
  # Output the new csv file
  df.to_csv(output_file_path, index=False)

Generate a transformed CSV file for the weather dimension

In [11]:
# List of data frames read from the processed folder
dfs = []
# Key columns shared by every processed file; the merges below join on them
condition = ['state_code', 'county_fips', 'year']
csv_strings = ['code'] + code_elements
# Column selection for each kind of file: the key columns plus that file's aggregate
pcpncy = condition + [aggregate_headers[0]]
tmaxcy = condition + [aggregate_headers[1]]
tmincy = condition + [aggregate_headers[2]]
tmpccy = condition + [aggregate_headers[3]]
csv_header_row = header_row + code_elements + aggregate_headers
output_file_parent_folder = os.path.join(parent_folder_path, 'processed_files')
# Set the data type of 'code'-related columns as string and the rest as float
csv_column_dtypes = {col: str if col in csv_strings else float for col in csv_header_row}

# os.listdir returns entries in arbitrary order, so each frame is identified by the aggregate
# column it actually contains instead of by its position in the directory listing
frames_by_aggregate = {}
for file in sorted(os.listdir(output_file_parent_folder)):
  csv_file_path = os.path.join(output_file_parent_folder, file)
  if not os.path.isfile(csv_file_path):
    continue
  frame = pd.read_csv(csv_file_path, dtype=csv_column_dtypes, low_memory=False)
  dfs.append(frame)
  for aggregate in aggregate_headers:
    if aggregate in frame.columns:
      frames_by_aggregate[aggregate] = frame

# Fail early with a readable message when one of the four inputs is absent
missing_aggregates = [name for name in aggregate_headers if name not in frames_by_aggregate]
if missing_aggregates:
  raise ValueError(f'No processed file provides the column(s): {missing_aggregates}')

# Inner-join the four aggregates on the key columns, starting from the precipitation frame
merged_df = frames_by_aggregate[aggregate_headers[0]][pcpncy]
for selection in (tmaxcy, tmincy, tmpccy):
  # selection[-1] is the aggregate column that identifies the frame to join
  merged_df = merged_df.merge(frames_by_aggregate[selection[-1]][selection], on=condition, how='inner')
# Filter out the incomplete data ('year' is read as a string, hence the string comparison)
merged_df = merged_df[merged_df['year'] != '2023']
# Output the new aggregated csv file; to_csv does not create missing directories
aggregates_folder_path = os.path.join(parent_folder_path, 'aggregates')
os.makedirs(aggregates_folder_path, exist_ok=True)
merged_df.to_csv(os.path.join(aggregates_folder_path, 'weather_dimension.csv'), index=False)

Preview of the newly generated weather dimension CSV file

In [12]:
# Display the merged weather dimension; the rendered output abbreviates the frame to its leading and trailing rows
merged_df

Unnamed: 0,state_code,county_fips,year,average_precipitation,maximum_temperature,minimum_temperature,average_temperature
0,01,001,1895,3.86,91.1,27.7,62.63
1,01,001,1896,3.76,94.5,34.4,65.34
2,01,001,1897,3.70,95.6,33.2,65.15
3,01,001,1898,3.53,93.9,32.7,63.82
4,01,001,1899,3.68,93.7,29.6,63.93
...,...,...,...,...,...,...,...
403767,50,290,2018,1.49,70.2,-9.1,28.52
403768,50,290,2019,1.53,71.9,-11.4,29.92
403769,50,290,2020,1.35,66.0,-22.8,25.18
403770,50,290,2021,1.52,66.8,-17.2,24.57
