In [6]:
import pandas as pd
import numpy as np
import csv
import time

# Timestamp the start of the run.
print(time.ctime())

# Render every column of a DataFrame instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# Location of the raw input file.
file_path = './data/noaa.csv'

# The file has no header row, so the column names are supplied here in file order.
columns = """
    STATION DATE LATITUDE LONGITUDE ELEVATION NAME
    TEMP TEMP_ATTRIBUTES DEWP DEWP_ATTRIBUTES
    SLP SLP_ATTRIBUTES STP STP_ATTRIBUTES
    VISIB VISIB_ATTRIBUTES WDSP WDSP_ATTRIBUTES
    MXSPD GUST
    MAX MAX_ATTRIBUTES MIN MIN_ATTRIBUTES
    PRCP PRCP_ATTRIBUTES SNDP FRSHTT
""".split()

# Load the whole file; low_memory=False avoids chunk-by-chunk dtype inference.
# NOTE(review): STATION is inferred as an integer and the preview shows ids of
# differing lengths. If the ids are fixed-width codes with leading zeros, they
# should be read with dtype={'STATION': str} -- confirm against the raw file.
data = pd.read_csv(file_path, names=columns, header=None, low_memory=False)
print(data.head())


      STATION        DATE   LATITUDE  LONGITUDE  ELEVATION  \
0   841599999  2020-03-13        NaN        NaN        NaN   
1   841599999  2020-03-14        NaN        NaN        NaN   
2  1001099999  2023-01-01  70.933333  -8.666667        9.0   
3  1001099999  2023-01-02  70.933333  -8.666667        9.0   
4  1001099999  2023-01-03  70.933333  -8.666667        9.0   

                     NAME  TEMP  TEMP_ATTRIBUTES  DEWP  DEWP_ATTRIBUTES  \
0                    XM21  82.7                6  73.2                6   
1                    XM21  83.1                8  72.4                8   
2  JAN MAYEN NOR NAVY, NO  18.1               24  10.3               24   
3  JAN MAYEN NOR NAVY, NO  25.8               24  20.8               24   
4  JAN MAYEN NOR NAVY, NO  35.8               24  30.9               24   

      SLP  SLP_ATTRIBUTES    STP  STP_ATTRIBUTES  VISIB  VISIB_ATTRIBUTES  \
0  1010.9               6  999.9               0  999.9                 0   
1  1008.7             

In [7]:
# Cell 2: Filter out unnecessary columns

# Each measurement listed here has a companion "<name>_ATTRIBUTES" column.
# Those companions are not required for the analysis, and neither are
# SNDP and FRSHTT.
measurements_with_attributes = ['TEMP', 'DEWP', 'SLP', 'STP', 'VISIB', 'WDSP', 'MAX', 'MIN', 'PRCP']
unused_columns = [f'{name}_ATTRIBUTES' for name in measurements_with_attributes]
unused_columns += ['SNDP', 'FRSHTT']

data = data.drop(columns=unused_columns)
print(data.head())

      STATION        DATE   LATITUDE  LONGITUDE  ELEVATION  \
0   841599999  2020-03-13        NaN        NaN        NaN   
1   841599999  2020-03-14        NaN        NaN        NaN   
2  1001099999  2023-01-01  70.933333  -8.666667        9.0   
3  1001099999  2023-01-02  70.933333  -8.666667        9.0   
4  1001099999  2023-01-03  70.933333  -8.666667        9.0   

                     NAME  TEMP  DEWP     SLP    STP  VISIB  WDSP  MXSPD  \
0                    XM21  82.7  73.2  1010.9  999.9  999.9  14.5   17.1   
1                    XM21  83.1  72.4  1008.7  999.9  999.9  11.0   14.0   
2  JAN MAYEN NOR NAVY, NO  18.1  10.3   981.4  980.2    4.4  18.5   27.6   
3  JAN MAYEN NOR NAVY, NO  25.8  20.8   999.6  998.4    4.1   6.5   16.7   
4  JAN MAYEN NOR NAVY, NO  35.8  30.9  1006.1    4.9    6.4  21.1   26.4   

   GUST   MAX   MIN  PRCP  
0  21.0  83.3  82.2  0.00  
1  17.1  83.7  82.4  0.00  
2  51.3  23.5  12.6  0.29  
3  24.7  34.3  19.4  0.05  
4  34.2  37.6  24.6  0.07  


In [9]:
# Cell 3: Handle Missing Values

# Placeholder numbers that stand in for "no observation" are turned into NaN.
# NOTE(review): the match is by value across every column, so a genuine reading
# of exactly 999.9 (e.g. in SLP, whose values in this data sit around 1000) is
# also treated as missing -- confirm the per-column placeholders against the
# dataset documentation.
placeholder_values = [9999.9, 999.9]  # Adjust as needed
data = data.replace(placeholder_values, np.nan)

# Missing temperatures are imputed with the mean of the observed temperatures.
data['TEMP'] = data['TEMP'].fillna(data['TEMP'].mean())

# Every other required measurement must be present; rows lacking any of them
# are removed. GUST is not in this list, so rows without a gust reading stay.
required_columns = ['DEWP', 'SLP', 'STP', 'VISIB', 'WDSP', 'MXSPD', 'MAX', 'MIN', 'PRCP']
data = data.dropna(subset=required_columns)

# Preview only once cleaning is complete, so the output shows the frame that
# later cells actually receive (and is the same on a first run and a re-run).
print(data.head())

      STATION        DATE   LATITUDE  LONGITUDE  ELEVATION  \
2  1001099999  2023-01-01  70.933333  -8.666667        9.0   
3  1001099999  2023-01-02  70.933333  -8.666667        9.0   
4  1001099999  2023-01-03  70.933333  -8.666667        9.0   
5  1001099999  2023-01-04  70.933333  -8.666667        9.0   
6  1001099999  2023-01-05  70.933333  -8.666667        9.0   

                     NAME  TEMP  DEWP     SLP    STP  VISIB  WDSP  MXSPD  \
2  JAN MAYEN NOR NAVY, NO  18.1  10.3   981.4  980.2    4.4  18.5   27.6   
3  JAN MAYEN NOR NAVY, NO  25.8  20.8   999.6  998.4    4.1   6.5   16.7   
4  JAN MAYEN NOR NAVY, NO  35.8  30.9  1006.1    4.9    6.4  21.1   26.4   
5  JAN MAYEN NOR NAVY, NO  37.0  32.1  1010.6    9.4   10.1  17.7   21.2   
6  JAN MAYEN NOR NAVY, NO  35.4  29.7  1015.4   14.2   12.4  15.9   20.8   

   GUST   MAX   MIN  PRCP  
2  51.3  23.5  12.6  0.29  
3  24.7  34.3  19.4  0.05  
4  34.2  37.6  24.6  0.07  
5  33.0  37.8  35.6  0.04  
6  26.0  37.4  32.2  0.01  


In [4]:
# Filter for more extreme conditions (TEMP thresholds are in Fahrenheit)

# Keep only the days whose temperature is above 100 or below 20.
extreme_temps = data.query('TEMP > 100 or TEMP < 20')
extreme_temps.head()

In [5]:
# Destination for the filtered rows.
filename = './data/extreme_weather_conditions.csv'

# Write the header row and all data rows in one vectorised call instead of
# looping over the frame row by row.
#   index=False   -> only the data columns are written, not the row index.
#   na_rep='nan'  -> missing values keep the literal text the previous
#                    csv.writer loop produced for them.
# The file is written as UTF-8 (the to_csv default) with the platform's line
# ending, rather than the locale encoding and '\r\n' used by csv.writer.
extreme_temps.to_csv(filename, index=False, na_rep='nan')

print(f"Data successfully written to {filename}")
# Timestamp the end of the run.
print(time.ctime())

Data successfully written to extreme_weather_conditions.csv
