In [74]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import re
import math

%config InlineBackend.figure_format = 'retina'
plt.rc('xtick', labelsize=12) 
plt.rc('ytick', labelsize=12)

pd.set_option('display.max_columns', 500, 'display.max_rows', 500, 'display.width', 1000)

In [75]:
weather = pd.read_csv('../data/weather.csv')

In [76]:
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2944 entries, 0 to 2943
Data columns (total 22 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Station      2944 non-null   int64  
 1   Date         2944 non-null   object 
 2   Tmax         2944 non-null   int64  
 3   Tmin         2944 non-null   int64  
 4   Tavg         2944 non-null   object 
 5   Depart       2944 non-null   object 
 6   DewPoint     2944 non-null   int64  
 7   WetBulb      2944 non-null   object 
 8   Heat         2944 non-null   object 
 9   Cool         2944 non-null   object 
 10  Sunrise      2944 non-null   object 
 11  Sunset       2944 non-null   object 
 12  CodeSum      2944 non-null   object 
 13  Depth        2944 non-null   object 
 14  Water1       2944 non-null   object 
 15  SnowFall     2944 non-null   object 
 16  PrecipTotal  2944 non-null   object 
 17  StnPressure  2944 non-null   object 
 18  SeaLevel     2944 non-null   object 
 19  Result

In [77]:
weather.head()

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56,0,2,0448,1849,,0,M,0.0,0.0,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,M,51,57,0,3,-,-,,M,M,M,0.0,29.18,29.82,2.7,25,9.6
2,1,2007-05-02,59,42,51,-3,42,47,14,0,0447,1850,BR,0,M,0.0,0.0,29.38,30.09,13.0,4,13.4
3,2,2007-05-02,60,43,52,M,42,47,13,0,-,-,BR HZ,M,M,M,0.0,29.44,30.08,13.3,2,13.4
4,1,2007-05-03,66,46,56,2,40,48,9,0,0446,1851,,0,M,0.0,0.0,29.39,30.12,11.7,7,11.9


In [78]:
#changing columns to lowercase
weather.columns = weather.columns.str.lower()

In [79]:
#checking on preciptotal since it is an object

display(weather['preciptotal'].value_counts())

#replacing preciptotal
weather['preciptotal'].replace('  T', 0.005, inplace=True)

#https://www.chicagotribune.com/news/ct-xpm-2011-04-16-ct-wea-0417-asktom-20110416-story.html

0.00    1577
  T      318
0.01     127
0.02      63
0.03      46
0.04      36
0.05      32
0.08      28
0.12      28
0.06      27
0.07      23
0.16      21
0.09      21
0.11      20
0.14      20
0.17      17
0.28      15
0.13      14
0.19      14
0.18      14
0.20      13
0.15      13
0.23      11
0.25      11
0.26      11
0.10      10
0.24      10
0.31       9
0.39       9
0.40       9
0.21       9
0.29       9
0.43       9
0.34       8
0.32       8
0.22       8
0.33       7
0.41       7
0.50       7
0.48       7
0.37       7
0.45       7
0.59       7
0.30       7
0.80       6
0.63       6
0.84       6
0.65       6
0.27       6
0.92       5
0.68       5
0.44       5
0.36       5
0.54       5
0.93       5
0.85       5
1.23       4
0.72       4
0.75       4
0.64       4
0.52       4
0.70       4
0.55       4
0.97       4
0.51       4
0.58       4
0.89       4
0.66       4
0.87       3
0.60       3
1.31       3
1.05       3
1.01       3
0.77       3
1.03       3
0.56       3
0.88       3

In [80]:
# we see that there are both - and M values. Let us standardize them
weather = weather.replace({'-':np.nan,'M': np.nan})

In [81]:
#changing date to dt object
weather['date'] = pd.to_datetime(weather.date)

#creating new date columns to see seasonal/time trends
weather['year'] = weather.date.dt.year
weather['month'] = weather.date.dt.month
weather['weekday'] = weather['date'].apply(dt.date.isoweekday)

In [82]:
# check na

display(weather.isna().sum())

station           0
date              0
tmax              0
tmin              0
tavg             11
depart         1472
dewpoint          0
wetbulb           4
heat             11
cool             11
sunrise        1472
sunset         1472
codesum           0
depth          1472
water1         2944
snowfall       1472
preciptotal       2
stnpressure       4
sealevel          9
resultspeed       0
resultdir         0
avgspeed          3
year              0
month             0
weekday           0
dtype: int64

In [97]:
# inspecting tavg na values
display(weather[weather['tavg'].isna()])

#prove that tavg is ceil average of tmax and tmin
all(np.ceil((weather[weather['tavg'].notna()]['tmax']+weather[weather['tavg'].notna()]['tmin'])/2)
    == weather[weather['tavg'].notna()]['tavg'].astype(float))

# since tavg is the ceil average of tmin and tmax, we will impute the na values as so

Unnamed: 0,station,date,tmax,tmin,tavg,depart,dewpoint,wetbulb,heat,cool,sunrise,sunset,codesum,depth,water1,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed,year,month,weekday
7,2,2007-05-04,78,51,,,42,50,,,,,,,,,0.0,29.36,30.04,10.1,7,10.4,2007,5,5
505,2,2008-07-08,86,46,,,68,71,,,,,TS RA,,,,0.28,29.16,29.8,7.4,24,8.3,2008,7,2
675,2,2008-10-01,62,46,,,41,47,,,,,,,,,0.0,29.3,29.96,10.9,33,11.0,2008,10,3
1637,2,2011-07-22,100,71,,,70,74,,,,,TS TSRA BR,,,,0.14,29.23,29.86,3.8,10,8.2,2011,7,5
2067,2,2012-08-22,84,72,,,51,61,,,,,,,,,0.0,29.39,,4.7,19,,2012,8,3
2211,2,2013-05-02,71,42,,,39,45,,,,,,,,,0.0,29.51,30.17,15.8,2,16.1,2013,5,4
2501,2,2013-09-24,91,52,,,48,54,,,,,,,,,0.0,29.33,30.0,5.8,9,7.7,2013,9,2
2511,2,2013-09-29,84,53,,,48,54,,,,,RA BR,,,,0.22,29.36,30.01,6.3,36,7.8,2013,9,7
2525,2,2013-10-06,76,48,,,44,50,,,,,RA DZ BR,,,,0.06,29.1,29.76,10.1,25,10.6,2013,10,7
2579,2,2014-05-02,80,47,,,43,47,,,,,RA,,,,0.04,29.1,29.79,10.7,23,11.9,2014,5,5


True

In [17]:
# imputing tavg with avg with tmin,tmax

weather.loc[weather['tavg'].isna(),'tavg'] = np.ceil((weather[weather['tavg'].isna()]['tmax']+weather[weather['tavg'].isna()]['tmin'])/2).astype(int)

In [16]:
# Check on weather water 1 values

display(weather.water1.value_counts())

weather = weather.drop('water1',axis=1)

# dropping the whole of Water1 as it has only 1 single variable - M for Missing

Series([], Name: water1, dtype: int64)

In [13]:
# Checking on weather depth values

display(weather.depth.value_counts())
display(weather.station.value_counts())

# We check to see if the values are of similar rows i.e. they correspond to one another

print(all(weather[weather['depth']=='0'].index == weather[weather['station']==1].index))

weather = weather.drop('depth',axis=1)

# Since the indexes are the same, we drop them to prevent high correlation between the columns

0    1472
Name: depth, dtype: int64

2    1472
1    1472
Name: station, dtype: int64

True


In [14]:
# check to see if na values are alternate
print(weather[weather['depart'].isna()]['station'].value_counts())
print(weather[weather['sunset'].isna()]['station'].value_counts())
print(weather[weather['sunrise'].isna()]['station'].value_counts())

# we notice that the depart columns has na values half of our df length. 
# Hence, we decided to check if it is alternate. 
# we found out that indeed, they are alternate and are values mostly missing from station 2
# for departing from normal temperature, we already have temperature average, so we can drop this
# for sunset and sunrise, we can assume that they are similar for both stations, and hence we will do a ffill

2    1472
Name: station, dtype: int64
2    1472
Name: station, dtype: int64
2    1472
Name: station, dtype: int64


In [15]:
# Checking on snowfall values

display(weather['snowfall'].value_counts())

weather = weather.drop('snowfall',axis=1)

# As there is no snow for the collection months of May-Oct, they are mostly M and 0 values. 
# We can hence drop this column


0.0    1459
  T      12
0.1       1
Name: snowfall, dtype: int64

In [16]:
# dropping depart form normal temp column
weather = weather.drop('depart',axis=1)

# ffill for sunset and sunrise

weather[['sunset','sunrise']]= weather[['sunset','sunrise']].fillna(method = 'ffill')

In [17]:
#reformat time
formatted_time = []
for time in weather['sunrise']:
    new_time = re.sub(r'(\d\d)(\d\d)',r'\1:\2',time)
    formatted_time.append(new_time)
    
weather['sunrise'] = formatted_time

In [18]:
#change to dt object
weather['sunrise']= pd.to_datetime(weather['sunrise']).dt.time
weather['sunrise']

0       04:48:00
1       04:48:00
2       04:47:00
3       04:47:00
4       04:46:00
          ...   
2939    06:20:00
2940    06:22:00
2941    06:22:00
2942    06:23:00
2943    06:23:00
Name: sunrise, Length: 2944, dtype: object

In [19]:
# discovered erroneous sunset timings of 1860. Making an intelligent guess that it is 1900
weather['sunset'].replace('1860','1900',inplace=True)
weather['sunset'].replace('1760','1800',inplace=True)
weather['sunset'].replace('1660','1700',inplace=True)

#reformatting time
formatted_time = []
for time in weather['sunset']:
    new_time = re.sub(r'(\d\d)(\d\d)',r'\1:\2',time)
    formatted_time.append(new_time)
    
weather['sunset'] = formatted_time

#changed to dt object
weather['sunset']= pd.to_datetime(weather['sunset']).dt.time
weather['sunset']

0       18:49:00
1       18:49:00
2       18:50:00
3       18:50:00
4       18:51:00
          ...   
2939    16:50:00
2940    16:49:00
2941    16:49:00
2942    16:47:00
2943    16:47:00
Name: sunset, Length: 2944, dtype: object

In [20]:
# From the noaa_weather_qclcd_documentation, codesum ' ' is equals to moderate

weather['codesum'] = weather['codesum'].replace(' ', 'Moderate')

In [21]:
# split codesum from space separated into a list

weather['codesum'] = weather['codesum'].str.split(' ')

In [22]:
# Checking the rest of the na values
display(weather['stnpressure'].isna().sum())
display(weather[weather['stnpressure'].isna()])

4

Unnamed: 0,station,date,tmax,tmin,tavg,dewpoint,wetbulb,heat,cool,sunrise,sunset,codesum,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed,year,month,weekday
87,2,2007-06-13,86,68,77,53,62.0,0,12,04:16:00,19:27:00,[Moderate],0.0,,,7.0,5,,2007,6,3
848,1,2009-06-26,86,69,78,60,,0,13,04:18:00,19:31:00,[Moderate],0.0,,29.85,6.4,4,8.2,2009,6,5
2410,1,2013-08-10,81,64,73,57,,0,8,04:54:00,19:00:00,[Moderate],0.0,,30.08,5.3,5,6.5,2013,8,6
2411,2,2013-08-10,81,68,75,55,63.0,0,10,04:54:00,19:00:00,[Moderate],0.0,,30.07,6.0,6,7.4,2013,8,6


In [23]:
weather['sealevel'].isna().sum()
display(weather[weather['sealevel'].isna()])

Unnamed: 0,station,date,tmax,tmin,tavg,dewpoint,wetbulb,heat,cool,sunrise,sunset,codesum,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed,year,month,weekday
87,2,2007-06-13,86,68,77.0,53,62,0.0,12.0,04:16:00,19:27:00,[Moderate],0.0,,,7.0,5,,2007,6,3
832,1,2009-06-18,80,61,71.0,63,67,0.0,6.0,04:16:00,19:29:00,"[RA, BR]",0.12,29.08,,6.7,16,7.9,2009,6,4
994,1,2009-09-07,77,59,68.0,59,62,0.0,3.0,05:23:00,18:17:00,[BR],0.0,29.39,,5.8,3,4.0,2009,9,1
1732,1,2011-09-08,75,57,66.0,53,59,0.0,1.0,05:24:00,18:15:00,[RA],0.005,29.34,,13.0,2,13.4,2011,9,4
1745,2,2011-09-14,60,48,54.0,45,51,11.0,0.0,05:30:00,18:05:00,"[RA, BR, HZ, FU]",0.005,29.47,,6.0,32,,2011,9,3
1756,1,2011-09-20,74,49,62.0,54,58,3.0,0.0,05:37:00,17:53:00,"[MIFG, BCFG, BR]",0.0,29.26,,7.3,18,7.3,2011,9,2
2067,2,2012-08-22,84,72,78.0,51,61,,,05:06:00,18:43:00,[Moderate],0.0,29.39,,4.7,19,,2012,8,3
2090,1,2012-09-03,88,71,80.0,70,73,0.0,15.0,05:19:00,18:24:00,[BR],0.0,29.17,,4.6,6,4.4,2012,9,1
2743,2,2014-07-23,76,64,70.0,56,61,0.0,5.0,04:36:00,19:20:00,[Moderate],0.0,29.47,,16.4,2,16.7,2014,7,3


In [24]:
display(weather['wetbulb'].isna().sum())
display(weather[weather['wetbulb'].isna()])

# we see that these 3 columns of stnpressure,have very little na values. 
# We will fill them with the median of the month

4

Unnamed: 0,station,date,tmax,tmin,tavg,dewpoint,wetbulb,heat,cool,sunrise,sunset,codesum,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed,year,month,weekday
848,1,2009-06-26,86,69,78,60,,0,13,04:18:00,19:31:00,[Moderate],0.0,,29.85,6.4,4,8.2,2009,6,5
2410,1,2013-08-10,81,64,73,57,,0,8,04:54:00,19:00:00,[Moderate],0.0,,30.08,5.3,5,6.5,2013,8,6
2412,1,2013-08-11,81,60,71,61,,0,6,04:55:00,18:59:00,[RA],0.01,29.35,30.07,2.0,27,3.0,2013,8,7
2415,2,2013-08-12,85,69,77,63,,0,12,04:56:00,18:58:00,[RA],0.66,29.27,29.92,4.5,26,7.7,2013,8,1


In [25]:
# Impute the median of wetbulb according to the median of their months

display(weather.loc[weather['wetbulb'].isna(),'date'].dt.month)

wb_june_median = weather[weather['month'] == 6]['wetbulb'].median()
wb_aug_median = weather[weather['month'] == 8]['wetbulb'].median()

weather.loc[848, 'wetbulb'] = wb_june_median
weather.loc[2410, 'wetbulb'] = wb_aug_median 
weather.loc[2412, 'wetbulb'] = wb_aug_median 
weather.loc[2415, 'wetbulb'] = wb_aug_median 

848     6
2410    8
2412    8
2415    8
Name: date, dtype: int64

In [26]:
# Impute the median of stnpressure according to the median of their months

print(weather.loc[weather['stnpressure'].isna(),'date'].dt.month)

sp_jun_median = round(weather[weather['month'] == 6]['stnpressure'].median(),2)
sp_aug_median = round(weather[weather['month'] == 8]['stnpressure'].median(),2)

weather.loc[87, 'stnpressure'] = sp_jun_median
weather.loc[848, 'stnpressure'] = sp_jun_median
weather.loc[2410, 'stnpressure'] = sp_aug_median
weather.loc[2411, 'stnpressure'] = sp_aug_median

87      6
848     6
2410    8
2411    8
Name: date, dtype: int64


In [27]:
# Impute the median of sealevel according to the median of their months

print(weather.loc[weather['sealevel'].isna(),'date'].dt.month)

sl_jun_median = round(weather[weather['month'] == 6]['sealevel'].median(),2)
sl_jul_median = round(weather[weather['month'] == 7]['sealevel'].median(),2)
sl_aug_median = round(weather[weather['month'] == 8]['sealevel'].median(),2)
sl_sep_median = round(weather[weather['month'] == 9]['sealevel'].median(),2)

weather.loc[87, 'sealevel'] = sl_jun_median
weather.loc[832, 'sealevel'] = sl_jun_median
weather.loc[2743, 'sealevel'] = sl_jul_median
weather.loc[2067, 'sealevel'] = sl_aug_median 
weather.loc[994, 'sealevel'] = sl_sep_median 
weather.loc[1732, 'sealevel'] = sl_sep_median 
weather.loc[1745, 'sealevel'] = sl_sep_median 
weather.loc[1756, 'sealevel'] = sl_sep_median 
weather.loc[2090, 'sealevel'] = sl_sep_median 

87      6
832     6
994     9
1732    9
1745    9
1756    9
2067    8
2090    9
2743    7
Name: date, dtype: int64


In [28]:
# dealing with null values from precip total
precip_na_index = weather[weather['preciptotal'].isna()].index

date1, date2 = weather.loc[precip_na_index]['date'].values

display(weather[weather['date'] == date1])

display(weather[weather['date'] == date1])

# imputing np.nan as 0 value

weather['preciptotal'] = weather['preciptotal'].fillna(0)

# we see that for the nan values in precip total belonging to station 2, the corresponding date on station 1 is 0
# hence, an intelligent guess would be that there is no rain that day and we imputed as so

Unnamed: 0,station,date,tmax,tmin,tavg,dewpoint,wetbulb,heat,cool,sunrise,sunset,codesum,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed,year,month,weekday
116,1,2007-06-28,74,58,66,55,60,0,1,04:19:00,19:31:00,[Moderate],0.0,29.41,30.09,11.9,3,12.5,2007,6,4
117,2,2007-06-28,73,61,67,56,61,0,2,04:19:00,19:31:00,[Moderate],,29.43,30.07,12.2,2,13.3,2007,6,4


Unnamed: 0,station,date,tmax,tmin,tavg,dewpoint,wetbulb,heat,cool,sunrise,sunset,codesum,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed,year,month,weekday
116,1,2007-06-28,74,58,66,55,60,0,1,04:19:00,19:31:00,[Moderate],0.0,29.41,30.09,11.9,3,12.5,2007,6,4
117,2,2007-06-28,73,61,67,56,61,0,2,04:19:00,19:31:00,[Moderate],,29.43,30.07,12.2,2,13.3,2007,6,4


In [29]:
# Convert all the strings to appropriate int type

weather['tavg'] = weather['tavg'].astype(float)
weather['wetbulb'] = weather['wetbulb'].astype(float)
weather['heat'] = weather['heat'].astype(float)
weather['cool'] = weather['cool'].astype(float)
weather['preciptotal'] = weather['preciptotal'].astype(float)
weather['stnpressure'] = weather['stnpressure'].astype(float)
weather['sealevel'] = weather['sealevel'].astype(float)
weather['avgspeed'] = weather['avgspeed'].astype(float)

In [30]:
# the average speed is the wind speed. let us impute these na values

display(weather[weather['avgspeed'].isna()])
display(f'averagespeed b/t stations: {weather["avgspeed"].diff().mean()}')

# using station 1 to predict station 2
weather['avgspeed'] = weather['avgspeed'].fillna(method='ffill')

# as we can see,, the average mean diff between station 1 and 2  is minimal. 
# Hence, we can use the station 1 value to predict station 2 value

Unnamed: 0,station,date,tmax,tmin,tavg,dewpoint,wetbulb,heat,cool,sunrise,sunset,codesum,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed,year,month,weekday
87,2,2007-06-13,86,68,77.0,53,62.0,0.0,12.0,04:16:00,19:27:00,[Moderate],0.0,29.23,29.92,7.0,5,,2007,6,3
1745,2,2011-09-14,60,48,54.0,45,51.0,11.0,0.0,05:30:00,18:05:00,"[RA, BR, HZ, FU]",0.005,29.47,30.03,6.0,32,,2011,9,3
2067,2,2012-08-22,84,72,78.0,51,61.0,,,05:06:00,18:43:00,[Moderate],0.0,29.39,29.97,4.7,19,,2012,8,3


'averagespeed b/t stations: 0.003983656792645559'

In [31]:
# any temp average value above 65 will be recorded as cool, and vice versa heat
# impute na values for heat and cool

def impute_heat_cool(row):
    
    if math.isnan(row['heat']):

        temp = 65 - row['tavg']
        
        if temp<0:
            row['heat'] = 0
            row['cool'] = temp
            
        if temp>0:
            row['cool']=0
            row['heat'] = temp
        else:
            row['heat'] = row['cool']= 0
        
    if math.isnan(row['cool']):
        
        temp = 65 - row['tavg']
        
        if temp<0:
            row['heat'] = 0
            row['cool'] = temp
            
        if temp>0:
            row['cool']=0
            row['heat'] = temp
            
    return row 
        

In [32]:
weather = weather.apply(impute_heat_cool,axis=1)

In [33]:
weather.isna().sum()

station        0
date           0
tmax           0
tmin           0
tavg           0
dewpoint       0
wetbulb        0
heat           0
cool           0
sunrise        0
sunset         0
codesum        0
preciptotal    0
stnpressure    0
sealevel       0
resultspeed    0
resultdir      0
avgspeed       0
year           0
month          0
weekday        0
dtype: int64

In [34]:
#now that we have cleaned the data, we can save to csv file

weather.to_csv('../data/weather_clean.csv',index=False)

In [35]:
pd.read_csv('../data/weather_clean.csv')

Unnamed: 0,station,date,tmax,tmin,tavg,dewpoint,wetbulb,heat,cool,sunrise,sunset,codesum,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed,year,month,weekday
0,1,2007-05-01,83,50,67.0,51,56.0,0.0,2.0,04:48:00,18:49:00,['Moderate'],0.000,29.10,29.82,1.7,27,9.2,2007,5,2
1,2,2007-05-01,84,52,68.0,51,57.0,0.0,3.0,04:48:00,18:49:00,['Moderate'],0.000,29.18,29.82,2.7,25,9.6,2007,5,2
2,1,2007-05-02,59,42,51.0,42,47.0,14.0,0.0,04:47:00,18:50:00,['BR'],0.000,29.38,30.09,13.0,4,13.4,2007,5,3
3,2,2007-05-02,60,43,52.0,42,47.0,13.0,0.0,04:47:00,18:50:00,"['BR', 'HZ']",0.000,29.44,30.08,13.3,2,13.4,2007,5,3
4,1,2007-05-03,66,46,56.0,40,48.0,9.0,0.0,04:46:00,18:51:00,['Moderate'],0.000,29.39,30.12,11.7,7,11.9,2007,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2939,2,2014-10-29,49,40,45.0,34,42.0,20.0,0.0,06:20:00,16:50:00,['Moderate'],0.000,29.42,30.07,8.5,29,9.0,2014,10,3
2940,1,2014-10-30,51,32,42.0,34,40.0,23.0,0.0,06:22:00,16:49:00,['Moderate'],0.000,29.34,30.09,5.1,24,5.5,2014,10,4
2941,2,2014-10-30,53,37,45.0,35,42.0,20.0,0.0,06:22:00,16:49:00,['RA'],0.005,29.41,30.10,5.9,23,6.5,2014,10,4
2942,1,2014-10-31,47,33,40.0,25,33.0,25.0,0.0,06:23:00,16:47:00,"['RA', 'SN']",0.030,29.49,30.20,22.6,34,22.9,2014,10,5
