In [6]:
#importing necessary modules
from datetime import datetime as dt
import pandas as pd
import numpy as np
import xarray as xr

import cartopy.crs as ccrs
import cartopy.feature as cfeature

import matplotlib.pyplot as plt

import metpy.calc as mpcalc
import metpy.plots as mpplots

from matplotlib.patheffects import withStroke
from metpy.io import parse_metar_file
from metpy.units import pandas_dataframe_to_unit_arrays
from metpy.cbook import get_test_data
from metpy.units import units

from scipy.stats import zscore
 
from siphon.catalog import TDSCatalog

# Ignore warnings
import warnings

warnings.filterwarnings('ignore')

In [7]:
#Link to datafile: https://drive.google.com/file/d/1kjzuaNx7G6EmjFbIToN6DitoZnbmfy9V/view?usp=sharing

In [8]:
# Open the netCDF file as a xarray Dataset
data = xr.open_dataset('C:\\Users\\Dell\\Documents\\BD final proj\\projdata1.nc4', decode_times = False)

# View a summary of the Dataset
data

In [14]:
print(data.head,'\n\n\n\nOriginal Info:\n\n',data.info)

<bound method Dataset.head of <xarray.Dataset>
Dimensions:   (bnds: 2, lat: 36, lon: 84, time: 1248)
Coordinates:
  * lat       (lat) float64 71.25 70.75 70.25 69.75 ... 55.25 54.75 54.25 53.75
  * lon       (lon) float64 -170.2 -169.8 -169.2 -168.8 ... -129.8 -129.2 -128.8
  * time      (time) int32 0 1 2 3 4 5 6 ... 1241 1242 1243 1244 1245 1246 1247
Dimensions without coordinates: bnds
Data variables:
    irrx      (time, lat, lon) float32 ...
    lat_bnds  (lat, bnds) float64 71.5 71.0 71.0 70.5 ... 54.5 54.0 54.0 53.5
    lon_bnds  (lon, bnds) float64 -170.5 -170.0 -170.0 ... -129.0 -129.0 -128.5
    petx      (time, lat, lon) float32 ...
    pptx      (time, lat, lon) float32 ...
    tmin      (time, lat, lon) float32 ...
    tmax      (time, lat, lon) float32 ...
    rhum      (time, lat, lon) float32 ...
    srad      (time, lat, lon) float32 ...
    vpxx      (time, lat, lon) float32 ...
Attributes:
    Conventions:  CF-1.0
    title:        CGCM1 scenario. mean monthly irradi

In [15]:
data.info()

xarray.Dataset {
dimensions:
	bnds = 2 ;
	lat = 36 ;
	lon = 84 ;
	time = 1248 ;

variables:
	float32 irrx(time, lat, lon) ;
		irrx:units = W m-2 ;
		irrx:long_name = irradiance ;
	float64 lat(lat) ;
		lat:long_name = latitude ;
		lat:units = degrees_north ;
		lat:bounds = lat_bnds ;
	float64 lat_bnds(lat, bnds) ;
	float64 lon(lon) ;
		lon:long_name = longitude ;
		lon:units = degrees_east ;
		lon:bounds = lon_bnds ;
	float64 lon_bnds(lon, bnds) ;
	int32 time(time) ;
		time:units = months since 1997-01-01 0:0:0 ;
		time:calendar = gregorian ;
		time:standard_name = time ;
	float32 petx(time, lat, lon) ;
		petx:units = kg m-2 d-1 ;
		petx:long_name = potential evapotranspiration ;
	float32 pptx(time, lat, lon) ;
		pptx:units = mm/month ;
		pptx:long_name = precipitation ;
	float32 tmin(time, lat, lon) ;
		tmin:units = degrees C ;
		tmin:long_name = min temperature ;
	float32 tmax(time, lat, lon) ;
		tmax:units = degrees C ;
		tmax:long_name = max temperature ;
	float32 rhum(time, lat, lo

In [16]:
#Converting Data to Pandas
data = data.to_dataframe().reset_index()

In [17]:
#Renaming the column Headers
data = data.rename(columns = {'lat':'Latitude', 'lon':'Longitude', 'irrx':'Irradiation','pptx':'Precipitation (mm/month)','tmin':'Min. Temp (C)','tmax':'Max Temp (C)','rhum':'Relative Humidity', 'petx': 'Potential Evapotranspiration','vpxx':'Vapor Pressure (Pa)', 'srad':'Solar Radiation', 'pptx':'Precipitation'})

In [18]:
#Removing the unneeded columns from data
data.drop(['Irradiation', 'Potential Evapotranspiration'], axis = 1, inplace = True)

In [19]:
#Dropping NaN values
data = data.dropna()
data

Unnamed: 0,bnds,Latitude,Longitude,time,lat_bnds,lon_bnds,Precipitation,Min. Temp (C),Max Temp (C),Relative Humidity,Solar Radiation,Vapor Pressure (Pa)
33696,0,71.25,-156.75,0,71.5,-157.0,22.766439,-24.069483,-16.347624,62.089603,0.156222,106.458366
33697,0,71.25,-156.75,1,71.5,-157.0,8.011784,-26.625763,-22.133951,72.114403,286.946442,88.648010
33698,0,71.25,-156.75,2,71.5,-157.0,13.273505,-28.469589,-20.897623,58.534901,4834.741699,63.520851
33699,0,71.25,-156.75,3,71.5,-157.0,5.970510,-27.356468,-11.623947,32.577370,14312.712891,62.337658
33700,0,71.25,-156.75,4,71.5,-157.0,5.084956,-9.137223,-5.175585,58.283066,8741.326172,231.772751
...,...,...,...,...,...,...,...,...,...,...,...,...
7453051,1,53.75,-166.75,1243,53.5,-166.5,70.250221,13.117846,19.315529,70.699043,17206.587891,1394.431152
7453052,1,53.75,-166.75,1244,53.5,-166.5,204.811478,10.616543,16.842976,72.441734,11470.416992,1214.069214
7453053,1,53.75,-166.75,1245,53.5,-166.5,232.000687,6.845821,12.920119,76.360062,5980.557129,1021.652283
7453054,1,53.75,-166.75,1246,53.5,-166.5,230.464981,2.151087,9.873170,72.611328,3094.046143,769.466736


In [20]:
#Removing outliers from each columnn - if values are more than 3 std deviations from the mean, they will be removed

In [21]:
z = zscore(data)
abs_z = np.abs(z)
filtered_data = (abs_z < 3).all(axis=1)
dataA = data[filtered_data]

In [22]:
dataA

Unnamed: 0,bnds,Latitude,Longitude,time,lat_bnds,lon_bnds,Precipitation,Min. Temp (C),Max Temp (C),Relative Humidity,Solar Radiation,Vapor Pressure (Pa)
33696,0,71.25,-156.75,0,71.5,-157.0,22.766439,-24.069483,-16.347624,62.089603,0.156222,106.458366
33697,0,71.25,-156.75,1,71.5,-157.0,8.011784,-26.625763,-22.133951,72.114403,286.946442,88.648010
33698,0,71.25,-156.75,2,71.5,-157.0,13.273505,-28.469589,-20.897623,58.534901,4834.741699,63.520851
33699,0,71.25,-156.75,3,71.5,-157.0,5.970510,-27.356468,-11.623947,32.577370,14312.712891,62.337658
33700,0,71.25,-156.75,4,71.5,-157.0,5.084956,-9.137223,-5.175585,58.283066,8741.326172,231.772751
...,...,...,...,...,...,...,...,...,...,...,...,...
7453051,1,53.75,-166.75,1243,53.5,-166.5,70.250221,13.117846,19.315529,70.699043,17206.587891,1394.431152
7453052,1,53.75,-166.75,1244,53.5,-166.5,204.811478,10.616543,16.842976,72.441734,11470.416992,1214.069214
7453053,1,53.75,-166.75,1245,53.5,-166.5,232.000687,6.845821,12.920119,76.360062,5980.557129,1021.652283
7453054,1,53.75,-166.75,1246,53.5,-166.5,230.464981,2.151087,9.873170,72.611328,3094.046143,769.466736


In [23]:
print(dataA.shape,'\n\n\nFiltered Data Info:\n',dataA.info)

(4172960, 12) 


Filtered Data Info:
 <bound method DataFrame.info of          bnds  Latitude  Longitude  time  lat_bnds  lon_bnds  Precipitation  \
33696       0     71.25    -156.75     0      71.5    -157.0      22.766439   
33697       0     71.25    -156.75     1      71.5    -157.0       8.011784   
33698       0     71.25    -156.75     2      71.5    -157.0      13.273505   
33699       0     71.25    -156.75     3      71.5    -157.0       5.970510   
33700       0     71.25    -156.75     4      71.5    -157.0       5.084956   
...       ...       ...        ...   ...       ...       ...            ...   
7453051     1     53.75    -166.75  1243      53.5    -166.5      70.250221   
7453052     1     53.75    -166.75  1244      53.5    -166.5     204.811478   
7453053     1     53.75    -166.75  1245      53.5    -166.5     232.000687   
7453054     1     53.75    -166.75  1246      53.5    -166.5     230.464981   
7453055     1     53.75    -166.75  1247      53.5    -166.5 