In [None]:
# import libraries
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import norm
import matplotlib.pyplot as plt
import numpy.polynomial.polynomial as poly

# Load the raw DGSR records. read_csv already returns a DataFrame, so wrapping
# the result in another pd.DataFrame(...) call is unnecessary.
_data_set = pd.read_csv('../input/surface-solar-radiation-dataset/New folder/DGSR_Data.csv')

In [None]:
# helpful functions

def add_mean_to_data(dataset, group_by):
    """Average the 'Estimated DGSR' column within each group of ``group_by``.

    Parameters
    ----------
    dataset : DataFrame containing ``group_by`` and 'Estimated DGSR' columns.
    group_by : name of the column to group on.

    Returns
    -------
    DataFrame with the grouping column followed by a 'Mean' column.
    """
    grouped = dataset.groupby([group_by])
    group_means = grouped.agg(Mean=('Estimated DGSR', 'mean'))
    # Turn the group key from the index back into an ordinary column.
    return group_means.reset_index()

def mult_degree(x, y, deg, data_label, col, numm=None):
    """Fit a polynomial to ``(x, y)`` and draw the fitted curve with pyplot.

    Parameters
    ----------
    x, y : sample coordinates; ``x`` is converted to a numpy array.
    deg : degree passed to ``poly.polyfit``.
    data_label : legend label for the fitted curve.
    col : line colour for the fitted curve.
    numm : number of evenly spaced evaluation points between ``x.min()`` and
        ``x.max()``; 500 when left as ``None``.

    Returns
    -------
    The fitted values at the ``numm`` evaluation points.
    """
    # Identity check: `== None` can be overridden by the operand's __eq__.
    if numm is None:
        numm = 500

    x = np.array(x)
    coeff = poly.polyfit(x, y, deg)
    x_new = np.linspace(x.min(), x.max(), numm)
    ffit = poly.polyval(x_new, coeff)
    plt.plot(x_new, ffit, label=data_label, color=col)
    # Re-create the legend so the newly added curve is listed.
    plt.legend(loc='best')

    return ffit

def get_monthly_data(dataset, month):
    """Return the 'Estimated DGSR' values of one month as a one-column frame.

    Rows whose 'Month' equals ``month`` are kept, the index is renumbered
    from 0, and the single column is named ``e_dgsr_<month>``.
    """
    in_month = dataset['Month'] == month
    monthly_values = dataset[in_month]['Estimated DGSR'].reset_index(drop=True)
    monthly_frame = pd.DataFrame(monthly_values)
    monthly_frame.columns = ['e_dgsr_%s' % (month)]
    return monthly_frame

def get_norm(x, data):
    """Evaluate at ``x`` the normal pdf parameterised by ``data``.

    The location is ``data.mean()`` and the scale is ``data.std()``.
    """
    location = data.mean()
    spread = data.std()
    return norm.pdf(x, location, spread)

## Getting some information about the dataset

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
_data_set.info()

# Non-null count of the first column, read by position with .iloc
# (count()[0] relies on the deprecated positional fallback of Series[int]).
num_of_entries = _data_set.count().iloc[0]
# Fractional part of records / 365, rounded to 4 decimals.
divisibility = round(round(num_of_entries/365, 4) - np.floor(num_of_entries/365), 4)

print('The total number of records is {}, the divisibility to full year is {}'.format(
    num_of_entries, divisibility))

It means there will be days that either have a 0 value or no recorded value at all.

Now, let's see if there is any null value in the dataset

In [None]:
# Number of missing entries in each column.
_data_set.isna().sum()

So we do not have any null value in the data. However, as we saw previously, we might have incomplete values.

In [None]:
# For each of Year / Month / Day: the distinct values, and a dictionary that
# maps every distinct value to the number of records carrying it.
#
# The comparison must be applied to the whole column *before* np.where;
# indexing the boolean result with [0] first (as the month/day versions did)
# looks only at the first record and does not count anything.
year_values = np.array(_data_set['Year'])
year_list = np.unique(year_values)
__year_list = {i: len(np.where(year_values == i)[0]) for i in year_list}

month_values = np.array(_data_set['Month'])
month_list = np.unique(month_values)
__month_list = {i: len(np.where(month_values == i)[0]) for i in month_list}

day_values = np.array(_data_set['Day'])
day_list = np.unique(day_values)
__day_list = {i: len(np.where(day_values == i)[0]) for i in day_list}

In [None]:
# Show the per-year, per-month and per-day record counts, one dictionary per line.
for counts in (__year_list, __month_list, __day_list):
    print(counts)

* The years 1989 and 2020 are not complete
* Month # 6 (June) does not have any recorded DGSR value

## Getting complete years

In [None]:
# Keep only the records from 1990 through 2019 (both ends included).
years = _data_set['Year']
in_selected_years = (1990 <= years) & (years <= 2019)
data_set = _data_set[in_selected_years]

## Getting a look at Estimated DGSR data

In [None]:
# Summary statistics of Estimated DGSR at a custom set of percentiles.
dgsr_percentiles = [0.1, 0.25, 0.5, 0.75, 0.99]
data_set['Estimated DGSR'].describe(percentiles=dgsr_percentiles).T

In [None]:
# distribution of DGSR value and checking for potential outliers

dgsr = data_set['Estimated DGSR']

# Two stacked panels: histogram + KDE on top, horizontal box plot below.
fig, (ax_hist, ax_box) = plt.subplots(2, 1, figsize=(8, 6))

dgsr.plot.hist(bins=14, ec='k', fc='w', ax=ax_hist)
# Remember the histogram's x-range so the KDE overlay can be clipped to it.
hist_xlim = ax_hist.get_xlim()
ax_hist.set_xlabel('Estimated DGSR')
ax_hist.set_ylabel('Count')

# The KDE is drawn on a twin y-axis that shares the histogram's x-axis.
ax_kde = ax_hist.twinx()
dgsr.plot(kind='kde', ax=ax_kde)
ax_kde.set_title('Distribution of Estimated DGSR')
ax_kde.set_xlim(hist_xlim)

dgsr.plot.box(vert=False, ax=ax_box)
# grid() takes a boolean; the string 'on' only worked because it is truthy.
ax_box.grid(True)
fig.tight_layout()

There are no outliers in the Estimated DGSR values.

# Boxplots

## Yearly

In [None]:
# yearly boxplot for DGSR: the 'Month' and 'Day' columns are removed first so
# they are not drawn, then the remaining data is grouped by 'Year'
without_month_day = data_set.drop(columns=['Month', 'Day'])
_ = pd.plotting.boxplot(without_month_day, by='Year', figsize=(16, 5))

## Monthly

In [None]:
# monthly boxplot for DGSR: the 'Year' and 'Day' columns are removed first so
# they are not drawn, then the remaining data is grouped by 'Month'
without_year_day = data_set.drop(columns=['Year', 'Day'])
_ = pd.plotting.boxplot(without_year_day, by='Month', figsize=(16, 5))

## Daily

In [None]:
# daily boxplot for DGSR: the 'Month' and 'Year' columns are removed first so
# they are not drawn, then the remaining data is grouped by 'Day'
without_month_year = data_set.drop(columns=['Month', 'Year'])
_ = pd.plotting.boxplot(without_month_year, by='Day', figsize=(16, 5))

There are some major outliers in the months of April/May/July/August and November/December. We'll come back to these ones later on.

# Scatter plots

## Yearly mean

In [None]:
# yearly mean

gby_year = add_mean_to_data(data_set, 'Year')

# `grid` is a boolean option; the string 'on' only worked because any
# non-empty string is truthy (so 'off' would have enabled the grid as well).
_ = gby_year.plot.scatter(x='Year', y='Mean', label='Yearly mean [Estimated DGSR]',
                          figsize=(15, 4), grid=True)

# Disabled 6th order fit of the yearly means, kept for reference:
#mult_degree(gby_year.Year, gby_year.Mean, 6, 'A', 'r')

The data is quite irregular. Also, a 6th order polynomial fit is poorly conditioned, so the shape of the yearly means cannot be reliably described by a high-order polynomial.

## Monthly mean

In [None]:
# monthly mean
gby_month = add_mean_to_data(data_set, 'Month')

# `grid` is a boolean option; the string 'on' only worked because any
# non-empty string is truthy.
_ = gby_month.plot.scatter(x='Month', y='Mean', label='Monthly mean [Estimated DGSR]',
                           figsize=(15, 4), grid=True)
# NOTE: mult_degree below calls plt.legend(loc='best') again after adding its curve.
_ = plt.legend(loc='upper center')

x, y = gby_month.Month, gby_month.Mean

# t holds the 2nd order fit evaluated on mult_degree's default grid.
t = mult_degree(x, y, 2, '2nd order polynomial', 'r')

### Determining the missing average value for the month of June
Data is quite well fitted by a 2nd order polynomial so we can try and determine the average value for the month of June as well.

In [None]:
# Estimate the value for June
# The months are read from gby_month directly instead of the global `x`,
# which other cells rebind to different data.
fit_months = np.array(gby_month.Month)

# Same grid that mult_degree uses by default (500 points); printed next to
# len(t) as a check that both have the same length.
x_new = np.linspace(fit_months.min(), fit_months.max(), 500)
print('The number of data points are {} and the number of fitted points are {}.'.format(len(x_new), len(t)))

# Evaluate the 2nd order polynomial at month 6 exactly, rather than reading the
# fitted curve at the first grid point that happens to lie at or above 6.
june_coeff = poly.polyfit(fit_months, gby_month.Mean, 2)
mean_value_for_june = poly.polyval(6, june_coeff)
print('The mean Estimated DGSR value for the month of June is {}.'.format(round(mean_value_for_june, 2)))

## Daily mean

In [None]:
# daily mean
gby_day = add_mean_to_data(data_set, 'Day')

# `grid` is a boolean option; the string 'on' only worked because any
# non-empty string is truthy.
_ = gby_day.plot.scatter(x='Day', y='Mean', label='Daily mean [Estimated DGSR]',
                         figsize=(15, 4), grid=True)

# just for fun, let's try and fit the daily mean data using polynomials

x, y = gby_day.Day, gby_day.Mean

# (degree, legend label, line colour) of every fit drawn over the scatter.
daily_fits = [
    (8, '8th order poly_fit', 'r'),
    (10, '10th order poly_fit', 'g'),
    (12, '12th order poly_fit', 'b'),
    (14, '14th order poly_fit', 'm'),
    (18, '18th order poly_fit', 'orange'),
]
for fit_degree, fit_label, fit_colour in daily_fits:
    _ = mult_degree(x, y, fit_degree, fit_label, fit_colour)

It can be seen that the 14th and 18th order polynomials do fit the shape of the curve better, but they overestimate the fit in the beginning and the end.

The 10th and 12th order polynomials, on the other hand, are quite okay as compared to the other two higher order polynomials.

## Taking a sneak peek at months with outliers

In [None]:
# Month numbers and matching legend labels of the months being inspected.
outlier_months = [4, 5, 7, 8, 11, 12]
labels = ['apr', 'may', 'jul', 'aug', 'nov', 'dec']

# One single-column frame of Estimated DGSR values per month.
apr, may, jul, aug, nov, dec = [get_monthly_data(data_set, m) for m in outlier_months]

# Common grid on which every month's normal pdf is evaluated.
x = np.linspace(1, 20, 50)

# Normal pdf per month, parameterised by that month's mean and standard deviation.
to_plot = [get_norm(x, monthly) for monthly in (apr, may, jul, aug, nov, dec)]
y_apr, y_may, y_jul, y_aug, y_nov, y_dec = to_plot

plt.figure(figsize=(10, 6))
plt.suptitle('scipy.norm.pdf distributions of months with outliers')

# 2 x 3 grid of panels; subplot positions are 1-based.
for position, (month_label, density) in enumerate(zip(labels, to_plot), start=1):
    plt.subplot(2, 3, position)
    plt.plot(x, density, label=month_label)
    plt.legend(loc='best')

# Lay the figure out once, after every panel exists.
plt.tight_layout()