In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Let's read the data first. I only keep the rows where the y-label (i.e. RainTomorrow) is there. I just have to discard 3267 rows for that, which is just approximately 2% of the data.

In [None]:
# Load the raw observations; every later cell reads from `df`.
df = pd.read_csv('../input/weather-dataset-rattle-package/weatherAUS.csv')
print(df.RainTomorrow.value_counts(dropna=False))

# Fraction of rows without a target label, computed from the data rather than
# from a hard-coded row count so it stays correct if the CSV changes.
n_missing_target = int(df['RainTomorrow'].isna().sum())
print(n_missing_target / len(df))

# Keep only labelled rows. The explicit copy makes `df` an independent frame,
# so later cells that assign columns do not operate on a slice of the raw data.
df = df[df['RainTomorrow'].notna()].copy()
print(len(df))

# Let's see some rows of the data.

In [None]:
# Display the first rows of the labelled data (the cell's last expression is rendered).
df.head()

# Let's see the datatypes and number of non-null values in the data.

In [None]:
# Print each column's dtype and non-null count.
df.info()

# Above we don't get the proper understanding of the missing values. Let's visualize them using the library missingno.

In [None]:
import missingno
%matplotlib inline

# Draw the nullity matrix (in black) only when at least one value is missing.
has_missing_values = df.isnull().any(axis=None)
if has_missing_values:
    missingno.matrix(df, color=(0.0, 0.0, 0.0))

# We see above that a large number of values are missing in the columns Evaporation, Sunshine, Cloud9am and Cloud3pm.
# Let's go deeper and see what effect these null values have on the whole data and how they would affect our predictions. For this, we plot a dendrogram for the data (based on hierarchical clustering).

In [None]:
# Dendrogram of the columns, grouped by how their missing values co-occur.
missingno.dendrogram(df)

# In the dendrogram, we see the columns divided into two sets. One set is of the columns, we saw had the highest number of missing values and other set which is more complete.  
# The nearer the leaf, the lesser the number of missing values in the column. We see RainTomorrow, RainToday, Rainfall, Date and Location at zero which means they have 0 missing values. The closer the branch to zero, the lesser missing values they have.
# The most intact set of columns for predictions by observation is (Humidity 9 am, Wind Speed 9am, Temp 9 am, MinTemp, MaxTemp, Rainfall, Date, RainTomorrow, RainToday). For any other column, we'll have to fill more data.

# Let's also check if there are any duplicate rows in our data.

In [None]:
# Count fully duplicated rows once, then either report that there are none or
# show a sorted sample that includes every copy of each duplicated row.
duplicate_count = int(df.duplicated().sum())
if duplicate_count == 0:
    print("\nNo duplicated entries found")
else:
    print("\n***Number of duplicated entries: ", duplicate_count)
    all_copies = df[df.duplicated(keep=False)]
    display(all_copies.sort_values(by=list(df.columns)).head())

# There are **NO** duplicate entries found.
# Now let's see the top 5 unique values for our non-numerical data. It also shows us the missing value problem is quite pronounced in WindGustDir and WindDir9am.

In [None]:
def top5(df):
    """Print the five most frequent values of every object/category column.

    Missing values are counted as their own entry (``dropna=False``), so a
    column's NaN share is visible next to its real categories.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``object`` and ``category`` columns are summarised.

    Returns
    -------
    None
        The summary is printed, one table per column, with the value in a
        column named after the source column and its frequency in ``Count``.
    """
    columns = df.select_dtypes(include=['object', 'category']).columns
    for col in columns:
        print("Top 5 unique values of " + col)
        # Name the index and the counts explicitly instead of renaming whatever
        # reset_index() happens to produce: the default column names differ
        # between pandas versions ("index"/<col> vs <col>/"count").
        counts = (
            df[col]
            .value_counts(dropna=False)
            .rename_axis(col)
            .reset_index(name="Count")
        )
        # head(5) already copes with fewer than five rows and, unlike a bound
        # derived from the NaN-excluded count, never hides the NaN row.
        print(counts.head(5))
        print(" ")

# Summarise the non-numerical columns of the labelled data.
top5(df)

# Now, let's see the distribution for all the data columns we have. We'll go according to the order of columns set by Pandas.

In [None]:
import seaborn as sns

# Count of rows per value of the first column, bars ordered by descending frequency.
# Tick labels are blanked out because there are far too many distinct dates to read.
date_col = df.columns[0]
date_order = df[date_col].value_counts().index
fig = sns.catplot(
    data=df,
    x=date_col,
    kind='count',
    order=date_order,
    height=6,
)
fig.set(xticklabels=[], title='Countplot for Date')

# We see that there are multiple values for each date. The reason is that for each date, the data from multiple locations is there. 
# It also shows us for **some dates we don't have the data from all locations**. This fact is important to consider while creating a model for the data.

In [None]:
import seaborn as sns

# Count of rows per value of the second column, most frequent first.
location_col = df.columns[1]
location_order = df[location_col].value_counts().index
fig = sns.catplot(
    data=df,
    x=location_col,
    kind='count',
    order=location_order,
    height=6,
    aspect=2,
)
# Rotate the labels so the long station names do not overlap.
fig.set_xticklabels(rotation=90)
fig.set(title='Countplot for Location')

# We see in the above graph that there is much less data for the locations Nhil, Katherine and Uluru than for the other locations. 
# Rainfall is a local phenomenon, so if we train a model with this data, the model will not be robust enough to predict rainfall for these locations. 
# One has to take that into account. When thinking about the model, this is even more important than the date issue.

In [None]:
import seaborn as sns

# Histogram with a KDE overlay for the third column of the frame.
fig = sns.displot(
    data=df,
    x=df.columns[2],
    kind='hist',
    kde=True,
    color='red',
    height=6,
)
fig.set(title='Histogram for Min Temp')

# By observing the values for MinTemp, we see that its distribution is roughly normal with a right skew. This means it would be a good idea to standardize it while preprocessing for the model. But what if it was not? Then we can use the **Box-Cox transform** to bring our data closer to a normal distribution.
# Also, let's check out the effect of doing Box-Cox transform on MinTemp.

In [None]:
from scipy import stats

# Box-Cox is only defined for strictly positive inputs, so zero and negative
# readings are excluded before transforming (NaNs drop out as well because the
# comparison is False for them). The plot therefore covers a subset of MinTemp.
positive_min_temp = df.loc[df['MinTemp'] > 0, 'MinTemp']

# Bind the fitted lambda to a real name: assigning to `_` would shadow
# IPython's "last output" variable for the rest of the session.
xt, fitted_lambda = stats.boxcox(positive_min_temp)

fig = sns.displot(x=xt, color='green', kde=True, height=6, kind='hist')
# The title says what is plotted, so it cannot be confused with the raw MinTemp histogram.
fig.set(title='Histogram for Box-Cox transformed Min Temp (positive values only)')

# We see that right-skew is totally gone.
# We saw above the statistics over the whole dataset and its Box-Cox transform. To look at the local signal, we can use Location column. Let's plot the same data but conditioned on the Location.

In [None]:
import plotly.express as px
# Interactive histogram of the third column, one colour per Location,
# with a box plot drawn in the margin.
fig = px.histogram(
    df,
    x=df.columns[2],
    color=df['Location'],
    marginal='box',
)
fig.show()

# Look at **each Location separately** to get an idea of the local distribution. *Double click on a location toggles the plot to show just that one location to showing everything. So, after double-clicking on one location, to see other location, you have to do the double-click two times on the new location.*
# One would observe that the global distribution right-skewed (which we solved by the Box-Cox transform).
# But, here we observe that at the local level, there are all kinds of beasts (left-skewed, symmetric and right-skewed). The pooled, global distribution looks much smoother and closer to normal, while the local distributions run helter-skelter. Note that this is only reminiscent of the central limit theorem, not a proof of it: the theorem describes the distribution of sample means, and here we are pooling raw observations rather than sampling means.

In [None]:
import seaborn as sns

# Distribution of MaxTemp. The column is addressed by name: the positional
# lookup df.columns[2] is the same column the MinTemp histogram uses, so it
# cannot be MaxTemp.
fig = sns.displot(x='MaxTemp', data=df, color='blue', kde=True, height=6, kind='hist')
fig.set(title='Histogram for Max Temp')

# By observing the values for MaxTemp, we see that the distribution for it is right-skewed normal. This means it would be a good idea to standardize it while preprocessing for model. Let's also see the plot conditioned on Location.

In [None]:
import plotly.express as px
# Per-location histogram of MaxTemp with a marginal box plot. The column is
# selected by name; df.columns[2] is the column already plotted as MinTemp.
fig = px.histogram(
    df,
    x='MaxTemp',
    color=df['Location'],
    marginal='box',
    color_discrete_sequence=px.colors.qualitative.G10,
)
fig.show()

# Here, again we see that local statistics run helter-skelter but globally, there is a right skew just like MinTemp.

In [None]:
import seaborn as sns

# Distribution of Rainfall, selected by name so the data matches the title.
# NOTE(review): the positional lookup df.columns[3] used here before points at
# the column right after MinTemp (MaxTemp in the weatherAUS layout), so any
# written interpretation of this figure should be re-checked against the new plot.
fig = sns.displot(x='Rainfall', data=df, color=(0.2, 0.5, 0.6, 0.5), kde=True, height=6, kind='hist')
fig.set(title='Histogram for Rainfall')

# Usually rainfall follows a [Tweedie distribution](https://rmets.onlinelibrary.wiley.com/doi/10.1002/joc.2162). But, here we see it is again a right skewed normal curve. We can again apply Box-Cox transform to fix this while feature pre-processing.
# Let's also take a look at rainfall data conditioned on Location.

In [None]:
import plotly.express as px
# Per-location histogram of Rainfall with a marginal box plot. The column is
# selected by name instead of df.columns[3], which does not point at Rainfall
# when the frame starts with Date, Location, MinTemp, MaxTemp.
fig = px.histogram(
    df,
    x='Rainfall',
    color=df['Location'],
    marginal='box',
    color_discrete_sequence=px.colors.qualitative.G10,
)
fig.show()

# In case of rainfall again, we see the right-skewed normal on the global level. On the other hand, at local level, we see all left-skewed, right-skewed and symmetric normals. It would be a good idea to apply Box-Cox transform here also. But at local-level or global-level or it doesn't matter, that's again a question to ponder upon? I leave it up to the reader.

In [None]:
import seaborn as sns

# Enhanced box plot of Evaporation, selected by name so the data matches the
# title (df.columns[4] is not Evaporation when the frame starts with
# Date, Location, MinTemp, MaxTemp, Rainfall).
fig = sns.catplot(x='Evaporation', data=df, kind="boxen", height=6, aspect=2, palette='Set3')

# One tick every 10 units, up to the largest observed value rounded up to the
# next multiple of 10, so the axis is not stretched past the data.
evaporation_max = df['Evaporation'].max()
if pd.notna(evaporation_max):
    tick_stop = int(np.ceil(evaporation_max / 10.0)) * 10
    fig.set(xticks=list(range(0, tick_stop + 10, 10)))
fig.set(title='Enhanced Boxplot for Evaporation')

# The above enhanced boxplot shows us that most of value for Evaporation lie in range 0-10 and as values get higher their number gets lower. An exponentially decreasing curve is what you get here. 
# It also has outliers which are too far off from the distribution which also points to the fact that maybe evaporation values were not recorded accurately. But, this also cannot be completely true since a lot of data for evaporation is missing (~44%).
# One has to think about replacing the NaN value with mean (or any other central tendency) or completely dropping this feature during feature selection.

In [None]:
import plotly.express as px
# Per-location histogram of Evaporation with a marginal box plot; the y axis is
# capped at 400 counts. The column is selected by name instead of
# df.columns[4], which does not point at Evaporation in the weatherAUS layout.
fig = px.histogram(
    df,
    x='Evaporation',
    color=df['Location'],
    marginal='box',
    color_discrete_sequence=px.colors.qualitative.G10,
    range_y=[0, 400],
)
fig.show()

# We looked at the global statistic for Evaporation. Here, at the local level, the distribution doesn't change a lot. All of the curves are completely right-skewed.

In [None]:
import seaborn as sns

# Histogram of Sunshine with a KDE overlay.
fig = sns.displot(
    data=df,
    x='Sunshine',
    kind='hist',
    kde=True,
    color=(0.2, 0.5, 0.6, 0.5),
    height=6,
)
fig.set(title='Histogram for Sunshine')

# We finally get a Tweedie distribution. See the spike at the first bar in the histogram, that's how you recognize one. Even if we try to do Box-Cox here, it won't bring it back.
# What to do in this case? One has to think about replacing the NaN value with mean (or any other central tendency) or completely dropping this feature during feature selection (has 48% missing data).

In [None]:
import plotly.express as px
# Per-location histogram of Sunshine with a marginal box plot; y axis capped at 400.
fig = px.histogram(
    df,
    x='Sunshine',
    color=df['Location'],
    marginal='box',
    color_discrete_sequence=px.colors.qualitative.G10,
    range_y=[0, 400],
)
fig.show()

# Because of missing data, not all locations have data for Sunshine. But, where they do have data, the values are concentrated at zero. This is what finally gives rise to the Tweedie distribution we saw.

In [None]:
import seaborn as sns

# Count of observations per gust direction, most frequent direction first.
gust_dir_order = df['WindGustDir'].value_counts().index
fig = sns.catplot(
    data=df,
    x='WindGustDir',
    kind='count',
    order=gust_dir_order,
    height=6,
    aspect=2,
)
fig.set_xticklabels(rotation=90).set(title='Countplot for WindGustDir')

# WindGustDir can be easily incorporated into the final model because it is not imbalanced and only has ~7% data missing. But, rather than label encoding (which would turn them into numbers and act as a problem for the model), it would be better to one-hot encode them since there aren't many unique values for this variable.

In [None]:
import seaborn as sns

# Histogram of WindGustSpeed with a KDE overlay.
fig = sns.displot(
    data=df,
    x='WindGustSpeed',
    kind='hist',
    kde=True,
    color=(0.2, 0.5, 0.6, 0.5),
    height=6,
)
fig.set(title='Histogram for WindGustSpeed')

# Like WindGustDir, WindGustSpeed has only ~7% data missing. The spaces between the bars we see in the histogram tell us that WindGustSpeed takes discrete values. This can also be a limitation of the measuring device. The curve also conforms to a normal structure with some amount of right skewness. Again, the Box-Cox transform can be applied.

In [None]:
import plotly.express as px
# Per-location histogram of WindGustSpeed with a marginal box plot; y axis capped at 400.
fig = px.histogram(
    df,
    x='WindGustSpeed',
    color=df['Location'],
    marginal='box',
    color_discrete_sequence=px.colors.qualitative.G10,
    range_y=[0, 400],
)
fig.show()

# Let's see the WindGustSpeed at the local level also. We see that most of the distributions are right-skewed. This is unlike the variables above, where all three types of skewness were occurring. This variable is well-behaved.

In [None]:
import seaborn as sns

# Treat each distinct WindGustSpeed as a category and count it, most frequent first.
gust_speed_order = df['WindGustSpeed'].value_counts().index
fig = sns.catplot(
    data=df,
    x='WindGustSpeed',
    kind='count',
    order=gust_speed_order,
    height=6,
    aspect=2,
)
fig.set_xticklabels(rotation=90).set(title='Countplot for WindGustSpeed')

# As we saw above that WindGustSpeed takes discrete values. After further analysis, we can see it only takes 67 different values. This also gives us the option of treating this variable as categorical. One may choose to bin these value to reduce the number of categories.

In [None]:
import seaborn as sns

# Count of observations per 9am wind direction, most frequent direction first.
dir_9am_order = df['WindDir9am'].value_counts().index
fig = sns.catplot(
    data=df,
    x='WindDir9am',
    kind='count',
    order=dir_9am_order,
    height=6,
    aspect=2,
)
fig.set_xticklabels(rotation=90).set(title='Countplot for WindDir9am')

# With ~7% missing data and insignificant imbalance of data, this variable is well-behaved to take into consideration for the final model. 

In [None]:
import seaborn as sns

# Count of observations per 3pm wind direction, most frequent direction first.
dir_3pm_order = df['WindDir3pm'].value_counts().index
fig = sns.catplot(
    data=df,
    x='WindDir3pm',
    kind='count',
    order=dir_3pm_order,
    height=6,
    aspect=2,
)
fig.set_xticklabels(rotation=90).set(title='Countplot for WindDir3pm')

# With ~2% missing data and insignificant imbalance of data, this variable is well-behaved to take into consideration for the final model. 

In [None]:
import seaborn as sns

# Histogram of Humidity9am with a KDE overlay.
fig = sns.displot(
    data=df,
    x='Humidity9am',
    kind='hist',
    kde=True,
    color=(0.2, 0.5, 0.6, 0.5),
    height=6,
)
fig.set(title='Histogram for Humidity9am')

# We again get a Tweedie distribution. This builds a strong case about using a technique that is robust to variables not belonging to the normal distribution. Also, given this variable has only ~1% missing data, this variable is important to include in the model.

In [None]:
import plotly.express as px
# Per-location histogram of Humidity9am with a marginal box plot.
fig = px.histogram(
    df,
    x='Humidity9am',
    color=df['Location'],
    marginal='box',
    color_discrete_sequence=px.colors.qualitative.G10,
)
fig.show()

# Looking at the variable Humidity9am at the local level, we see the left-skew over all the locations along with the point mass being at 100. This variable is well-behaved. 

In [None]:
import seaborn as sns

# Histogram of Humidity3pm with a KDE overlay.
fig = sns.displot(
    data=df,
    x='Humidity3pm',
    kind='hist',
    kde=True,
    color=(0.2, 0.5, 0.6, 0.5),
    height=6,
)
fig.set(title='Histogram for Humidity3pm')

# This variable is well-behaved since it conforms to the normal distribution. Also, since this variable has only ~2.5% data missing, this variable should be included in the final features for the model.

In [None]:
import plotly.express as px
# Per-location histogram of Humidity3pm with a marginal box plot.
fig = px.histogram(
    df,
    x='Humidity3pm',
    color=df['Location'],
    marginal='box',
    color_discrete_sequence=px.colors.qualitative.G10,
)
fig.show()

# Looking at this variable under the local level, we see that it is well-behaved over all locations. Thus, a good variable for the model.

In [None]:
import seaborn as sns

# Histogram of Pressure9am with a KDE overlay.
fig = sns.displot(
    data=df,
    x='Pressure9am',
    kind='hist',
    kde=True,
    color=(0.2, 0.5, 0.6, 0.5),
    height=6,
)
fig.set(title='Histogram for Pressure9am')

# The variable Pressure9am has a left-skewed normal distribution. Also, we can observe that there are clearly 6 groups emerging in the histogram. Thus, we can also do binning and then, treat this variable as a categorical variable. With ~10% data missing, this variable should be a candidate for the final model.

In [None]:
import plotly.express as px
# Per-location histogram of Pressure9am with a marginal box plot.
fig = px.histogram(
    df,
    x='Pressure9am',
    color=df['Location'],
    marginal='box',
    color_discrete_sequence=px.colors.qualitative.G10,
)
fig.show()

# Even at the local level, the distribution are well-behaved. All are left-skewed. This variable will be a good addition for any kind of model.

In [None]:
import seaborn as sns

# One bar per distinct Pressure9am reading; tick labels are blanked because
# there are too many distinct values to label legibly.
fig = sns.catplot(
    data=df,
    x='Pressure9am',
    kind='count',
    height=6,
    aspect=2,
)
fig.set_xticklabels([]).set(title='Countplot for Pressure9am')

# To further build the case for using Pressure9am as a categorical variable, we see from its countplot that we can easily create bins.
# Binning is always a good idea for models that can have different values coming towards them in the future. It makes them robust towards data that the model hasn't seen.

In [None]:
import seaborn as sns

# Histogram of Pressure3pm with a KDE overlay.
fig = sns.displot(
    data=df,
    x='Pressure3pm',
    kind='hist',
    kde=True,
    color=(0.2, 0.5, 0.6, 0.5),
    height=6,
)
fig.set(title='Histogram for Pressure3pm')

# In the case of the Pressure3pm variable, we again see a normal distribution with left-skew. With only ~10% of the data missing, this is a good candidate for the model. Here, we don't see clear bins like in the Pressure9am variable. Let's see its countplot to check whether this variable could also be used as a categorical variable.

In [None]:
import seaborn as sns

# One bar per distinct Pressure3pm reading; tick labels are blanked because
# there are too many distinct values to label legibly.
fig = sns.catplot(
    data=df,
    x='Pressure3pm',
    kind='count',
    height=6,
    aspect=2,
)
fig.set_xticklabels([]).set(title='Countplot for Pressure3pm')

# We see that, in this case also we can create bins and treat this variable as a categorical variable. Let's finally check out its local behaviour.

In [None]:
import plotly.express as px
# Per-location histogram of Pressure3pm with a marginal box plot.
fig = px.histogram(
    df,
    x='Pressure3pm',
    color=df['Location'],
    marginal='box',
    color_discrete_sequence=px.colors.qualitative.G10,
)
fig.show()

# We also confirm the behaviour of the variable at the local level. The variable is well-behaved over all the locations.

In [None]:
import seaborn as sns

# Count of observations per distinct Cloud9am value.
fig = sns.catplot(
    data=df,
    x='Cloud9am',
    kind='count',
    height=6,
    aspect=2,
)
fig.set_xticklabels(rotation=90).set(title='Countplot for Cloud9am')

# Given the discrete values the variable Cloud9am takes, it would be a good idea to use it as a categorical variable. But, with ~37% data missing, whether to drop this variable has to be considered. 
# Or the missing values can be handled by adding a 'Missing' category in this variable (when the variable is treated as categorical). But, that will also create a class imbalance.
# Also, there is imbalance in the variable itself. This again makes a case to not use it.

In [None]:
import plotly.express as px
# Per-location histogram of Cloud9am with a marginal box plot.
fig = px.histogram(
    df,
    x='Cloud9am',
    color=df['Location'],
    marginal='box',
    color_discrete_sequence=px.colors.qualitative.G10,
)
fig.show()

# Even at the local level, we observe there is imbalance in this variable.

In [None]:
import seaborn as sns

# Count of observations per distinct Cloud3pm value.
fig = sns.catplot(
    data=df,
    x='Cloud3pm',
    kind='count',
    height=6,
    aspect=2,
)
fig.set_xticklabels(rotation=90).set(title='Countplot for Cloud3pm')

# Given the discrete values the variable Cloud3pm takes, it would be a good idea to use it as a categorical variable. But, with ~40% data missing, whether to drop this variable has to be considered. 
# Or the missing values can be handled by adding a 'Missing' category in this variable (when the variable is treated as categorical). But, that again will introduce a major class imbalance.
# Also, there is imbalance in the variable itself. This again makes a case to not use it.

In [None]:
import plotly.express as px
# Per-location histogram of Cloud3pm with a marginal box plot.
fig = px.histogram(
    df,
    x='Cloud3pm',
    color=df['Location'],
    marginal='box',
    color_discrete_sequence=px.colors.qualitative.G10,
)
fig.show()

# Even at the local level, we observe there is imbalance in this variable.

In [None]:
import seaborn as sns

# Histogram of Temp9am with a KDE overlay.
fig = sns.displot(
    data=df,
    x='Temp9am',
    kind='hist',
    kde=True,
    color=(0.2, 0.5, 0.6, 0.5),
    height=6,
)
fig.set(title='Histogram for Temp9am')

# This variable conforms to the normal distribution with a right skew. With ~37% missing data, this variable again should be taken with a pinch of salt. Or a model such as decision tree should be used which can handle missing values.
# One other thing to consider is whether this variable can be used a categorical variable (since that would make it robust to the future values). Let's see its countplot to get a better view. 

In [None]:
import seaborn as sns

# One bar per distinct Temp9am reading; tick labels are blanked because
# there are too many distinct values to label legibly.
fig = sns.catplot(
    data=df,
    x='Temp9am',
    kind='count',
    height=6,
    aspect=2,
)
fig.set_xticklabels([]).set(title='Countplot for Temp9am')

# We see that we can do binning and use it as a categorical variable. We can have three categories (Low, Medium, High) or just two categories (Low, High), plus one more to handle missing values (we can set the thresholds for the categories such that finally there would be no imbalance).

In [None]:
import plotly.express as px
# Per-location histogram of Temp9am with a marginal box plot.
fig = px.histogram(
    df,
    x='Temp9am',
    color=df['Location'],
    marginal='box',
    color_discrete_sequence=px.colors.qualitative.G10,
)
fig.show()

# At the local level, we see more right-skewed distributions than left-skewed distribution, thus, pushing the global statistic towards a right-skewed distribution. But, the changes are not too drastic locally. This variable is behaved enough to be used in the final model.

In [None]:
import seaborn as sns

# Histogram of Temp3pm with a KDE overlay.
fig = sns.displot(
    data=df,
    x='Temp3pm',
    kind='hist',
    kde=True,
    color=(0.2, 0.5, 0.6, 0.5),
    height=6,
)
fig.set(title='Histogram for Temp3pm')

# This variable conforms to the normal distribution with a right skew. With ~40% missing data, this variable again should be taken with a pinch of salt. Or a model such as decision tree should be used which can handle missing values.
# One other thing to consider is whether this variable can be used a categorical variable (since that would make it robust to the future values). Let's see its countplot to get a better view. 

In [None]:
import seaborn as sns

# One bar per distinct Temp3pm reading; tick labels are blanked because
# there are too many distinct values to label legibly.
fig = sns.catplot(
    data=df,
    x='Temp3pm',
    kind='count',
    height=6,
    aspect=2,
)
fig.set_xticklabels([]).set(title='Countplot for Temp3pm')

# We see that we can do binning and use it as a categorical variable. We can have three categories (Low, Medium, High) or just two categories (Low, High), plus one more to handle missing values (we can set the thresholds for the categories such that finally there would be no imbalance).

In [None]:
import plotly.express as px
# Per-location histogram of Temp3pm with a marginal box plot.
fig = px.histogram(
    df,
    x='Temp3pm',
    color=df['Location'],
    marginal='box',
    color_discrete_sequence=px.colors.qualitative.G10,
)
fig.show()

# At the local level, we see mostly right-skewed distributions. This variable is better behaved than Temp9am. This feature should be included (once missing values are handled properly).

In [None]:
import seaborn as sns

# Count of observations per RainToday value.
fig = sns.catplot(
    data=df,
    x='RainToday',
    kind='count',
    height=6,
    aspect=2,
)
fig.set(title='Countplot for RainToday')

# For the variable RainToday, we see a huge imbalance. But, then rainfall is indeed an anomaly.
# Now, the question which arises is whether this variable should be included in the final model. The variable to be predicted is RainTomorrow. Let's see the correlation between both.

In [None]:
import numpy as np

# Encode the two rain labels numerically on a COPY of the data: a plain
# `temp = df` only creates an alias, so the encoding would overwrite the labels
# in `df` itself and every later plot of them would show codes instead.
temp = df.copy()
for label_col in ('RainTomorrow', 'RainToday'):
    codes = temp[label_col].astype('category').cat.codes
    # cat.codes marks missing values with -1; turn those back into NaN so they
    # are ignored by the correlation instead of acting as a third class.
    temp[label_col] = codes.where(codes >= 0)

sns.set(rc={'figure.figsize':(8,6)})

# Correlate numeric columns only (string columns such as Date/Location cannot be
# correlated) and compute the matrix once.
corr = temp.select_dtypes(include='number').corr()

# Hide the diagonal and upper triangle with a purely positional mask, so a
# cell is never left unmasked just because its correlation happens to be 0.
upper_triangle = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=upper_triangle)

# We see there is a positive correlation between RainToday and RainTomorrow. But, it can still be dropped since the correlation is not high enough.

In [None]:
import seaborn as sns

# Count of observations per class of the prediction target.
fig = sns.catplot(
    data=df,
    x='RainTomorrow',
    kind='count',
    height=6,
    aspect=2,
)
fig.set(title='Countplot for RainTomorrow')

# We finally reach our goal i.e. RainTomorrow. This is the variable we have to predict.
# The first thing we observe is that there is a huge imbalance between both the classes. Rainfall indeed is an anomaly.
# In this case, what should be the metric then? Accuracy is a big NO since we'll get high accuracy even when our model predicts no rain for the whole test data. In this case, precision, recall and F1-score would be a great fit. Even the popular ROC curve will not be a good fit since it treats both positive and negative class equally. Instead of ROC, we can use average precision (the area under the precision-recall curve) as a metric.