In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import seaborn as sns
import matplotlib
import matplotlib as mpl
from matplotlib import pyplot as plt
%matplotlib inline

# Show every column and every row when a DataFrame is displayed
# (None removes the truncation limit). Uses the documented pd.set_option
# entry point rather than the incidental `pd.pandas` self-reference.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import warnings
# Silence all warnings to keep the notebook output clean. Note that this is a
# blanket filter: deprecation and pandas assignment warnings are hidden too.
warnings.filterwarnings('ignore')
import missingno as msno

import os
# Print the full path of every file found below the Kaggle input directory.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the weather observations and work on a copy, so that `data` keeps the
# frame exactly as it was read from disk.
csv_path = '../input/weather-dataset-rattle-package/weatherAUS.csv'
data = pd.read_csv(csv_path)
df = data.copy()

# Preview the first rows (head() defaults to n = 5) as a quick check that the
# frame holds the expected kind of data.
df.head()

In [None]:
# Print a concise summary of the DataFrame: index dtype, column dtypes,
# non-null counts per column and memory usage.
df.info()

In [None]:
# Summary statistics for all columns (include='all' keeps the non-numeric
# ones as well), transposed so that each variable becomes one row.
df.describe(include='all').transpose()

# MissingNo visualizations

Pandas provides functions for counting the missing values in a dataset. The missingno library goes one step further and visualizes how those missing values are distributed. Its plots show where the missing values are located in each column and whether missingness in one column is correlated with missingness in another.

In [None]:
# Missing-value count per column, largest first.
miss = df.isna().sum().sort_values(ascending=False)
# The same counts expressed as a percentage of all rows.
miss_per = miss / len(df) * 100
# Side-by-side table; both columns share the ordering of `miss`.
pd.DataFrame({
    'MissingNo_records': miss,
    'percentage of missing data': miss_per.to_numpy(),
})

In [None]:
# Nullity matrix of the whole dataset; white lines indicate missing values.
# The trailing semicolon suppresses the textual repr of the returned object.
msno.matrix(df);

In [None]:
# Bar chart of the non-missing value count per column. The trailing semicolon
# keeps the returned Axes repr out of the cell output.
msno.bar(df);

In [None]:
# Discard rows where either rain label is missing; the plots below colour by
# these columns.
label_columns = ['RainTomorrow', 'RainToday']
df = df.dropna(subset=label_columns)

# Re-draw the completeness bar chart for the filtered frame. The trailing
# semicolon keeps the returned Axes repr out of the cell output.
msno.bar(df);

# Frequency of categorical variables with histogram

In [None]:
# Parse the date strings (expected as YYYY-MM-DD) into datetimes.
parsed_dates = pd.to_datetime(df['Date'], format='%Y-%m-%d')
df['Date'] = parsed_dates

# Calendar features derived from the parsed date.
df['Year'] = parsed_dates.dt.year.astype('int16')
df['Month'] = parsed_dates.dt.month.astype('int16')
df['Year_and_Month'] = parsed_dates.array.strftime('%Y-%m')

# Daily temperature spread (maximum minus minimum).
df['Different_Temp'] = df['MaxTemp'] - df['MinTemp']

df.head()

In [None]:
a = 0  # running figure/table number; plotting cells read it inside their titles


def figure():
    """Advance the global counter ``a`` by one and return its caption label.

    Returns:
        str: ``'Tables and Graphs_<n>'`` where ``<n>`` is the updated value of ``a``.
    """
    global a
    a = a + 1
    return f'Tables and Graphs_{a}'

In [None]:
# Columns stored with the generic 'object' dtype are treated as categorical.
categorical_variables = [
    column for column, dtype in df.dtypes.items() if dtype == 'object'
]

In [None]:
# One frequency histogram per categorical column; the [:-1] slice skips the
# last entry of `categorical_variables`.
# NOTE(review): presumably that last entry is 'Year_and_Month' (the most
# recently added object column) — confirm.
for i in categorical_variables[:-1]:
    figure()  # advances the global counter `a` used in the title below
    chart_title = f"Tables and Graphs_{a}: Frequency of {i}"
    Tables_Graphs = px.histogram(df, x=i, title=chart_title)
    # Order the bars from the least to the most frequent category.
    Tables_Graphs.update_xaxes(categoryorder='total ascending')
    Tables_Graphs.show()

# Distribution of numerical variables with histogram and boxplot/violin

In [None]:
# Record counts per 'Year_and_Month' bucket, split by the RainToday label.
figure()  # advances the global counter `a` used in the title below
chart_title = f"Tables_Graphs_{a}: Rainy Day Distribution in Year_and_Month Time Series"
Tables_Graphs = px.histogram(
    df,
    x='Year_and_Month',
    color='RainToday',
    title=chart_title,
)
Tables_Graphs.show()

In [None]:
# Every column whose dtype is not the generic 'object' dtype.
num_variables = [
    column for column, dtype in df.dtypes.items() if dtype != 'object'
]

In [None]:
# Columns plotted as violin + histogram pairs in the next cell.
# NOTE(review): 'WindGustDir' looks like a direction label rather than a
# numeric measure — confirm that it belongs in this list.
cls_1 = [
    'Different_Temp', 'MaxTemp', 'MinTemp',
    'Evaporation', 'Sunshine',
    'WindGustDir', 'WindGustSpeed',
]

In [None]:
# For each selected column draw a violin plot followed by a histogram, both
# coloured by the RainTomorrow label. Each chart gets its own figure number.
for i in cls_1:
    figure()  # advances the global counter `a` used in the title below
    violin_title = f"Tables_Graphs_{a}: Violinplot of {i}"
    Tables_Graphs = px.violin(df, x=i, color='RainTomorrow', title=violin_title)
    Tables_Graphs.show()

    figure()
    histogram_title = f"Tables_Graphs_{a}: Frequency Distribution of {i}"
    Tables_Graphs = px.histogram(df, x=i, color='RainTomorrow', title=histogram_title)
    Tables_Graphs.show()

# Heatmap for Correlation

In [None]:
# missingno heatmap: correlation between the missing-value patterns of the
# columns (not between the column values themselves).
msno.heatmap(df);

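- This heatmap shows *nullity* correlation: how strongly the presence or absence of a value in one column is related to that in another column, not the correlation between the values themselves. The nullity correlations between Temp3pm-Humidity3pm (0.9), Cloud3pm-Cloud9am (0.8), WindSpeed3pm-WindDir3pm (0.8) and Sunshine-Evaporation (0.8) are high, i.e. these pairs tend to be missing in the same rows.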

# Scatter matrix and/or OLS Scatter Plot for numerical variables
- Additionally RainToday and RainTomorrow variables as color dimension

In [None]:
# Columns averaged per month for the scatter plots below.
cls_1 = [
    'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
    'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
    'Humidity9am', 'Humidity3pm',
    'Pressure9am', 'Pressure3pm',
    'Cloud9am', 'Cloud3pm',
    'Temp9am', 'Temp3pm',
    'Different_Temp',
    'Year', 'Month',
]

In [None]:
# Mean of each selected column per (Year_and_Month, RainTomorrow) group,
# flattened back into ordinary columns.
monthly_keys = ['Year_and_Month', 'RainTomorrow']
df_0 = (
    df.drop(columns=['Date'])
      .groupby(monthly_keys)[cls_1]
      .mean()
      .reset_index()
)

In [None]:
 # (x, y) column pairs drawn as scatter plots in the next cell.
 cls_2 = [
     ('MinTemp', 'MaxTemp'),
     # temperature spread against other measures
     ('Different_Temp', 'Evaporation'),
     ('Different_Temp', 'Sunshine'),
     ('Different_Temp', 'Humidity9am'),
     ('Different_Temp', 'Pressure9am'),
     # rainfall against other measures
     ('MaxTemp', 'Rainfall'),
     ('Rainfall', 'Evaporation'),
     ('Rainfall', 'Sunshine'),
     ('Rainfall', 'WindSpeed9am'),
     ('Rainfall', 'Humidity9am'),
     ('Rainfall', 'Pressure9am'),
 ]

In [None]:
# Scatter plot of the monthly means for every (x, y) pair, coloured by
# RainTomorrow, with an OLS trendline drawn in green.
for pair in cls_2:
    figure()  # advances the global counter `a` used in the title below
    x_column, y_column = pair
    scatter_title = f"fig_{a}: Scatterplot of {pair}"
    Tables_Graphs = px.scatter(
        df_0,
        x=x_column,
        y=y_column,
        color='RainTomorrow',
        title=scatter_title,
        trendline="ols",
        trendline_color_override="green",
    )
    Tables_Graphs.show()

# Time series (preferably monthly) line chart for numerical variables
- RainToday and RainTomorrow variables as 3rd variable with color dimension
- Tip: Use groupby method to group by month/year and calculate mean values

In [None]:
# Columns drawn as time-series lines (and reused for the radar charts).
cls_1 = [
    'MinTemp', 'MaxTemp', 'Different_Temp',
    'Rainfall', 'Evaporation', 'Sunshine',
    'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
    'Humidity9am', 'Humidity3pm',
    'Pressure9am', 'Pressure3pm',
    'Cloud9am', 'Cloud3pm',
    'Temp9am', 'Temp3pm',
]

In [None]:
# Mean of every selected column per period and RainTomorrow class, computed
# directly from the daily records in `df`.
# Aggregating the pre-averaged `df_0` instead would yield a mean of monthly
# means (each month weighted equally, whatever its number of records) and
# would group on Month/Year values that had themselves been averaged to floats.
df1 = df.groupby(['Year_and_Month', 'RainTomorrow'])[cls_1].mean().reset_index()  # monthly series
df2 = df.groupby(['Month', 'RainTomorrow'])[cls_1].mean().reset_index()           # calendar month
df3 = df.groupby(['Year', 'RainTomorrow'])[cls_1].mean().reset_index()            # year

In [None]:
# Line chart per column over the 'Year_and_Month' axis, one line per
# RainTomorrow class.
for i in cls_1:
    figure()  # advances the global counter `a` used in the title below
    line_title = f"Tables_Graphs_{a}: Lineplot of {i}"
    Tables_Graphs = px.line(
        df1, x='Year_and_Month', y=i, color='RainTomorrow', title=line_title
    )
    Tables_Graphs.show()

In [None]:
# Line chart per column over the 'Month' axis, one line per RainTomorrow class.
for i in cls_1:
    figure()  # advances the global counter `a` used in the title below
    line_title = f"Tables_Graphs_{a}: Lineplot of {i}"
    Tables_Graphs = px.line(
        df2, x='Month', y=i, color='RainTomorrow', title=line_title
    )
    Tables_Graphs.show()

In [None]:
# Line chart per column over the 'Year' axis, one line per RainTomorrow class.
# Each figure is rendered by .show(); no trailing figure expression follows the
# loop, so the last chart is not displayed a second time.
for i in cls_1:
    figure()  # advances the global counter `a` used in the title below
    line_title = f"Tables_Graphs_{a}: Lineplot of {i}"
    Tables_Graphs = px.line(
        df3, x='Year', y=i, color='RainTomorrow', title=line_title
    )
    Tables_Graphs.show()

# Polar/Radar chart for numerical variables and Wind Direction as the 2nd variable
- Tip: Use groupby method and calculate mean values

In [None]:
# Mean of each selected column per wind-gust direction, computed separately
# for rows labelled RainTomorrow == 'Yes' (df4) and 'No' (df5).
rain_tomorrow = df['RainTomorrow']
df4 = df.loc[rain_tomorrow == 'Yes'].groupby('WindGustDir')[cls_1].mean().reset_index()
df5 = df.loc[rain_tomorrow == 'No'].groupby('WindGustDir')[cls_1].mean().reset_index()

In [None]:
# One radar chart per column: mean value by wind-gust direction, with one
# trace for rows followed by rain (df4) and one for rows that were not (df5).
# `go` (plotly.graph_objects) is imported in the notebook's first cell.
radar_groups = [
    ('RainTomorrow = Yes', df4),
    ('RainTomorrow = No', df5),
]

for i in cls_1:
    figure()  # advances the global counter `a` used in the title below
    Tables_Graphs = go.Figure()

    for trace_label, frame in radar_groups:
        Tables_Graphs.add_trace(go.Scatterpolar(
            r=frame[i],
            # Each trace uses its own direction labels, so the values stay
            # aligned even if the two frames do not list identical directions.
            theta=frame['WindGustDir'],
            fill='toself',
            name=trace_label,
        ))

    # Shared radial range covering both traces.
    radial_min = min(df4[i].min(), df5[i].min())
    radial_max = max(df4[i].max(), df5[i].max())

    Tables_Graphs.update_layout(
        polar=dict(
            radialaxis=dict(visible=True, range=[radial_min, radial_max]),
        ),
        title={
            'text': f"Tables_Graphs_{a}: Radarplot of {i}",
            'y': 0.9, 'x': 0.5,
            'xanchor': 'center', 'yanchor': 'top',
        },
        # The legend is what tells the Yes and No traces apart.
        showlegend=True,
    )

    Tables_Graphs.show()