In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

%matplotlib inline
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
from sklearn.linear_model import LogisticRegression

# Seed NumPy's global random generator so the random shuffles used later are repeatable
np.random.seed(42)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Rain in Australia

- Author: **Antoni Kędzierski**
- Method: **Logistic Regression**
- Date: **15.03.2021**

# 1. Load and prepare data

In [None]:
# Load the weather observations CSV from the Kaggle input directory into a DataFrame
rainfall = pd.read_csv("/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv")

In [None]:
# Print a summary of all columns (dtype and non-null count of each)
rainfall.info()

In [None]:
# Count the missing values (NaN) in each column
rainfall.isna().sum()

In [None]:
# Rows without a RainTomorrow (y) value cannot be used for training, so remove them.
# .copy() makes the filtered frame independent of the original one, so that the column
# assignments done in later cells do not operate on a slice (SettingWithCopyWarning).
rainfall = rainfall[rainfall.RainTomorrow.notna()].copy()

# Remaining missing values per column
rainfall.isna().sum()

In [None]:
# The set of missing columns differs between locations: count NaNs per column for each location
missing_flags = rainfall.drop(columns="Location").isna()
missing_flags.groupby(rainfall["Location"]).sum().reset_index()

In [None]:
# Parse the Date column into pandas datetime values
rainfall["Date"] = pd.to_datetime(rainfall["Date"])

In [None]:
# Convert dates to season indicator columns (southern hemisphere).
# Each season is a range of day-of-year numbers; days outside all three ranges
# get 0 in every indicator.
fall = range(80, 172)
winter = range(172, 264)
spring = range(264, 355)

day_of_year = rainfall["Date"].dt.dayofyear
for season_column, season_days in (("Spring", spring), ("Winter", winter), ("Fall", fall)):
    # 1 when the observation's day of year falls in the season's range, else 0
    rainfall[season_column] = day_of_year.isin(season_days).astype("int64")

As can be seen above, different variables are measured in different cities. We should therefore prepare a separate model for every city, using only the variables that were actually measured there.

We assume that a variable is worth considering if more than 90% of a city's records contain it.

In [None]:
# Minimum share of a city's records that must contain a variable for it to be used
MIN_COVERAGE = 0.9

# Count the non-missing observations of every column for each city
counts_by_city = rainfall.groupby("Location").agg('count').reset_index()

# Express every column after Location and Date as a share of the city's Date count.
# The shares are built as a new frame and concatenated, rather than written back into
# the integer count columns, so no implicit int -> float upcast of existing columns is needed.
coverage = counts_by_city.iloc[:, 2:].div(counts_by_city["Date"], axis=0)
n_obs_by_city = pd.concat([counts_by_city.iloc[:, :2], coverage], axis=1)

# Get column names for each city: keep the columns whose share exceeds MIN_COVERAGE
columns_by_city = {}
is_covered = coverage.gt(MIN_COVERAGE).to_numpy()
for city, covered in zip(n_obs_by_city["Location"], is_covered):
    columns_by_city[city] = list(coverage.columns[covered])

    # We always want to have 'RainToday' variable in model
    if "RainToday" not in columns_by_city[city]:
        columns_by_city[city].append("RainToday")

In [None]:
# Convert "Yes" and "No" into 0-1.
# An explicit mapping is used instead of "everything that is not 'No' becomes 1":
#   - missing values stay missing (NaN) instead of silently being encoded as rain,
#   - already encoded 0/1 values map to themselves, so re-running this cell is harmless.
YES_NO_TO_INT = {"No": 0, "Yes": 1, 0: 0, 1: 1}
rainfall["RainToday"] = rainfall["RainToday"].map(YES_NO_TO_INT)
rainfall["RainTomorrow"] = rainfall["RainTomorrow"].map(YES_NO_TO_INT)

In [None]:
# Some columns are qualitative (categorical); they have to be encoded as 0-1 vectors or handled another way.
# Separate the numerical columns from the rest:
numerical_variables = ["MinTemp", "MaxTemp", "Rainfall", "Evaporation", "Sunshine", "WindGustSpeed",
                       "WindSpeed9am", "WindSpeed3pm", "Humidity9am", "Humidity3pm", "Pressure9am",
                       "Pressure3pm", "Cloud9am", "Cloud3pm", "Temp9am", "Temp3pm"]

In [None]:
# Create object containing all data:
class RainfallDataObject:
    """Per-city rainfall data with engineered features and a random train/test split.

    For every city in ``columns_by_city`` the constructor:
      1. selects ``Date``, ``Location`` and the city's columns and drops rows with any NaN,
      2. randomly splits the row positions into a test and a train part,
      3. adds afternoon-minus-morning difference columns where both inputs are available,
      4. adds wind X/Y components where both direction and speed are available.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Location`` column, a ``Date`` column and the columns listed
        in ``columns_by_city``.
    columns_by_city : dict
        Maps a city name to the list of column names to use for that city.
    split_ratio : float, default 0.25
        Fraction of each city's rows assigned to the test set.

    Raises
    ------
    ValueError
        If ``split_ratio`` is not within [0, 1].
    """

    # Compass direction -> angle in degrees, measured counter-clockwise from east
    WIND_ANGLE = {'E': 0, 'ENE': 22.5, 'NE': 45, 'NNE': 67.5,
                  'N': 90, 'NNW': 112.5, 'NW': 135, 'WNW': 157.5,
                  'W': 180, 'WSW': 202.5, 'SW': 225, 'SSW': 247.5,
                  'S': 270, 'SSE': 292.5, 'SE': 315, 'ESE': 337.5}

    # (new column, 3pm column, 9am column): new = 3pm - 9am
    DIFF_FEATURES = (
        ("HumidityDiff", "Humidity3pm", "Humidity9am"),
        ("PressureDiff", "Pressure3pm", "Pressure9am"),
        ("CloudDiff", "Cloud3pm", "Cloud9am"),
        ("TempDiff", "Temp3pm", "Temp9am"),
    )

    # (direction column, speed column, new X column, new Y column)
    WIND_FEATURES = (
        ("WindGustDir", "WindGustSpeed", "WindGustSpeedX", "WindGustSpeedY"),
        ("WindDir9am", "WindSpeed9am", "WindSpeedX9am", "WindSpeedY9am"),
        ("WindDir3pm", "WindSpeed3pm", "WindSpeedX3pm", "WindSpeedY3pm"),
    )

    def __init__(self, data, columns_by_city, split_ratio=0.25):
        if not 0 <= split_ratio <= 1:
            raise ValueError(f"split_ratio must be within [0, 1], got {split_ratio}")

        self.columns_by_city = columns_by_city
        self.cities = list(columns_by_city.keys())
        self.split_ratio = split_ratio
        self.data_by_city = {}
        self.train_indices = {}
        self.test_indices = {}
        # Columns that are never passed to the model as features
        self.dropdata = ["Date", "Location", "WindGustDir", "WindDir9am", "WindDir3pm", "RainTomorrow"]

        for city in self.cities:
            city_columns = columns_by_city[city]

            # Keep only this city's rows and columns; rows with any missing value are dropped
            selected = ["Date", "Location"] + city_columns
            frame = data.loc[data.Location == city, selected].dropna()

            # Prepare train & test sets as integer row positions (used with .iloc)
            n_rows = frame.shape[0]
            positions = np.random.permutation(n_rows)
            n_test = int(split_ratio * n_rows)
            self.test_indices[city] = positions[:n_test]
            self.train_indices[city] = positions[n_test:]

            # Add difference variables (3pm value minus 9am value)
            for diff_column, pm_column, am_column in self.DIFF_FEATURES:
                if am_column in city_columns and pm_column in city_columns:
                    frame[diff_column] = frame[pm_column] - frame[am_column]

            # Convert wind to 2D vector using (cos(x), sin(x)) representation scaled by speed
            for dir_column, speed_column, x_column, y_column in self.WIND_FEATURES:
                if dir_column in city_columns and speed_column in city_columns:
                    # Unknown direction labels raise KeyError rather than becoming NaN
                    angle = np.deg2rad(frame[dir_column].map(lambda d: self.WIND_ANGLE[d]))
                    frame[x_column] = np.cos(angle) * frame[speed_column]
                    frame[y_column] = np.sin(angle) * frame[speed_column]

            self.data_by_city[city] = frame

    def _subset(self, city, positions):
        """Return (features, target) for the given row positions of a city."""
        frame = self.data_by_city[city]
        # Only drop the non-feature columns that this city actually has
        droplist = list(set(self.dropdata) & set(frame.columns))
        features = frame.iloc[positions, :].drop(droplist, axis=1, inplace=False)
        target = frame.RainTomorrow.iloc[positions]
        return features, target

    def get_city_data(self, city):
        """Return the full prepared DataFrame of a city."""
        return self.data_by_city[city]

    def get_city_varnames(self, city):
        """Return the column names of a city's prepared DataFrame as a list."""
        return self.data_by_city[city].columns.tolist()

    def train_set(self, city):
        """Return (X_train, y_train) for a city; y is the RainTomorrow column."""
        return self._subset(city, self.train_indices[city])

    def test_set(self, city):
        """Return (X_test, y_test) for a city; y is the RainTomorrow column."""
        return self._subset(city, self.test_indices[city])

    def nrow(self, city):
        """Return the number of rows of a city's prepared DataFrame."""
        return self.data_by_city[city].shape[0]

    def ncol(self, city):
        """Return the number of columns of a city's prepared DataFrame."""
        return self.data_by_city[city].shape[1]

    def nrow_train(self, city):
        """Return the number of rows in a city's train set."""
        return len(self.train_indices[city])

    def nrow_test(self, city):
        """Return the number of rows in a city's test set."""
        return len(self.test_indices[city])

In [None]:
# Build the per-city data container (default test share is used)
rainfallData = RainfallDataObject(data=rainfall, columns_by_city=columns_by_city)

# 2. Train models for each city

In [None]:
def accuracy(pred, real):
    """Return the fraction of predictions that equal the true labels.

    The two inputs are compared element by element in positional order, so any
    array-like works (lists, NumPy arrays, pandas Series with any index).

    Parameters
    ----------
    pred : array-like
        Predicted labels.
    real : array-like
        True labels, same shape as ``pred``.

    Returns
    -------
    numpy.float64
        Share of positions where ``pred`` equals ``real``.

    Raises
    ------
    ValueError
        If the inputs have different shapes or are empty.
    """
    # np.asarray discards any pandas index so the comparison is purely positional
    pred_values = np.asarray(pred)
    real_values = np.asarray(real)
    if pred_values.shape != real_values.shape:
        raise ValueError(
            f"pred and real must have the same shape, got {pred_values.shape} and {real_values.shape}"
        )
    if real_values.size == 0:
        raise ValueError("accuracy is undefined for empty input")
    return np.mean(pred_values == real_values)

In [None]:
# Fitted model and test-set accuracy, both keyed by city name
models = {}
acc = {}

# Fit one logistic regression per city and score it on that city's test set
for city in n_obs_by_city.Location:
    X_train, y_train = rainfallData.train_set(city)
    X_test, y_test = rainfallData.test_set(city)

    classifier = LogisticRegression(max_iter=10000)
    models[city] = classifier
    classifier.fit(X_train, y_train)

    pred = classifier.predict(X_test)
    acc[city] = accuracy(pred, y_test)

In [None]:
# Unweighted mean of the per-city accuracies
average_accuracy = np.mean(list(acc.values()))
print(f"Average accuracy is: {average_accuracy}")