In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

In [None]:
# Load the weatherAUS CSV from the Kaggle input directory and preview the first rows.
df = pd.read_csv('/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv')
df.head()

In [None]:
# A function that outputs information about the data
def df_info(df, name=None, flag = False):
    """Print a short report about a DataFrame: its size and missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.
    name : str, optional
        Label printed at the top of the report; omitted when None.
    flag : bool, default False
        When True, additionally print ``df.info()`` and ``df.describe()``.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    separator = '--------------------'
    if name is not None:
        print(f'df = {name}')
        print(separator)

    n_rows, n_cols = df.shape
    print(f'size df : {n_rows} x {n_cols} ')
    print(separator)

    # Count the NaNs of every column once instead of re-scanning the column
    # each time the value is needed.
    missing = df.isna().sum()
    if missing.sum() == 0:
        print('not missing values!')
    else:
        print('Missing values:')
        counter = 0  # number of columns with more than 20% missing values
        for col in df.columns:
            n_missing = missing[col]
            if n_missing:
                share = n_missing / n_rows
                print(col,'TYPE: ', df[col].dtypes, 'MISSING VALUES: ', n_missing,
                      f'({share:.2%})')
                if share > 0.2:
                    counter += 1
            else:
                print(col,'TYPE: ', df[col].dtypes, 'MISSING VALUES: ', n_missing,)
        if counter > 0:
            print(separator)
            print(f'Missing values >20% in {counter} features')
    if flag:
        print(separator)
        # DataFrame.info() writes its report itself and returns None, so
        # wrapping it in print() would append a stray "None" line.
        df.info()
        print(separator)
        print(df.describe())

In [None]:
# Report the shape and the per-column missing values of the freshly loaded data.
df_info(df, 'Rain in Australian')

# 1. Data processing

In [None]:
# Remove the four columns flagged as having about 40% missing values.
df = df.drop(columns=['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm'])

In [None]:
# Rows without a "RainTomorrow" label are removed: it is the value to predict.
df = df.dropna(subset=['RainTomorrow']).reset_index(drop=True)

# Impute the numeric features only. Taking the median of text columns raises
# a TypeError on pandas >= 2.0 (older versions silently skipped them), so the
# numeric columns are selected explicitly.
numeric_cols = df.select_dtypes(include='number').columns
# First use the median of the rows sharing the same 'Location' ...
df[numeric_cols] = df[numeric_cols].fillna(
    df.groupby('Location')[numeric_cols].transform('median'))
# ... then fall back to the overall median where a location has no value at all.
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

In [None]:
# Print the report again to see which columns still contain NaN at this point.
df_info(df, 'Rain in Australian')

In [None]:
# List of categorical (object-dtype) features that still contain NaN
features_obj_with_NaN = [col for col in df.columns
                         if df[col].dtypes == 'object' and df[col].isnull().values.any()]
# List of categorical features to encode. 'Date' and the target 'RainTomorrow'
# are excluded by name rather than by list position, so the result does not
# depend on the column order of the frame.
features_obj = [col for col in df.columns
                if df[col].dtypes == 'object' and col not in ('Date', 'RainTomorrow')]

In [None]:
# Fill the gaps of every categorical feature with its most frequent value
for column in features_obj_with_NaN:
    most_common = df[column].value_counts().idxmax()
    df[column] = df[column].fillna(most_common)

In [None]:
# Check: print the missing-value report once more after the categorical imputation.
df_info(df, 'Rain in Australian')

In [None]:
# Unbalanced data: bar chart with the number of samples in each class
class_counts = df['RainTomorrow'].value_counts()
ax = class_counts.plot.bar(color=['lightgreen', 'blue'])

plt.title('Quantity of each class in the dataset', fontsize=15)
plt.xticks(rotation=0, fontsize=14)
plt.yticks(fontsize=14)

# Write every count inside its bar. value_counts() sorts descending, so the
# tallest bar is the first (light-green) one and needs dark text; the other
# bars get white text. The tallest bar is detected from the data instead of
# comparing against a hard-coded count.
tallest = class_counts.max()
for bar in ax.patches:
    height = bar.get_height()
    if height == tallest:
        x_offset, text_color = 0.08, 'black'
    else:
        x_offset, text_color = 0.1, 'white'
    ax.text(bar.get_x() + x_offset, height - 10000, height, fontsize=16,
            color=text_color)

# Build the tick labels from the actual bar order so they cannot be swapped
# if the class frequencies ever change.
display_names = {'No': 'No rain', 'Yes': 'Rain!'}
ax.set_xticklabels([display_names.get(label, label) for label in class_counts.index])
plt.show()

In [None]:
# Date processing: parse 'Date', derive day / month / year columns from it,
# then drop the original column.
df = (
    df.assign(
        Date=lambda frame: pd.to_datetime(frame['Date']),
        day=lambda frame: frame['Date'].dt.day,
        month=lambda frame: frame['Date'].dt.month,
        year=lambda frame: frame['Date'].dt.year,
    )
    .drop(columns='Date')
)

In [None]:
# One-hot encode the categorical features and append the indicator columns
# to the remaining (non-categorical) columns.
df_cat = pd.get_dummies(df[features_obj])
df = pd.concat([df.drop(columns=features_obj), df_cat], axis=1)

In [None]:
# Separate the features from the target; the target becomes 1 for 'Yes', 0 otherwise
X = df.drop(columns='RainTomorrow')
y = df['RainTomorrow'].eq('Yes').astype('int64')

In [None]:
# Release the reference to the full frame now that X and y have been extracted.
del df

In [None]:
# Standardize features with StandardScaler.
# NOTE(review): the scaler is fitted on every row of X, i.e. before X has been
# divided into training and testing parts, so all rows contribute to the
# scaling statistics.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# Split X and y into training and testing sets:
# 35% of the rows go to the test set; random_state=17 fixes the shuffle.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=17)

# 2. ML-models

In [None]:
# A function that outputs information: "F1-score, ROC-AUC"
from sklearn.metrics import f1_score, accuracy_score, recall_score, classification_report, roc_auc_score  
def info_result(y_true, y_pred, model=None):
    """Print the ROC-AUC score and the classification report of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predictions, passed unchanged to ``roc_auc_score`` and
        ``classification_report``.
    model : str, optional
        Description printed as a header line; skipped when falsy.
    """
    auc_value = roc_auc_score(y_true, y_pred)
    report = classification_report(y_true, y_pred)
    header = [f'model :{model}'] if model else []
    for line in header + [f'ROC-AUC score: {auc_value:.2%}', report, '----']:
        print(line)
          

In [None]:
from sklearn.linear_model import SGDClassifier, LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
# Baseline linear classifiers. SGDClassifier shuffles the training data, so it
# gets a fixed random_state (17, the same seed as the train/test split) to
# make its scores reproducible between runs.
models = [SGDClassifier(random_state=17),
          LogisticRegression(max_iter=200, n_jobs=-1, class_weight="balanced"),
          RidgeClassifier()]
for model in models:
    # Fit on the training split, then report the metrics on the test split.
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    info_result(y_test, pred, str(model))

In [None]:
# Release the list holding the fitted baseline models.
del models

# 3. Random-forest

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier
# A fixed random_state makes the randomly built trees, and therefore the
# reported scores, identical on every run (17 is the seed used for the split).
model = RandomForestClassifier(random_state=17)
model.fit(X_train, y_train)
pred = model.predict(X_test)
info_result(y_test, pred, str(model))

# 4. Neural network (PyTorch)

In [None]:
#importing all libraries
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
import numpy as np 
import math
import random
import torch.nn as nn
# Fix the random state: seed Python's, NumPy's and PyTorch's (CPU and CUDA)
# generators with the same value.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(17)

In [None]:
# Checking device: cuda or cpu
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    # Query the GPU name only when a GPU exists: on a CPU-only machine
    # torch.cuda.get_device_name raises, which made the CPU fallback unreachable.
    print(torch.cuda.get_device_name(0))
device

In [None]:
# Convert the feature arrays and the label Series to float32 tensors
X_train, X_test = (torch.tensor(part, dtype=torch.float32) for part in (X_train, X_test))
y_train, y_test = (torch.tensor(part.values, dtype=torch.float32) for part in (y_train, y_test))
# Only the test features are moved to the selected device here.
X_test = X_test.to(device)

In [None]:
# Let's create RainDataset with DataLoader

class AustralianRainDataset(Dataset):
    """Map-style dataset pairing feature rows with their labels.

    Parameters
    ----------
    x : indexable with ``len``, optional
        Feature rows. Defaults to the notebook-level ``X_train``.
    y : indexable with ``len``, optional
        Labels aligned with ``x``. Defaults to the notebook-level ``y_train``.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` do not contain the same number of samples.
    """
    def __init__(self, x=None, y=None):
        # Falling back to the globals keeps `AustralianRainDataset()` working
        # as before; explicit arguments allow the class to wrap any other split.
        self.x = X_train if x is None else x
        self.y = y_train if y is None else y
        self.n_samples = len(self.x)
        if len(self.y) != self.n_samples:
            raise ValueError(
                f'x has {self.n_samples} samples but y has {len(self.y)}')

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Return the number of samples."""
        return self.n_samples

In [None]:
# Instantiate the dataset; called without arguments it wraps X_train / y_train.
dataset = AustralianRainDataset()
# first_data = dataset[0]
# x_sample, y_sample = first_data

In [None]:
# Example
# dataloader = DataLoader(dataset=dataset, batch_size = 4, shuffle= True)
# dataiter = iter(dataloader)
# data = next(dataiter)
# features,labels = data 
# print(features,labels)

In [None]:
#Create NetWork 
class NeuralNetwork(nn.Module):
    """Fully connected classifier with two hidden ReLU layers.

    Parameters
    ----------
    first_neurons : int
        Number of input features.
    n_hidden_neurons : int
        Width of both hidden layers.
    n_classes : int, default 2
        Number of output scores per sample. The default keeps the original
        two-class output.
    """
    def __init__(self, first_neurons, n_hidden_neurons, n_classes=2):
        super().__init__()
        self.fc1 = nn.Linear(first_neurons, n_hidden_neurons)
        self.activ1 = nn.ReLU()
        self.fc2 = nn.Linear(n_hidden_neurons, n_hidden_neurons)
        self.activ2 = nn.ReLU()
        self.fc3 = nn.Linear(n_hidden_neurons, n_classes)

    def forward(self, x):
        """Return the raw output of the last linear layer (no softmax applied)."""
        x = self.activ1(self.fc1(x))
        x = self.activ2(self.fc2(x))
        return self.fc3(x)


In [None]:
# Size the input layer from the data instead of hard-coding the number of
# feature columns, so this cell keeps working if the encoded columns change.
n_features = X_train.shape[1]
# The hidden layers keep their width of 114 neurons.
myNet = NeuralNetwork(n_features, 114).to(device)
loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(myNet.parameters(), lr=1e-4)

In [None]:
from sklearn.metrics import *
# Per-epoch test metrics, filled in by training()
acc_list = []
f_1_list = []
auc_list = []

def training(model, batch_size, epochs, loss, optimizer):
    """Train ``model`` and record its test-set metrics after every epoch.

    Reads the notebook-level ``dataset``, ``device``, ``X_test`` and ``y_test``.
    After each epoch the accuracy, F1 and ROC-AUC of the test predictions are
    appended to ``acc_list``, ``f_1_list`` and ``auc_list``; every fifth epoch
    they are also printed.

    Parameters
    ----------
    model : torch.nn.Module
        Network to optimise; must already be on ``device``.
    batch_size : int
        Mini-batch size used by the DataLoader.
    epochs : int
        Number of passes over ``dataset``.
    loss : callable
        Loss taking ``(predictions, integer targets)``.
    optimizer : torch.optim.Optimizer
        Optimizer bound to the parameters of ``model``.
    """
    # The loader does not change between epochs; with shuffle=True it still
    # yields a fresh permutation each time it is iterated.
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)
    for epoch in range(epochs):
        model.train()
        for X_batch, y_batch in dataloader:
            optimizer.zero_grad()
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            # Call the module itself rather than .forward() so hooks run.
            preds = model(X_batch)
            # The labels are stored as floats; the loss is given integer targets.
            loss_value = loss(preds, y_batch.long())
            loss_value.backward()
            optimizer.step()

        # Evaluation: inference mode for the layers and no autograd graph, so
        # scoring the whole test set does not hold gradient buffers in memory.
        model.eval()
        with torch.no_grad():
            test_preds = model(X_test)
        pred = test_preds.argmax(dim=1).to('cpu')
        f1 = f1_score(y_test, pred)
        auc_score = roc_auc_score(y_test, pred)
        acc = accuracy_score(y_test, pred)
        acc_list.append(acc)
        f_1_list.append(f1)
        auc_list.append(auc_score)
        if (epoch + 1) % 5 == 0:
            print(f'epoch: {epoch + 1}, acc:{acc:.2%}, f1:{f1:.2%}, auc: {auc_score:.2%}')


In [None]:
%%time 
# Train for 100 epochs with mini-batches of 400 samples.
epoch = 100
batch_size = 400
training(myNet, batch_size, epoch, loss, optimizer)

In [None]:
# Best value of each metric over all recorded epochs
acc_max, f1_max, auc_max = max(acc_list), max(f_1_list), max(auc_list)
# Convert each history to percent, truncated to two decimals.
# NOTE(review): the three names are rebound from lists to NumPy arrays here,
# so running this cell twice rescales the values again.
acc_list = np.floor(np.array(acc_list) * 10000) / 100
f_1_list = np.floor(np.array(f_1_list) * 10000) / 100
auc_list = np.floor(np.array(auc_list) * 10000) / 100

In [None]:
plt.figure(figsize=(15, 5))

# One panel per metric: (title, per-epoch values, line colour, best value)
panels = [
    ('Accuracy', acc_list, 'red', acc_max),
    ('AUC_score', auc_list, 'darkblue', auc_max),
    ('F1-score', f_1_list, 'darkorange', f1_max),
]
for position, (title, history, colour, best) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.title(title, fontsize=15)
    plt.grid(True)
    plt.xlabel('epoch', fontsize=14)
    plt.plot(history, color=colour, label=f'max_value:{best:.2%}')
    plt.legend()