In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load weatherAUS.csv from the Kaggle input directory and show its (rows, columns) shape.
csv_path = '/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv'
df = pd.read_csv(csv_path)

df.shape

In [None]:
# Preview the first rows, transposed so that each column is shown as a row.
df.head().T

In [None]:
# Summary of the dataframe as loaded (columns, dtypes, non-null counts).
df.info()

In [None]:
# Keep an independent copy of the dataframe as loaded, before any transformation.
# NOTE(review): df2 is not referenced again in the cells visible here — confirm it is still needed.
df2 = df.copy()

## Transformar coluna Date

In [None]:
# Parse the 'YYYY-MM-DD' strings in Date into datetime values.
df = df.assign(Date=pd.to_datetime(df['Date'], format='%Y-%m-%d'))

In [None]:
# Re-inspect the dtypes after the Date conversion above.
df.info()

# Feature Engineering

In [None]:
# Derive calendar features from Date; the new columns are appended in this order.
date_parts = df['Date'].dt
df = df.assign(
    Year=date_parts.year,
    Month=date_parts.month,
    Day=date_parts.day,
    Dayofweek=date_parts.dayofweek,
)

In [None]:
# Preview the last rows (transposed) to check the newly added date columns.
df.tail().T

In [None]:
# Discard every row whose target (RainTomorrow) is missing.
df = df.dropna(subset=['RainTomorrow'])

In [None]:
# Distinct values present in RainToday (missing values included).
df['RainToday'].unique()

In [None]:
# Number of observations per RainToday category.
df['RainToday'].value_counts()

In [None]:

import matplotlib.pyplot as plt

In [None]:
# Bar chart: number of observations with and without rain on the day (RainToday).
plt.style.use('fivethirtyeight')

# Raw category -> label shown on the x axis. Pinning the category order here, instead
# of relying on value_counts() returning the most frequent category first, guarantees
# that each bar is drawn under its own label; a category with no rows is shown as 0.
rotulos = {'No': 'Não', 'Yes': 'Sim'}
chuvahj = df['RainToday'].value_counts().reindex(list(rotulos), fill_value=0)

fig = plt.figure(figsize=(15,9))
ax = fig.add_axes([0,0,1,1])
x = np.arange(len(chuvahj))  # one bar position per category
ax.bar(x, chuvahj.to_numpy(), color='firebrick', edgecolor='k')

ax.set_xticks(x)
ax.set_xticklabels(list(rotulos.values()), fontsize=15, fontweight= 'ultralight', fontname='Courier New')
ax.set_title('Chuva', fontsize=30, fontweight= 'ultralight', fontname='Courier New', pad= 15)
plt.show()


In [None]:
# Keep only the observations where it rained on the day itself, then display them.
rained_today = df['RainToday'] == 'Yes'
chuva = df[rained_today]
chuva

In [None]:
# Distinct values of RainToday in the full dataframe (same check as an earlier cell).
df['RainToday'].unique()

In [None]:
# Count the rainy observations for each value of Dayofweek.
dia_chuva = chuva.groupby('Dayofweek')['RainToday'].count()

# Draw the line chart and keep the returned Axes so it can be configured directly.
eixo = dia_chuva.plot(figsize=(12,7), color='firebrick', marker='o', linestyle='dashed')

# One x tick per weekday code (0-6), labelled with the Portuguese abbreviations.
ticks = list(range(7))
labels = "Seg Ter Qua Qui Sex Sáb Dom".split()
eixo.set_xticks(ticks)
eixo.set_xticklabels(labels)

# Title and axis labels; the trailing semicolon suppresses the text repr in the output.
eixo.set_title('Dias da Semana que chove', fontsize=20, pad=20)
eixo.set_xlabel('Dias da Semana', fontsize=15, labelpad=15)
eixo.set_ylabel('Número de dias', fontsize=15, labelpad=15);

# Limpeza e preparação de dados p/ a modelagem

In [None]:
# Check dtypes and non-null counts before cleaning the data for modelling.
df.info()

In [None]:
# Remove the raw Date (already expanded into Year/Month/Day/Dayofweek) and four
# measurement columns that are not used for modelling.
dropped_columns = ['Date', 'Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm']
df = df.drop(columns=dropped_columns)

In [None]:
# Split the columns by dtype: object columns are treated as categorical,
# every other column as numeric.

cat = df.select_dtypes(include='object')
num = df.select_dtypes(exclude='object')

In [None]:
# Fill the missing values of every categorical column with that column's mode.
#
# The fill is done on the whole frame and the result is assigned back to `cat`.
# Calling fillna(..., inplace=True) on `cat[col]` is chained assignment: it acts on a
# temporary Series, which pandas warns about and which leaves `cat` unchanged when
# Copy-on-Write is enabled.
# NOTE(review): the modes are computed before the train/test split made in a later
# cell, so rows that end up in the test set influence the imputed values.

moda = cat.mode().iloc[0]  # first mode of each column, indexed by column name
cat = cat.fillna(moda)

In [None]:
# Fill the missing values of every numeric column with that column's median.
#
# The fill is done on the whole frame and the result is assigned back to `num`.
# Calling fillna(..., inplace=True) on `num[col]` is chained assignment: it acts on a
# temporary Series, which pandas warns about and which leaves `num` unchanged when
# Copy-on-Write is enabled.
# NOTE(review): the medians are computed before the train/test split made in a later
# cell, so rows that end up in the test set influence the imputed values.

mediana = num.median()  # per-column median, indexed by column name
num = num.fillna(mediana)

In [None]:
# Check the non-null counts of the categorical columns after imputation.
cat.info()

In [None]:
# Check the non-null counts of the numeric columns after imputation.
num.info()

In [None]:
# Distinct values of Location.
cat['Location'].unique()

In [None]:
# Distinct values of WindGustDir.
cat['WindGustDir'].unique()

In [None]:
# Distinct values of WindDir9am.
cat['WindDir9am'].unique()

In [None]:
# Distinct values of WindDir3pm.
cat['WindDir3pm'].unique()

In [None]:
# Convert every categorical column (the target RainTomorrow included) to integer codes.


from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
# fit_transform returns a bare NumPy array, which carries no row labels. Wrapping it
# in a Series that reuses the column's own index keeps `cat` aligned with `num`, so the
# column-wise concat in a later cell pairs each row with itself. This matters because
# dropping rows earlier left gaps in the index.
# The single encoder is refit on each column in turn, so after this cell `label` only
# holds the mapping of the last column processed.
cat = cat.astype(str).apply(
    lambda coluna: pd.Series(label.fit_transform(coluna), index=coluna.index)
)

In [None]:
# Rebuild a single dataframe: encoded categorical columns first, numeric columns after.
# The concat is column-wise, so rows are matched by index label.

frames = (cat, num)
df = pd.concat(frames, axis='columns')

In [None]:
# Preview the first rows (transposed) of the dataframe that will feed the model.
df.head().T

# Treinar modelo



In [None]:
# Model training: partition the dataframe into train / validation / test sets

# Bring train_test_split into scope
from sklearn.model_selection import train_test_split

# Both splits hold out 20% of their input and share the same seed.
split_opts = {'test_size': 0.20, 'random_state': 42}

# First split: hold out the test set
train_valid, test = train_test_split(df, **split_opts)

# Second split: take the validation set out of the remaining rows
train, valid = train_test_split(train_valid, **split_opts)

# Shapes of the three partitions, in train / valid / test order
tuple(part.shape for part in (train, valid, test))

In [None]:
# Input features: every column of df except the target, in their original order.
feats = list(filter(lambda name: name != 'RainTomorrow', df.columns))

feats

In [None]:
# Set up the classifier

# Bring the model class into scope
from sklearn.ensemble import RandomForestClassifier

# Random forest with 200 trees and a fixed seed.
rf_params = {'n_estimators': 200, 'random_state': 42}
rf = RandomForestClassifier(**rf_params)

In [None]:
# Fit the forest on the training partition (feature columns vs. RainTomorrow).
X_train = train[feats]
y_train = train['RainTomorrow']
rf.fit(X_train, y_train)

In [None]:
# Predict RainTomorrow for the validation partition and display the predictions.
X_valid = valid[feats]
preds_val = rf.predict(X_valid)

preds_val

In [None]:
# Avaliando o desempenho do modelo

# Importando a métrica
from sklearn.metrics import accuracy_score

In [None]:
# Accuracy of the predictions on the validation partition.
accuracy_score(y_true=valid['RainTomorrow'], y_pred=preds_val)

In [None]:
# Predict on the held-out test partition and measure the accuracy there.
X_test = test[feats]
preds_test = rf.predict(X_test)

accuracy_score(y_true=test['RainTomorrow'], y_pred=preds_test)

In [None]:
# Class proportions of RainTomorrow over the full dataframe. The share of the most
# frequent class is the accuracy a constant prediction would reach, which gives a
# baseline for reading the scores above.
df['RainTomorrow'].value_counts(normalize=True)

In [None]:
# importando a biblioteca para plotar o gráfico de Matriz de Confusão
import scikitplot as skplt

# Confusion matrix — validation data (true labels vs. the forest's predictions)
skplt.metrics.plot_confusion_matrix(valid['RainTomorrow'], preds_val)

In [None]:

# Confusion matrix — test data (true labels vs. the forest's predictions)
skplt.metrics.plot_confusion_matrix(test['RainTomorrow'], preds_test)