## Descrição
Este dataset consiste na observação de 10 anos de [dados climáticos diários](http://www.bom.gov.au/climate/data) de diversos locais da Austrália, com o objetivo de prever se irá chover ou não no dia seguinte.

## Importando bibliotecas necessárias

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelBinarizer, MinMaxScaler, OneHotEncoder

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

## Importando os dados

In [None]:
# Load the weatherAUS CSV into the working DataFrame.
data = pd.read_csv('../input/weather-dataset-rattle-package/weatherAUS.csv')
# Preview the first rows.
data.head()

## Explicando as variáveis

[Fonte](http://www.bom.gov.au/climate/dwo/IDCJDW0000.shtml)

1. Date = Data
2. Location = Cidade da Austrália
3. MinTemp = Temperatura mínima (graus Celsius)
4. MaxTemp = Temperatura máxima (graus Celsius)
5. Rainfall = Precipitação (mm)
6. Evaporation = ["Class A" pan evaporation](https://en.wikipedia.org/wiki/Pan_evaporation), padrão americano que mede a evaporação de água (mm)
7. Sunshine = Horas de luz solar
8. WindGustDir = Direção da rajada de vento mais forte (16 pontos da rosa dos ventos)
9. WindGustSpeed = Velocidade da rajada de vento mais forte (km/h)
10. WindDir9am = Direção do vento às 9am (pontos da rosa dos ventos)
11. WindDir3pm = Direção do vento às 3pm (pontos da rosa dos ventos)
12. WindSpeed9am = Velocidade do vento às 9am (km/h)
13. WindSpeed3pm = Velocidade do vento às 3pm (km/h)
14. Humidity9am = Umidade Relativa às 9am (%)
15. Humidity3pm = Umidade Relativa às 3pm (%)
16. Pressure9am = Pressão Atmosférica às 9am (hPa)
17. Pressure3pm = Pressão Atmosférica às 3pm (hPa)
18. Cloud9am = Fração nublada do céu às 9am (oitavos)
19. Cloud3pm = Fração nublada do céu às 3pm (oitavos)
20. Temp9am = Temperatura às 9am (graus Celsius)
21. Temp3pm = Temperatura às 3pm (graus Celsius)
22. RainToday = Se choveu ou não no dia
23. RainTomorrow = Variável alvo




## Número de registros e variáveis

In [None]:
# Rows are records; every column except the target counts as a variable.
n_records, n_columns = data.shape
print('Número de registros =',n_records)
print('Número de variáveis =',n_columns-1)

## Estatísticas básicas

In [None]:
# Summary statistics of the numeric columns.
data.describe()

## Porcentagem de valores ausentes

In [None]:
# Share of missing values per column, expressed as a percentage.
data.isnull().mean()*100

### Explorando Variáveis

### Variáveis categóricas

In [None]:
# Collect the names of all object-typed (text) columns and summarise them.
is_object_col = data.dtypes == "object"
cathegorical_variables = is_object_col[is_object_col].index.tolist()
data[cathegorical_variables].describe()

1. Location: 49 valores únicos (49 cidades)
2. Wind: Pontos cardeais, colaterais e subcolaterais

In [None]:
# Number of rows recorded for each distinct date.
print(data['Date'].value_counts())

### Variáveis numéricas

In [None]:
# Collect the names of all float64 columns; reused later for plots and imputation.
is_float_col = data.dtypes == "float64"
numeric_variables = is_float_col[is_float_col].index.tolist()
print(numeric_variables)

## Visualização

In [None]:
# Class balance of the target variable.
target_ax = sns.countplot(x='RainTomorrow', data=data)
target_ax.figure.suptitle('Variável Alvo', fontsize=20)

In [None]:
# RainToday counts split by the RainTomorrow outcome.
plt.figure(figsize=(8, 4))
sns.countplot(
    data=data,
    x='RainToday',
    hue='RainTomorrow',
    palette="prism",
    edgecolor=(.23, .78, .5),
    linewidth=2,
)

Muitos dos casos em que choveu no dia, choveu no dia seguinte (aproximadamente 50%)

In [None]:
# One count plot per categorical column, laid out row by row on a 2x2 grid.
fig, axs = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Variáveis Categóricas', fontsize=32)

categorical_panels = ['RainToday', 'WindGustDir', 'WindDir9am', 'WindDir3pm']
for panel, column in zip(axs.flat, categorical_panels):
    sns.countplot(ax=panel, x=data[column])

### Box-Plots

In [None]:
# Letter-value (boxen) plot of every numeric column on a shared axis.
plt.figure(figsize=(20, 10))
box_data = data[numeric_variables]
sns.boxenplot(color='#729BED', data=box_data)
plt.xticks(rotation=90)
plt.show()

### Histogramas
**Escala log aplicada em Rainfall, Evaporation, Sunshine, WindSpeed9am, Cloud9am e Cloud3pm**

In [None]:
# Histogram grid: one panel per numeric column, filled row by row.
fig, axs = plt.subplots(4, 4, figsize=(16, 12))

hist_columns = [
    'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
    'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
    'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm',
    'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm',
]
# Columns drawn with a logarithmic count axis.
log_scaled = {'Rainfall', 'Evaporation', 'Sunshine',
              'WindSpeed9am', 'Cloud9am', 'Cloud3pm'}

for panel, column in zip(axs.flat, hist_columns):
    panel.hist(data[column], log=column in log_scaled, color='#729BED')
    panel.set_title(column)

fig.subplots_adjust(left=0.08, right=0.98, bottom=0.05, top=0.9,
                    hspace=0.4, wspace=0.3)

### Matriz de correlação

In [None]:
# Correlate numeric columns only: the frame still holds text columns
# (Date, Location, wind directions, RainToday/RainTomorrow) and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
corr = data.select_dtypes(include='number').corr()
plt.subplots(figsize=(16,12))
sns.heatmap(corr, annot=True, cmap = 'RdBu', vmin=-1)

# Processando dados

### Date
Separando Date em Year, Month e Day e removendo Date.

In [None]:
# Parse Date, expand it into Year / Month / Day columns, then drop it.
data['Date'] = pd.to_datetime(data['Date'])

for part in ('year', 'month', 'day'):
    new_column = part.capitalize()
    data[new_column] = getattr(data['Date'].dt, part)
    print(data[new_column].head())

data.drop(columns='Date', inplace=True)
# Date no longer exists, so it must leave the categorical column list too.
cathegorical_variables.remove('Date')

In [None]:
# Check the frame after replacing Date with Year / Month / Day.
data.head()

### Substituindo valores ausentes
Estudar melhores formas de imputação

#### Variáveis Categóricas

Substituindo valores ausentes nas variáveis categóricas pela moda

In [None]:
# Count of missing values in each categorical column.
data[cathegorical_variables].isnull().sum()

In [None]:
# Categorical columns that will be imputed in the next cell.
null_cathegorical_variables = ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow']

In [None]:
# Fill missing categorical values with each column's most frequent value.
# Assign the result back instead of calling fillna(inplace=True) on
# data[col]: the chained in-place form is deprecated and does not modify
# `data` under pandas copy-on-write.
# NOTE(review): this list includes RainTomorrow, so missing target labels are
# replaced by the majority class; consider dropping those rows instead.
for col in null_cathegorical_variables:
    data[col] = data[col].fillna(data[col].mode()[0])

data.isnull().sum()

#### Variáveis Numéricas
Substituindo valores ausentes em variáveis numéricas pela mediana, resistente a outliers

In [None]:
# Fill missing numeric values with each column's median.
# Assign the result back instead of calling fillna(inplace=True) on
# data[col]: the chained in-place form is deprecated and does not modify
# `data` under pandas copy-on-write.
for col in numeric_variables:
    data[col] = data[col].fillna(data[col].median())

data.isnull().sum()

## Codificação de variáveis categóricas

### Variáveis Binárias

In [None]:
# Encode the two binary text columns as 0/1. The binarizer is refit per
# column, so after the loop `lb` holds the RainTomorrow fit.
lb = LabelBinarizer()
for binary_col in ('RainToday', 'RainTomorrow'):
    data[binary_col] = lb.fit_transform(data[binary_col])

### One-Hot Encoding

In [None]:
# One-hot encode the three wind-direction columns. `prefix=col` names each
# indicator "<column>_<direction>" directly, replacing the previous approach
# of concatenating bare labels ('E', 'N', ...) and renaming them through a
# hard-coded dict, which would also rename any unrelated column sharing one
# of those labels and silently skip any category missing from the dict.
wind_direction_cols = ['WindGustDir', 'WindDir9am', 'WindDir3pm']
wind_dummies = [pd.get_dummies(data[col], prefix=col) for col in wind_direction_cols]
data = pd.concat([data, *wind_dummies], axis=1)

for col in data.columns:
    print(col)

### Remove WindGustDir, WindDir9am and WindDir3pm

In [None]:
# The original text columns are redundant once their indicator columns exist.
data = data.drop(['WindGustDir', 'WindDir9am', 'WindDir3pm'], axis=1)

## Remoção de outliers

In [None]:
# Summary statistics after encoding, used to look for implausible values.
data.describe()

### Cloud3pm e Cloud9am
Alguns valores = 9.0, o que é incoerente, visto que os valores de Cloud são em oitavos. 9/8 indicaria mais de 100% do céu nublado.

In [None]:
def max_value(df3, variable, top):
    """Return ``df3[variable]`` as an array with values above ``top`` replaced by ``top``.

    Values that are not greater than ``top`` (including NaN) pass through
    unchanged. The input frame is not modified.
    """
    column = df3[variable]
    exceeds_cap = column > top
    return np.where(exceeds_cap, top, column)

# Cap both cloud-cover columns at 8.0 (cover is measured in eighths).
data['Cloud3pm'] = max_value(data, 'Cloud3pm', 8.0)
data['Cloud9am'] = max_value(data, 'Cloud9am', 8.0)
# Confirm the maxima are now 8.0.
data.describe()

### Escalando Variáveis

In [None]:
# Min-max scale every column except the location label and the date parts.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so test-set ranges leak into the features; fitting on the
# training rows only would avoid that.
scaler = MinMaxScaler()
not_scalable_cols = ['Location', 'Day', 'Month', 'Year']

to_scale = data.drop(columns=not_scalable_cols)
scaled_cols = to_scale.columns

# Keep data's index on the rebuilt frame: pd.concat(axis=1) aligns on index
# labels, so a fresh RangeIndex would misalign rows if `data` had been
# filtered or re-indexed beforehand.
scaled_data = pd.DataFrame(
    scaler.fit_transform(to_scale),
    columns=scaled_cols,
    index=data.index,
)
data = pd.concat([scaled_data, data[not_scalable_cols]], axis=1)

In [None]:
# Summary statistics after scaling.
data.describe()

## BoxPlot

In [None]:
# Boxen plot of the numeric columns after scaling.
plt.figure(figsize=(20, 5))
box_data = data[numeric_variables]
sns.boxenplot(color='#729BED', data=box_data)
plt.xticks(rotation=90)
plt.show()

## Ocorrência de chuva por ano

## Selecionando localidades
Como temos dados de 49 cidades e a Austrália é muito extensa geograficamente, selecionaremos 3 cidades próximas umas das outras em busca de melhores previsões.

In [None]:
# Report the available locations, then keep only the three chosen cities.
location_labels = data['Location'].unique()
print('Location contains ', len(location_labels),'labels')
print(data['Location'].value_counts())

mainLocations = ['Sydney','Canberra','Newcastle']
in_main_locations = data["Location"].isin(mainLocations)
mainData = data[in_main_locations]
mainData.shape

Dentre as 5 localidades com mais medidas, escolhemos **Sydney**, **Canberra** e **Newcastle** por serem próximas geograficamente.
Agora temos 9819 registros.

In [None]:
# Yearly counts of rainy / dry next-day outcomes for the selected cities.
sns.set_style('darkgrid')
plt.figure(figsize=[15,6])
ax = sns.countplot(
    data=mainData,
    x='Year',
    hue='RainTomorrow',
    palette="Accent_r",
    edgecolor=(0,0,0),
    linewidth=2,
)
ax.set_title("Occurence of rain in Years", size=30)
plt.show()

In [None]:
# Summary statistics restricted to the selected cities.
mainData.describe()

In [None]:
# Boxen plot of the numeric columns for the selected cities only.
plt.figure(figsize=(20, 5))
box_data = mainData[numeric_variables]
sns.boxenplot(color='#729BED', data=box_data)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Renumber rows 0..n-1 after filtering; drop=True discards the old index.
mainData = mainData.reset_index(drop= True)
mainData.head()

### Adicionando Rain Yesterday

In [None]:
# Build RainYesterday: the previous day's RainToday for the same city, or 0
# when no observation exists for the previous calendar day.
#
# The earlier version compared day-of-month numbers only (Day[i] == Day[i-1]+1).
# That missed every month boundary (the 1st never follows the 31st), accepted
# rows from different months or years, and let the first row of one city
# inherit the last row of another.

# Rebuild each row's calendar date from its Year / Month / Day parts.
obs_date = pd.to_datetime(
    mainData[['Year', 'Month', 'Day']].rename(columns=str.lower)
)

# Previous row within the same location (rows are taken in their current order).
by_location = mainData['Location']
previous_date = obs_date.groupby(by_location).shift(1)
previous_rain = mainData['RainToday'].groupby(by_location).shift(1)

# Keep the previous row's value only when it is exactly one day earlier; the
# first row of each city and rows following a gap fall back to 0.
is_next_day = (obs_date - previous_date) == pd.Timedelta(days=1)
mainData['RainYesterday'] = previous_rain.where(is_next_day, 0)
mainData.describe()
mainData.describe()

# Modelos Preditivos

In [None]:
from sklearn.model_selection import cross_val_score

### Removendo colunas Location, Year e Day

In [None]:
# Drop Location, Year and Day in place so they are not used as model features.
mainData.drop(['Location','Year','Day'], axis=1, inplace=True)

### Separação de dados de treino e teste
O conjunto de teste terá 33% dos dados e os dados serão embaralhados

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix: every column except the target. Target: a 1-D label array.
features = mainData.drop(columns='RainTomorrow').values
target = mainData['RainTomorrow'].values

# Shuffled 67/33 split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.33,
    shuffle=True,
    random_state=42,
)

## Regressão Logística

### Otimizando hiperparâmetros

In [None]:
from sklearn.linear_model import LogisticRegression

# Sweep the inverse regularisation strength C over 10^-1 .. 10^4 and report
# accuracy on the training and test sets for each value.
for exponent in range(-1, 5):
    c = 10**exponent

    clf = LogisticRegression(C=c, solver='liblinear', random_state=0)
    clf.fit(x_train, y_train)

    train_score = clf.score(x_train, y_train)
    test_score = clf.score(x_test, y_test)
    print('C= ', c, ' train score = ', train_score)
    print('C= ', c, ' test score = ', test_score, '\n')

Escolhemos C = 1000, penalty = 'l2' e solver = 'liblinear' por terem apresentado o melhor resultado no conjunto de teste

In [None]:
# Final logistic-regression model with the C value chosen from the sweep.
clfLR = LogisticRegression(
        C=1000,
        solver= 'liblinear',
        random_state=0)

# Fit on the training split only.
clfLR.fit(x_train, y_train)

In [None]:
# Hard class predictions for the held-out test split.
y_pred = clfLR.predict(x_test)

### Validação Cruzada
Obtivemos melhores resultados utilizando 15 ciclos de validação cruzada

In [None]:
# 15-fold cross-validated accuracy over the complete feature matrix.
cv = cross_val_score(clfLR, features, target, cv=15, scoring='accuracy')
mean_pct = np.mean(cv)*100
std_pct = np.std(cv)*100
print('Acurácia média: %0.2f%% +- %0.2f%%'%(mean_pct, std_pct))

In [None]:
# Tabulate the per-fold accuracies (folds numbered 1..15) and summarise them.
cv_df = pd.DataFrame({'Ciclo': range(1,16), 'Acurácia': cv })
cv_df.describe()

Acurácia máxima = 87,4618 %

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the logistic-regression predictions.
print(classification_report(y_test, y_pred))

## Acurácia de Classificação

### Matriz de Confusão

In [None]:
from sklearn.metrics import confusion_matrix

# scikit-learn lays the matrix out as cm[true, predicted] with labels sorted
# ascending, so for the 0/1 target (1 = rain) the flattened order is
# TN, FP, FN, TP. The previous code read cm[0,0] as TP and cm[1,1] as TN,
# which swapped the two.
cm = confusion_matrix(y_test, y_pred)
TN, FP, FN, TP = cm.ravel()

In [None]:
# plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement (available since 1.0).
from sklearn.metrics import ConfusionMatrixDisplay

disp = ConfusionMatrixDisplay.from_estimator(
            clfLR,
            x_test,
            y_test,
            cmap=plt.cm.Blues)
disp.ax_.set_title('Confusion Matrix - Logistic Regression')
plt.grid(False)
plt.show()

In [None]:
# Predicted probability of the second class (column 1 of predict_proba).
y_pred_prob = clfLR.predict_proba(x_test)[:,1]
# Reshape to a single-column 2-D array.
y_pred_prob = y_pred_prob.reshape(-1,1)
print(y_pred_prob)

### Distribuição de probabilidades
Como a distribuição de probabilidades está concentrada entre 0 e 0.2, a diminuição do limite de decisão de classificação (padrão 0.5) pode aumentar significativamente a quantidade de verdadeiros positivos

In [None]:
# Distribution of the predicted rain probabilities on the test split.
plt.rcParams['font.size'] = 12
plt.hist(y_pred_prob, bins=10)
plt.xlim(0, 1)
plt.title('Histogram of predicted probabilities of rain')
plt.ylabel('Frequency')
plt.xlabel('Predicted probabilities of rain')

### ROC e AUC

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

# ROC curve of the logistic-regression probabilities, with the chance diagonal.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)

plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, linewidth=2)
plt.plot([0, 1], [0, 1], 'k--')
plt.rcParams['font.size'] = 12
plt.title('ROC curve for RainTomorrow classifier - Logistic Regression')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.show()

# Area under the curve, reported as a percentage.
ROC_AUC = roc_auc_score(y_test, y_pred_prob)
print('ROC AUC : {:.2f}%'.format(ROC_AUC*100))

### Naive Bayes

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings.
naiveBayesClf = GaussianNB()

naiveBayesClf.fit(x_train, y_train)
y_pred = naiveBayesClf.predict(x_test)

print(classification_report(y_test, y_pred))

# Confusion matrix: cm[true, predicted] with labels sorted ascending, so the
# flattened order is TN, FP, FN, TP (the earlier indexing swapped TP and TN).
cm = confusion_matrix(y_test, y_pred)
TN, FP, FN, TP = cm.ravel()

# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix,
# which was removed in scikit-learn 1.2.
disp = ConfusionMatrixDisplay.from_estimator(
            naiveBayesClf,
            x_test,
            y_test,
            cmap=plt.cm.Blues)
disp.ax_.set_title('Confusion Matrix - Naive Bayes')
plt.grid(False)
plt.show()

# Predicted probability of the positive class, as a single-column array.
y_pred_prob = naiveBayesClf.predict_proba(x_test)[:,1]
y_pred_prob = y_pred_prob.reshape(-1,1)
print(y_pred_prob)

# ROC curve with the chance diagonal for reference.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
plt.figure(figsize=(6,4))
plt.plot(fpr, tpr, linewidth=2)
plt.plot([0,1], [0,1], 'k--' )
plt.rcParams['font.size'] = 12
plt.title('ROC curve for RainTomorrow classifier - Naive Bayes')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.show()

ROC_AUC = roc_auc_score(y_test, y_pred_prob)
print('ROC AUC : {:.2f}%'.format(ROC_AUC*100))

### Decision Tree

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

decisionTreeClf = DecisionTreeClassifier()

# 'auto' was dropped from the grid: for a classification tree it was an alias
# of 'sqrt' (so the search space is unchanged) and it was removed in
# scikit-learn 1.3, where it makes the search fail.
parameters = dict(
                   max_features = ['sqrt', 'log2'],
                   class_weight = [None, 'balanced']
                  )

# Grid search with default cross-validation; fit() returns the same object,
# so both names refer to the fitted search.
decisionTreeClf = GridSearchCV(decisionTreeClf, parameters)
decisionTreeClf_Trained = decisionTreeClf.fit(x_train, y_train)

print('Melhor combinação de parâmetros encontrada:')
print(decisionTreeClf_Trained.best_params_)

y_pred = decisionTreeClf_Trained.predict(x_test)
print(classification_report(y_test, y_pred))

# Confusion matrix: cm[true, predicted] with labels sorted ascending, so the
# flattened order is TN, FP, FN, TP (the earlier indexing swapped TP and TN).
cm = confusion_matrix(y_test, y_pred)
TN, FP, FN, TP = cm.ravel()

# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix,
# which was removed in scikit-learn 1.2.
disp = ConfusionMatrixDisplay.from_estimator(
            decisionTreeClf,
            x_test,
            y_test,
            cmap=plt.cm.Blues)
disp.ax_.set_title('Confusion Matrix - Decision Tree')
plt.grid(False)
plt.show()

# Predicted probability of the positive class, as a single-column array.
y_pred_prob = decisionTreeClf.predict_proba(x_test)[:,1]
y_pred_prob = y_pred_prob.reshape(-1,1)
print(y_pred_prob)

# ROC curve with the chance diagonal for reference.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
plt.figure(figsize=(6,4))
plt.plot(fpr, tpr, linewidth=2)
plt.plot([0,1], [0,1], 'k--' )
plt.rcParams['font.size'] = 12
plt.title('ROC curve for RainTomorrow classifier - Decision Tree')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.show()

ROC_AUC = roc_auc_score(y_test, y_pred_prob)
print('ROC AUC : {:.2f}%'.format(ROC_AUC*100))

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV

randomForestClf = RandomForestClassifier()

parameters = dict(
                   n_estimators = [10, 20, 50, 100],
                   max_features = ['sqrt', 'log2'],
                   class_weight = [None, 'balanced']
                  )

# Grid search with default cross-validation; fit() returns the same object,
# so both names refer to the fitted search.
randomForestClf = GridSearchCV(randomForestClf, parameters)
randomForestClf_Trained = randomForestClf.fit(x_train, y_train)

print('Melhor combinação de parâmetros encontrada:')
print(randomForestClf_Trained.best_params_)

y_pred = randomForestClf_Trained.predict(x_test)
print(classification_report(y_test, y_pred))

# Confusion matrix: cm[true, predicted] with labels sorted ascending, so the
# flattened order is TN, FP, FN, TP (the earlier indexing swapped TP and TN).
cm = confusion_matrix(y_test, y_pred)
TN, FP, FN, TP = cm.ravel()

# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix,
# which was removed in scikit-learn 1.2.
disp = ConfusionMatrixDisplay.from_estimator(
            randomForestClf,
            x_test,
            y_test,
            cmap=plt.cm.Blues)
# Title corrected: it previously read "Decision Tree" (copy-paste slip).
disp.ax_.set_title('Confusion Matrix - Random Forest')
plt.grid(False)
plt.show()

# Predicted probability of the positive class, as a single-column array.
y_pred_prob = randomForestClf.predict_proba(x_test)[:,1]
y_pred_prob = y_pred_prob.reshape(-1,1)
print(y_pred_prob)

# ROC curve with the chance diagonal for reference.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
plt.figure(figsize=(6,4))
plt.plot(fpr, tpr, linewidth=2)
plt.plot([0,1], [0,1], 'k--' )
plt.rcParams['font.size'] = 12
plt.title('ROC curve for RainTomorrow classifier - Random Forest')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.show()

ROC_AUC = roc_auc_score(y_test, y_pred_prob)
print('ROC AUC : {:.2f}%'.format(ROC_AUC*100))

### Gradient Boost

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV

gradientBoostingClf = GradientBoostingClassifier()

# 'deviance' was renamed to 'log_loss' in scikit-learn 1.1 and the old name
# was removed in 1.3; both select the same loss.
parameters = dict(
                   loss = ['log_loss', 'exponential'],
                   n_estimators = [100, 200],
                   max_depth = [3, 5],
                   max_features = ['sqrt', 'log2']
                  )

# Grid search with default cross-validation; fit() returns the same object,
# so both names refer to the fitted search.
gradientBoostingClf = GridSearchCV(gradientBoostingClf, parameters)
gradientBoostingClf_Trained = gradientBoostingClf.fit(x_train, y_train)

print('Melhor combinação de parâmetros encontrada:')
print(gradientBoostingClf_Trained.best_params_)

y_pred = gradientBoostingClf_Trained.predict(x_test)
print(classification_report(y_test, y_pred))

# Confusion matrix: cm[true, predicted] with labels sorted ascending, so the
# flattened order is TN, FP, FN, TP (the earlier indexing swapped TP and TN).
cm = confusion_matrix(y_test, y_pred)
TN, FP, FN, TP = cm.ravel()

# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix,
# which was removed in scikit-learn 1.2.
disp = ConfusionMatrixDisplay.from_estimator(
            gradientBoostingClf,
            x_test,
            y_test,
            cmap=plt.cm.Blues)
disp.ax_.set_title('Confusion Matrix - Gradient Boosting')
plt.grid(False)
plt.show()

# Predicted probability of the positive class, as a single-column array.
y_pred_prob = gradientBoostingClf.predict_proba(x_test)[:,1]
y_pred_prob = y_pred_prob.reshape(-1,1)
print(y_pred_prob)

# ROC curve with the chance diagonal for reference.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
plt.figure(figsize=(6,4))
plt.plot(fpr, tpr, linewidth=2)
plt.plot([0,1], [0,1], 'k--' )
plt.rcParams['font.size'] = 12
plt.title('ROC curve for RainTomorrow classifier - Gradient Boosting')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.show()

ROC_AUC = roc_auc_score(y_test, y_pred_prob)
print('ROC AUC : {:.2f}%'.format(ROC_AUC*100))

### Support Vector Machine

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

svcClf = SVC()

# probability=True is fixed in the grid so that the selected model exposes
# predict_proba, which the ROC section below requires.
parameters = dict(
                   gamma = ['scale', 'auto'],
                   class_weight = [None, 'balanced'],
                   probability = [True]
                  )

# Grid search with default cross-validation; fit() returns the same object,
# so both names refer to the fitted search.
svcClf = GridSearchCV(svcClf, parameters)
svcClf_Trained = svcClf.fit(x_train, y_train)

print('Melhor combinação de parâmetros encontrada:')
print(svcClf_Trained.best_params_)

y_pred = svcClf_Trained.predict(x_test)
print(classification_report(y_test, y_pred))

# Confusion matrix: cm[true, predicted] with labels sorted ascending, so the
# flattened order is TN, FP, FN, TP (the earlier indexing swapped TP and TN).
cm = confusion_matrix(y_test, y_pred)
TN, FP, FN, TP = cm.ravel()

# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix,
# which was removed in scikit-learn 1.2.
disp = ConfusionMatrixDisplay.from_estimator(
            svcClf,
            x_test,
            y_test,
            cmap=plt.cm.Blues)
disp.ax_.set_title('Confusion Matrix - Support-Vector Machine')
plt.grid(False)
plt.show()

# Predicted probability of the positive class, as a single-column array.
y_pred_prob = svcClf.predict_proba(x_test)[:,1]
y_pred_prob = y_pred_prob.reshape(-1,1)
print(y_pred_prob)

# ROC curve with the chance diagonal for reference.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
plt.figure(figsize=(6,4))
plt.plot(fpr, tpr, linewidth=2)
plt.plot([0,1], [0,1], 'k--' )
plt.rcParams['font.size'] = 12
plt.title('ROC curve for RainTomorrow classifier - Support-Vector Machine')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.show()

ROC_AUC = roc_auc_score(y_test, y_pred_prob)
print('ROC AUC : {:.2f}%'.format(ROC_AUC*100))

### Neural Networks

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier

neuralNetworksClf = MLPClassifier()

# The previous grid searched n_estimators and max_depth, which are tree
# ensemble parameters: MLPClassifier rejects them and GridSearchCV raised
# "Invalid parameter". Search network size and L2 penalty instead.
# NOTE(review): the candidate values below are a reasonable starting grid,
# not tuned choices - adjust as needed.
parameters = dict(
                   hidden_layer_sizes = [(50,), (100,), (50, 50)],
                   alpha = [1e-4, 1e-3, 1e-2]
                  )

# Grid search with default cross-validation; fit() returns the same object,
# so both names refer to the fitted search.
neuralNetworksClf = GridSearchCV(neuralNetworksClf, parameters)
neuralNetworksClf_Trained = neuralNetworksClf.fit(x_train, y_train)

print('Melhor combinação de parâmetros encontrada:')
print(neuralNetworksClf_Trained.best_params_)

y_pred = neuralNetworksClf_Trained.predict(x_test)
print(classification_report(y_test, y_pred))

# Confusion matrix: cm[true, predicted] with labels sorted ascending, so the
# flattened order is TN, FP, FN, TP (the earlier indexing swapped TP and TN).
cm = confusion_matrix(y_test, y_pred)
TN, FP, FN, TP = cm.ravel()

# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix,
# which was removed in scikit-learn 1.2.
disp = ConfusionMatrixDisplay.from_estimator(
            neuralNetworksClf,
            x_test,
            y_test,
            cmap=plt.cm.Blues)
disp.ax_.set_title('Confusion Matrix - Neural Networks')
plt.grid(False)
plt.show()

# Predicted probability of the positive class, as a single-column array.
y_pred_prob = neuralNetworksClf.predict_proba(x_test)[:,1]
y_pred_prob = y_pred_prob.reshape(-1,1)
print(y_pred_prob)

# ROC curve with the chance diagonal for reference.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
plt.figure(figsize=(6,4))
plt.plot(fpr, tpr, linewidth=2)
plt.plot([0,1], [0,1], 'k--' )
plt.rcParams['font.size'] = 12
plt.title('ROC curve for RainTomorrow classifier - Neural Networks')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.show()

ROC_AUC = roc_auc_score(y_test, y_pred_prob)
print('ROC AUC : {:.2f}%'.format(ROC_AUC*100))