In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the weather CSV; the 'Date' column is parsed into datetime values on read.
df = pd.read_csv(
    '/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv',
    parse_dates=['Date'],
)

# 1. **We need to explore the data in more detail**

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the frame's columns.
df.describe()

In [None]:
# Fraction of missing values per column (missing count divided by row count).
df.isna().sum().div(len(df))

# What have we learned?
* We have 145460 entries and 23 columns
* Some columns have NA values, but no column is missing more than 50% of its values
* The data has 6 columns of type object, 1 of type datetime64, and the rest are float64

**First of all, let's split the date column into separate date-part columns so we get more information out of the dates**

In [None]:
# Derive one column per calendar component of 'Date'.
for new_column, dt_attribute in (
    ('Year', 'year'),
    ('Month', 'month'),
    ('Day', 'day'),
    ('Dayofweek', 'dayofweek'),
    ('Dayofyear', 'dayofyear'),
):
    df[new_column] = getattr(df['Date'].dt, dt_attribute)

# The original 'Date' column is no longer needed once its parts are extracted.
df.drop(columns='Date', inplace=True)

In [None]:
# Preview the frame again to inspect the new date-part columns.

df.head()

# Convert strings (objects) to categories
**We'll use the pandas API**

In [None]:
# Print the name of every column that pandas recognises as string-typed.

for name in df.columns:
    if pd.api.types.is_string_dtype(df[name]):
        print(name)

**Okay, 6 columns have a string type**

In [None]:
# Convert every string-typed column to an ordered pandas categorical.

text_columns = [name for name in df.columns
                if pd.api.types.is_string_dtype(df[name])]
for name in text_columns:
    df[name] = df[name].astype('category').cat.as_ordered()

In [None]:
# Re-check the dtypes after the string -> category conversion.

df.info()

# Now it's time to fill the missing values
* All of our data must be numerical
* There can't be any missing values

In [None]:
# Print the name of every numeric column.
for name in df.columns:
    if pd.api.types.is_numeric_dtype(df[name]):
        print(name)

In [None]:
# Print the numeric columns that contain at least one null value.

for name, column in df.items():
    is_numeric = pd.api.types.is_numeric_dtype(column)
    if is_numeric and column.isna().any():
        print(name)

In [None]:
# For each numeric column with gaps: record where the gaps were in a
# '<column>_is_missing' indicator, then fill the gaps with the column median.

numeric_with_gaps = [
    name for name, column in df.items()
    if pd.api.types.is_numeric_dtype(column) and column.isna().any()
]
for name in numeric_with_gaps:
    column = df[name]
    df[name + '_is_missing'] = column.isna()
    df[name] = column.fillna(column.median())

In [None]:
# Sanity check: nothing should be printed if every numeric gap was filled.
for name in df.columns:
    column = df[name]
    if not pd.api.types.is_numeric_dtype(column):
        continue
    if column.isna().any():
        print(name)

In [None]:
# Preview the frame after the median fill (new '*_is_missing' columns included).
df.head()

In [None]:
# Remaining fraction of missing values per column after the numeric fill.
df.isna().sum() / df.shape[0]

Let's use dropna() to remove the remaining rows that contain NaN.

The big advantage is that we won't lose a lot of data.

In [None]:
# Drop, in place, every row that still has at least one missing value.
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Fraction of missing values per column after dropping incomplete rows.
df.isna().sum().div(df.shape[0])

In [None]:
# Row count and dtypes after dropping incomplete rows.
df.info()

# Now it's time to turn categorical values into numerical ones

In [None]:
# Integer-encode every remaining non-numeric (categorical) column.
#
# * Feature columns keep the original `codes + 1` scheme: pandas uses -1 for a
#   missing category, so after the shift 0 means "missing" and real categories
#   start at 1.
# * A '<column>_is_missing' indicator is only added when the column actually
#   contains nulls (same rule as the numeric fill), so no constant all-False
#   columns are created.
# * The target column is encoded 0-based and gets no indicator column: recent
#   XGBoost releases reject class labels that do not start at 0, and an
#   indicator derived from the target must not end up among the features.
#   Rows with a missing target are expected to have been dropped already.
TARGET_COLUMN = 'RainTomorrow'

for label, content in df.items():
    if pd.api.types.is_numeric_dtype(content):
        continue

    codes = pd.Categorical(content).codes  # -1 marks a missing category

    if label == TARGET_COLUMN:
        df[label] = codes
        continue

    missing = pd.isnull(content)
    if missing.any():
        df[label+'_is_missing'] = missing
    df[label] = codes + 1

# Why codes+1? Because pandas encodes missing categories as -1, so adding 1 maps them to 0

In [None]:
# Confirm the dtypes after encoding the categorical columns.

df.info()

In [None]:
# Preview the encoded frame.
df.head()

# Let's look at the correlation matrix

In [None]:
# Keep an independent (deep) copy of the preprocessed frame.
df_tmp = df.copy(deep=True)

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap
# on a large figure so the cell labels stay readable.

corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(30, 20))
ax = sns.heatmap(
    corr_matrix,
    ax=ax,
    annot=True,
    linewidth=0.5,
    fmt='.2f',
    cmap='YlGnBu',
)

# Predictive Modelling
**We'll use:**
* Logistic Regression
* K-Nearest Neighbors
* Decision Trees
* Random Forest
* Naive Bayes
* XGBoost

But before we start, we must use train_test_split to split our data into training and test sets of X and y

In [None]:
# Split the preprocessed frame into features/target and then into train/test sets.

from sklearn.model_selection import train_test_split

features = df.drop('RainTomorrow',axis=1)  # Our X set
target = df['RainTomorrow']  # Our y set

# random_state pins the shuffle so every score reported below is reproducible
# across runs (7 matches the seed used elsewhere in this notebook);
# stratify keeps the rain / no-rain proportion identical in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=7,
    stratify=target,
)

In [None]:
# Logistic Regression baseline with default hyperparameters.
# Note: ignore_warnings and ConvergenceWarning are imported but not referenced in this cell.
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import classification_report, accuracy_score

from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so the model can be built and trained in one step.
lr = LogisticRegression().fit(X_train, y_train)
lr_predict = lr.predict(X_test)

# Test-set accuracy followed by the per-class precision/recall/F1 table.
print(accuracy_score(y_true=y_test, y_pred=lr_predict))
print(classification_report(y_true=y_test, y_pred=lr_predict))

In [None]:
# K-Nearest Neighbors baseline with default hyperparameters.

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier().fit(X_train, y_train)
knn_predict = knn.predict(X_test)

# Test-set accuracy followed by the per-class report.
print(accuracy_score(y_true=y_test, y_pred=knn_predict))
print(classification_report(y_true=y_test, y_pred=knn_predict))

In [None]:
# Decision Tree baseline (seeded for repeatable tree construction).

from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(random_state=7).fit(X_train, y_train)
tree_predict = tree.predict(X_test)

# Test-set accuracy followed by the per-class report.
print(accuracy_score(y_true=y_test, y_pred=tree_predict))
print(classification_report(y_true=y_test, y_pred=tree_predict))

In [None]:
# Random Forest baseline (seeded for repeatable bootstrapping/feature sampling).

from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(random_state=7).fit(X_train, y_train)
forest_predict = forest.predict(X_test)

# Test-set accuracy followed by the per-class report.
print(accuracy_score(y_true=y_test, y_pred=forest_predict))
print(classification_report(y_true=y_test, y_pred=forest_predict))

In [None]:
# Gaussian Naive Bayes baseline with default settings.

from sklearn.naive_bayes import GaussianNB

bayes = GaussianNB().fit(X_train, y_train)
bayes_predict = bayes.predict(X_test)

# Test-set accuracy followed by the per-class report.
print(accuracy_score(y_true=y_test, y_pred=bayes_predict))
print(classification_report(y_true=y_test, y_pred=bayes_predict))

In [None]:
# XGBoost baseline with default hyperparameters.

# Import the class directly instead of `import xgboost as xgb`: the name `xgb`
# is used for the fitted model in later cells, and binding both the module and
# the model to it made this cell fail with AttributeError when run a second time.
from xgboost import XGBClassifier

xgb = XGBClassifier()
xgb.fit(X_train, y_train)
xgb_predict = xgb.predict(X_test)

# Test-set accuracy followed by the per-class report.
print(accuracy_score(y_test, xgb_predict))
print(classification_report(y_test, xgb_predict))

In [None]:
# Test-set accuracy (estimator.score) of every baseline model, keyed by display name.
models_default_scores = {
    model_name: estimator.score(X_test, y_test)
    for model_name, estimator in (
        ('Logistic Regression', lr),
        ('KNearest Neighbors', knn),
        ('Decision Tree', tree),
        ('Random Forest Classifier', forest),
        ('Naive Bayes GNB', bayes),
        ('XGBoost', xgb),
    )
}

In [None]:
# Display the baseline accuracy of each model.
models_default_scores

In [None]:
# One-row frame of baseline accuracies, transposed so each model becomes a bar.
default_models_compare = pd.DataFrame(data=models_default_scores, index=['accuracy'])
default_models_compare.transpose().plot(kind='bar')

# Hyperparameter tuning with RandomizedSearchCV

In [None]:
# Search spaces used for the randomized hyperparameter searches.

# Logistic Regression: 20 log-spaced regularisation strengths, two solvers.
lr_grid = dict(
    C=np.logspace(start=-4, stop=4, num=20),
    solver=['liblinear', 'saga'],
)

# Random Forest Classifier: forest size, depth and leaf/split size limits.
forest_grid = dict(
    n_estimators=np.arange(10, 600, 10),
    max_depth=np.arange(1, 12),
    min_samples_leaf=np.arange(2, 14, 2),
    min_samples_split=np.arange(2, 14, 2),
)

# Decision Tree: depth, features considered per split, leaf size and split criterion.
tree_grid = dict(
    max_depth=np.arange(1, 9),
    max_features=np.arange(1, 12),
    min_samples_leaf=np.arange(1, 9),
    criterion=['gini', 'entropy'],
)

# Tune Logistic Regression with RandomizedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Seed NumPy's global RNG before running the search.
np.random.seed(7)

# Randomized search over lr_grid: 20 sampled candidates, 5-fold cross-validation each.
lr_cv = RandomizedSearchCV(
    estimator=LogisticRegression(),
    param_distributions=lr_grid,
    n_iter=20,
    cv=5,
    verbose=True,
)

lr_cv.fit(X_train, y_train)

In [None]:
# Hyperparameters of the best candidate found by the logistic-regression search.
lr_cv.best_params_

In [None]:
# Score of the tuned logistic regression on the held-out test set.
lr_cv.score(X_test, y_test)

In [None]:
# Test-set predictions from the tuned logistic regression.
lr_y_preds = lr_cv.predict(X_test)

# Tune Random Forest Classifier with RandomizedSearchCV

In [None]:
# Seed NumPy's global RNG before running the search.
np.random.seed(7)

# Base forest: all CPU cores, each tree trained on at most 10000 sampled rows.
forest_base = RandomForestClassifier(n_jobs=-1, max_samples=10000)

# Randomized search over forest_grid: 20 sampled candidates, 5-fold cross-validation each.
forest_cv = RandomizedSearchCV(
    estimator=forest_base,
    param_distributions=forest_grid,
    n_iter=20,
    cv=5,
    verbose=True,
)

forest_cv.fit(X_train, y_train)

In [None]:
# Hyperparameters of the best candidate found by the random-forest search.
forest_cv.best_params_

In [None]:
# Score of the tuned random forest on the held-out test set.
forest_cv.score(X_test, y_test)

In [None]:
# Test-set predictions from the tuned random forest.
forest_y_preds = forest_cv.predict(X_test)

# Tune Decision Tree Classifier with Randomized Search

In [None]:
# Seed NumPy's global RNG before running the search.
np.random.seed(7)

# Randomized search over tree_grid: 20 sampled candidates, 5-fold cross-validation each.
tree_cv = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(),
    param_distributions=tree_grid,
    n_iter=20,
    cv=5,
    verbose=True,
)

tree_cv.fit(X_train, y_train)

In [None]:
# Score of the tuned decision tree on the held-out test set.
tree_cv.score(X_test, y_test)

In [None]:
# Test-set predictions from the tuned decision tree.
tree_y_preds = tree_cv.predict(X_test)

In [None]:
# Test-set scores after tuning: the three searched models use their *_cv search
# objects; KNN, Naive Bayes and XGBoost were not tuned and keep their baseline fits.
updated_models_scores = {
    model_name: estimator.score(X_test, y_test)
    for model_name, estimator in (
        ('Logistic Regression', lr_cv),
        ('KNearest Neighbors', knn),
        ('Decision Tree', tree_cv),
        ('Random Forest Classifier', forest_cv),
        ('Naive Bayes GNB', bayes),
        ('XGBoost', xgb),
    )
}

In [None]:
# Before tuning: baseline test-set scores of the default models.
models_default_scores

In [None]:
# After tuning: test-set scores with the RandomizedSearchCV results substituted in.
updated_models_scores

In [None]:
# As we can see, we were able to improve our Logistic Regression, Decision Tree and Random Forest Classifier

In [None]:
# The best model overall is XGBoost, so plot its ROC curve on the test set.

from sklearn import metrics

# metrics.plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer RocCurveDisplay.from_estimator and only fall back to the legacy helper
# on older installations where from_estimator does not exist yet.
plot_roc = getattr(metrics.RocCurveDisplay, 'from_estimator', None) or metrics.plot_roc_curve

plot_roc(xgb, X_test, y_test)

In [None]:
# Our AUC score is 0.90! That's a good result.
# Next, print the confusion matrix of the XGBoost predictions on the test set.

print(metrics.confusion_matrix(y_test, xgb_predict))

In [None]:
# Let's print the classification report for the XGBoost test-set predictions.

print(classification_report(y_test, xgb_predict))

# Finally, our model has 86% accuracy,
# which is a very good result. If you like my work, please rate it and leave a comment; I would be very pleased.