# Rain Prediction
# Decision Trees and Random Forests 


In [None]:
!pip install pandas numpy matplotlib seaborn --quiet

In [None]:
!pip install opendatasets scikit-learn --quiet --upgrade

In [None]:
import opendatasets as od
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib
import os
%matplotlib inline

# Show every column and up to 150 rows when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 150)
# Default plot styling: seaborn dark grid, larger font, 10 x 6 figures.
sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (10, 6)
# '#00000000' is an RGBA hex colour with alpha 00, i.e. a transparent figure background.
matplotlib.rcParams['figure.facecolor'] = '#00000000'

## Downloading the Data

The dataset is available at https://www.kaggle.com/jsphyg/weather-dataset-rattle-package .

In [None]:
od.download('https://www.kaggle.com/jsphyg/weather-dataset-rattle-package')

In [None]:
os.listdir('weather-dataset-rattle-package')

In [None]:
raw_df = pd.read_csv('weather-dataset-rattle-package/weatherAUS.csv')

In [None]:
raw_df

In [None]:
raw_df.info()

In [None]:
raw_df.dropna(subset=['RainTomorrow'], inplace=True)

## Preparing the Data for Training



### Training, Validation and Test Sets

In [None]:
plt.title('No. of Rows per Year')
sns.countplot(x=pd.to_datetime(raw_df.Date).dt.year);

In [None]:
# Chronological split: rows before 2015 are used for training, 2015 for
# validation, and everything after 2015 for testing.
year = pd.to_datetime(raw_df['Date']).dt.year

train_df, val_df, test_df = (
    raw_df.loc[mask] for mask in (year < 2015, year == 2015, year > 2015)
)

In [None]:
print('train_df.shape :', train_df.shape)
print('val_df.shape :', val_df.shape)
print('test_df.shape :', test_df.shape)

### Input and Target Columns



In [None]:
input_cols = list(train_df.columns)[1:-1]
target_col = 'RainTomorrow'

In [None]:
train_inputs = train_df[input_cols].copy()
train_targets = train_df[target_col].copy()

In [None]:
val_inputs = val_df[input_cols].copy()
val_targets = val_df[target_col].copy()

In [None]:
test_inputs = test_df[input_cols].copy()
test_targets = test_df[target_col].copy()

In [None]:
numeric_cols = train_inputs.select_dtypes(include=np.number).columns.tolist()
categorical_cols = train_inputs.select_dtypes('object').columns.tolist()

In [None]:
print(numeric_cols)

In [None]:
print(categorical_cols)

### Imputing missing numeric values

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Learn the per-column means from the training rows only. Fitting on the full
# raw_df would let statistics from the validation/test years leak into the
# preprocessing applied to the training data.
imputer = SimpleImputer(strategy='mean').fit(train_df[numeric_cols])

In [None]:
train_inputs[numeric_cols] = imputer.transform(train_inputs[numeric_cols])
val_inputs[numeric_cols] = imputer.transform(val_inputs[numeric_cols])
test_inputs[numeric_cols] = imputer.transform(test_inputs[numeric_cols])

In [None]:
test_inputs[numeric_cols].isna().sum()

### Scaling Numeric Features

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Learn each column's min/max from the training rows only (no leakage from the
# validation/test years). Validation/test values may therefore fall slightly
# outside [0, 1] after scaling.
scaler = MinMaxScaler().fit(train_df[numeric_cols])

In [None]:
train_inputs[numeric_cols] = scaler.transform(train_inputs[numeric_cols])
val_inputs[numeric_cols] = scaler.transform(val_inputs[numeric_cols])
test_inputs[numeric_cols] = scaler.transform(test_inputs[numeric_cols])

In [None]:
val_inputs.describe().loc[['min', 'max']]

### Encoding Categorical Data

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output`, and the old
# name has since been removed. Try the current keyword first and fall back to
# the legacy one on older installations.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoder = encoder.fit(raw_df[categorical_cols])

In [None]:
# `get_feature_names` was removed in scikit-learn 1.2; `get_feature_names_out`
# is its replacement and returns the generated one-hot column names.
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))

In [None]:
train_inputs[encoded_cols] = encoder.transform(train_inputs[categorical_cols])
val_inputs[encoded_cols] = encoder.transform(val_inputs[categorical_cols])
test_inputs[encoded_cols] = encoder.transform(test_inputs[categorical_cols])

In [None]:
test_inputs

In [None]:
X_train = train_inputs[numeric_cols + encoded_cols]
X_val = val_inputs[numeric_cols + encoded_cols]
X_test = test_inputs[numeric_cols + encoded_cols]

In [None]:
X_test

### Training

We can use `DecisionTreeClassifier` from `sklearn.tree` to train a decision tree.

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
model = DecisionTreeClassifier(random_state=42)

In [None]:
%%time
model.fit(X_train, train_targets)

### Evaluation


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
train_preds = model.predict(X_train)

In [None]:
train_preds

In [None]:
# The top-level pd.value_counts function is deprecated; wrap the prediction
# array in a Series and use the method instead.
pd.Series(train_preds).value_counts()

In [None]:
train_probs = model.predict_proba(X_train)

In [None]:
train_probs

In [None]:
accuracy_score(train_targets, train_preds)

In [None]:
model.score(X_val, val_targets)

In [None]:
val_targets.value_counts() / len(val_targets)

### Visualization



In [None]:
from sklearn.tree import plot_tree, export_text

In [None]:
plt.figure(figsize=(80, 20))
# Pass feature names as a plain list (as the export_text calls do): some
# scikit-learn releases reject a pandas Index for this parameter.
plot_tree(model, feature_names=list(X_train.columns), max_depth=2, filled=True);

In [None]:
model.tree_.max_depth

In [None]:
tree_text = export_text(model, max_depth=10, feature_names=list(X_train.columns))
print(tree_text[:5000])

### Feature Importance



In [None]:
model.feature_importances_

In [None]:
importance_df = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

In [None]:
importance_df.head(10)

In [None]:
plt.title('Feature Importance')
sns.barplot(data=importance_df.head(10), x='importance', y='feature');

## Hyperparameter Tuning and Overfitting

In [None]:
?DecisionTreeClassifier

In [None]:
model = DecisionTreeClassifier(max_depth=3, random_state=42)

In [None]:
model.fit(X_train, train_targets)

In [None]:
model.score(X_train, train_targets)

In [None]:
model.score(X_val, val_targets)

In [None]:
model.classes_

In [None]:
plt.figure(figsize=(80, 20))
# Feature and class names are passed as plain lists: some scikit-learn releases
# reject a pandas Index / NumPy array for these parameters.
plot_tree(model,
          feature_names=list(X_train.columns),
          filled=True,
          rounded=True,
          class_names=list(model.classes_));

In [None]:
print(export_text(model, feature_names=list(X_train.columns)))

In [None]:
def max_depth_error(md):
    """Train a depth-limited decision tree and report its error rates.

    A ``DecisionTreeClassifier`` with ``max_depth=md`` and ``random_state=42``
    is fitted on the notebook-level ``X_train`` / ``train_targets`` and scored
    on both the training and validation sets.

    Args:
        md: Value passed to the classifier's ``max_depth`` parameter.

    Returns:
        A dict with the keys ``'Max Depth'``, ``'Training Error'`` and
        ``'Validation Error'``, where each error is ``1 - score``.
    """
    tree = DecisionTreeClassifier(max_depth=md, random_state=42)
    tree.fit(X_train, train_targets)
    # score() returns accuracy, so the error rate is its complement.
    train_error = 1 - tree.score(X_train, train_targets)
    val_error = 1 - tree.score(X_val, val_targets)
    return {
        'Max Depth': md,
        'Training Error': train_error,
        'Validation Error': val_error,
    }

In [None]:
%%time
errors_df = pd.DataFrame([max_depth_error(md) for md in range(1, 21)])

In [None]:
errors_df

In [None]:
plt.figure()
plt.plot(errors_df['Max Depth'], errors_df['Training Error'])
plt.plot(errors_df['Max Depth'], errors_df['Validation Error'])
plt.title('Training vs. Validation Error')
plt.xticks(range(0,21, 2))
plt.xlabel('Max. Depth')
plt.ylabel('Prediction Error (1 - Accuracy)')
plt.legend(['Training', 'Validation'])

In [None]:
model = DecisionTreeClassifier(max_depth=7, random_state=42).fit(X_train, train_targets)
model.score(X_val, val_targets)

In [None]:
model = DecisionTreeClassifier(max_leaf_nodes=128, random_state=42)

In [None]:
model.fit(X_train, train_targets)

In [None]:
model.score(X_train, train_targets)

In [None]:
model.score(X_val, val_targets)

In [None]:
model.tree_.max_depth

In [None]:
model_text = export_text(model, feature_names=list(X_train.columns))
print(model_text[:3000])

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
model = RandomForestClassifier(n_jobs=-1, random_state=42)

In [None]:
%%time
model.fit(X_train, train_targets)

In [None]:
model.score(X_train, train_targets)

In [None]:
model.score(X_val, val_targets)

In [None]:
train_probs = model.predict_proba(X_train)
train_probs

In [None]:
model.estimators_[0]

In [None]:
plt.figure(figsize=(80, 20))
# Feature and class names are passed as plain lists: some scikit-learn releases
# reject a pandas Index / NumPy array for these parameters.
plot_tree(model.estimators_[0],
          max_depth=2,
          feature_names=list(X_train.columns),
          filled=True,
          rounded=True,
          class_names=list(model.classes_));

In [None]:
plt.figure(figsize=(80, 20))
# Feature and class names are passed as plain lists: some scikit-learn releases
# reject a pandas Index / NumPy array for these parameters.
plot_tree(model.estimators_[20],
          max_depth=2,
          feature_names=list(X_train.columns),
          filled=True,
          rounded=True,
          class_names=list(model.classes_));

In [None]:
len(model.estimators_)

In [None]:
importance_df = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

In [None]:
importance_df.head(10)

In [None]:
plt.title('Feature Importance')
sns.barplot(data=importance_df.head(10), x='importance', y='feature');

## Hyperparameter Tuning with Random Forests



In [None]:
?RandomForestClassifier

In [None]:
base_model = RandomForestClassifier(random_state=42, n_jobs=-1).fit(X_train, train_targets)

In [None]:
base_train_acc = base_model.score(X_train, train_targets)
base_val_acc = base_model.score(X_val, val_targets)

In [None]:
base_accs = base_train_acc, base_val_acc
base_accs

In [None]:
model = RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=10)

In [None]:
model.fit(X_train, train_targets)

In [None]:
model.score(X_train, train_targets), model.score(X_val, val_targets)

In [None]:
base_accs

In [None]:
model = RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=500)
model.fit(X_train, train_targets)

In [None]:
model.score(X_train, train_targets)

In [None]:
model.score(X_val, val_targets)

In [None]:
base_accs

In [None]:
def test_params(**params):
    """Fit a random forest with the given hyperparameters and score it.

    The forest is always created with ``random_state=42`` and ``n_jobs=-1``;
    any keyword arguments are forwarded to ``RandomForestClassifier``. It is
    trained on the notebook-level ``X_train`` / ``train_targets``.

    Returns:
        A ``(training score, validation score)`` tuple from ``model.score``.
    """
    forest = RandomForestClassifier(random_state=42, n_jobs=-1, **params)
    forest.fit(X_train, train_targets)
    train_score = forest.score(X_train, train_targets)
    val_score = forest.score(X_val, val_targets)
    return train_score, val_score

In [None]:
test_params(max_depth=5)

In [None]:
test_params(max_depth=26)

In [None]:
test_params(max_leaf_nodes=2**5)

In [None]:
test_params(max_leaf_nodes=2**20)

In [None]:
base_accs # no max depth or max leaf nodes

In [None]:
test_params(max_features='log2')

In [None]:
test_params(max_features=3)

In [None]:
test_params(max_features=6)

In [None]:
base_accs

In [None]:
test_params(min_samples_split=3, min_samples_leaf=2)

In [None]:
test_params(min_samples_split=100, min_samples_leaf=60)

In [None]:
base_accs

In [None]:
test_params(min_impurity_decrease=1e-7)

In [None]:
test_params(min_impurity_decrease=1e-2)

In [None]:
base_accs

In [None]:
test_params(bootstrap=False)

In [None]:
base_accs

In [None]:
test_params(max_samples=0.9)

In [None]:
base_accs

### `class_weight`

In [None]:
model.classes_

In [None]:
test_params(class_weight='balanced')

In [None]:
test_params(class_weight={'No': 1, 'Yes': 2})

In [None]:
base_accs

In [None]:
model = RandomForestClassifier(n_jobs=-1, 
                               random_state=42, 
                               n_estimators=500,
                               max_features=7,
                               max_depth=30, 
                               class_weight={'No': 1, 'Yes': 1.5})

In [None]:
model.fit(X_train, train_targets)

In [None]:
model.score(X_train, train_targets), model.score(X_val, val_targets)

In [None]:
base_accs

In [None]:
raw_df

In [None]:
model.score(X_test, test_targets)

In [None]:
def predict_input(model, single_input):
    """Predict the target for one raw observation.

    The observation is run through the notebook-level ``imputer``, ``scaler``
    and ``encoder`` (in that order) before being passed to ``model``.

    Args:
        model: Fitted classifier exposing ``predict``, ``predict_proba`` and
            ``classes_``.
        single_input: Mapping of column name to value for a single row; it
            must contain every column in ``numeric_cols`` and
            ``categorical_cols``.

    Returns:
        A ``(prediction, probability)`` tuple, where ``probability`` is the
        model's predicted probability for the returned class.
    """
    row = pd.DataFrame([single_input])

    # Numeric columns: fill missing values first, then scale.
    row[numeric_cols] = imputer.transform(row[numeric_cols])
    row[numeric_cols] = scaler.transform(row[numeric_cols])

    # Categorical columns: expand into the one-hot columns.
    row[encoded_cols] = encoder.transform(row[categorical_cols])

    features = row[numeric_cols + encoded_cols]
    pred = model.predict(features)[0]

    # Look up the probability column that corresponds to the predicted class.
    class_position = list(model.classes_).index(pred)
    prob = model.predict_proba(features)[0][class_position]
    return pred, prob

In [None]:
new_input = {'Date': '2021-06-19',
             'Location': 'Launceston',
             'MinTemp': 23.2,
             'MaxTemp': 33.2,
             'Rainfall': 10.2,
             'Evaporation': 4.2,
             'Sunshine': np.nan,
             'WindGustDir': 'NNW',
             'WindGustSpeed': 52.0,
             'WindDir9am': 'NW',
             'WindDir3pm': 'NNE',
             'WindSpeed9am': 13.0,
             'WindSpeed3pm': 20.0,
             'Humidity9am': 89.0,
             'Humidity3pm': 58.0,
             'Pressure9am': 1004.8,
             'Pressure3pm': 1001.5,
             'Cloud9am': 8.0,
             'Cloud3pm': 5.0,
             'Temp9am': 25.7,
             'Temp3pm': 33.0,
             'RainToday': 'Yes'}

In [None]:
predict_input(model, new_input)

In [None]:
raw_df.Location.unique()

In [None]:
import joblib

In [None]:
aussie_rain = {
    'model': model,
    'imputer': imputer,
    'scaler': scaler,
    'encoder': encoder,
    'input_cols': input_cols,
    'target_col': target_col,
    'numeric_cols': numeric_cols,
    'categorical_cols': categorical_cols,
    'encoded_cols': encoded_cols
}

In [None]:
joblib.dump(aussie_rain, 'aussie_rain.joblib')

In [None]:
aussie_rain2 = joblib.load('aussie_rain.joblib')

In [None]:
test_preds2 = aussie_rain2['model'].predict(X_test)
accuracy_score(test_targets, test_preds2)