In [None]:
from collections import Counter

import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    mean_absolute_error,
    mean_squared_error,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight

In [None]:
w_df = pd.read_csv('../data/weatherAUS.csv', delimiter=',')

In [None]:
w_df

In [None]:
w_df.info()

In [None]:
def missing_value(df):
    """Summarise the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one null, with two columns:
        ``Total`` (number of nulls) and ``Percentage`` (nulls as a share of
        the row count, in percent). Rows are ordered from most to fewest
        nulls; columns without nulls are left out.
    """
    null_counts = df.isnull().sum()

    # Absolute counts, largest first, restricted to columns with gaps.
    totals = null_counts.sort_values(ascending=False)
    totals = totals[totals > 0]

    # The same counts expressed as a percentage of the number of rows.
    shares = null_counts * 100 / len(df)
    shares = shares[shares > 0].sort_values(ascending=False)

    return pd.concat([totals, shares], axis=1, keys=["Total", "Percentage"])

pd.set_option('display.max_rows', 100)
missing_value(w_df)

In [None]:
x = w_df.copy().drop(["Sunshine", "Evaporation", "Cloud3pm", "Cloud9am"], axis = 1)

In [None]:
missing_value(x)

In [None]:
mean = x["Pressure9am"].mean()
x["Pressure9am"] = x["Pressure9am"].fillna(mean)

In [None]:
mean = x["Pressure3pm"].mean()
x["Pressure3pm"] = x["Pressure3pm"].fillna(mean)

In [None]:
mode = x["WindDir9am"].value_counts().index[0]
x["WindDir9am"] = x["WindDir9am"].fillna(mode)

In [None]:
mode = x["WindGustDir"].value_counts().index[0]
x["WindGustDir"] = x["WindGustDir"].fillna(mode)

In [None]:
mean = x["WindGustSpeed"].mean()
x["WindGustSpeed"] = x["WindGustSpeed"].fillna(mean)

In [None]:
mode = x["WindDir3pm"].value_counts().index[0]
x["WindDir3pm"] = x["WindDir3pm"].fillna(mode)

In [None]:
mean = x["Humidity3pm"].mean()
x["Humidity3pm"] = x["Humidity3pm"].fillna(mean)

In [None]:
mean = x["Temp3pm"].mean()
x["Temp3pm"] = x["Temp3pm"].fillna(mean)

In [None]:
mean = x["WindSpeed3pm"].mean()
x["WindSpeed3pm"] = x["WindSpeed3pm"].fillna(mean)

In [None]:
mean = x["Humidity9am"].mean()
x["Humidity9am"] = x["Humidity9am"].fillna(mean)

In [None]:
mode = x["RainToday"].value_counts().index[0]
x["RainToday"] = x["RainToday"].fillna(mode)

In [None]:
mean = x["Rainfall"].mean()
x["Rainfall"] = x["Rainfall"].fillna(mean)

In [None]:
mean = x["WindSpeed9am"].mean()
x["WindSpeed9am"] = x["WindSpeed9am"].fillna(mean)

In [None]:
mean = x["Temp9am"].mean()
x["Temp9am"] = x["Temp9am"].fillna(mean)

In [None]:
mean = x["MinTemp"].mean()
x["MinTemp"] = x["MinTemp"].fillna(mean)

In [None]:
mean = x["MaxTemp"].mean()
x["MaxTemp"] = x["MaxTemp"].fillna(mean)

In [None]:
missing_value(x)

In [None]:
x.info()

In [None]:
x = x.drop("Date", axis = 1)

In [None]:
x.info()

In [None]:
# Correlation heatmap of the raw frame. The frame is restricted to numeric
# columns first: since pandas 2.0 DataFrame.corr() no longer drops
# non-numeric columns silently and raises on string data instead.
sns.heatmap(w_df.select_dtypes(include="number").corr())

In [None]:
x["Location"].unique()

In [None]:
x = x.drop("Location", axis = 1)

In [None]:
x["WindGustDir"].unique()

In [None]:
level_map = {'W' : 1, 'WNW' : 2, 'WSW' : 3, 'NE' : 4, 'NNW' : 5, 'N' : 6, 'NNE' : 7, 'SW' : 8, 'ENE' : 9, 'SSE' : 10, 'S' : 11, 'NW' : 12, 'SE' : 13, 'ESE' : 14, 'E' : 15, 'SSW' : 16}
x['WindGustDir'] = x['WindGustDir'].map(level_map)

In [None]:
x["WindDir9am"].unique()

In [None]:
level_map = {'W' : 1, 'WNW' : 2, 'WSW' : 3, 'NE' : 4, 'NNW' : 5, 'N' : 6, 'NNE' : 7, 'SW' : 8, 'ENE' : 9, 'SSE' : 10, 'S' : 11, 'NW' : 12, 'SE' : 13, 'ESE' : 14, 'E' : 15, 'SSW' : 16}
x['WindDir9am'] = x['WindDir9am'].map(level_map)

In [None]:
x["WindDir3pm"].unique()

In [None]:
level_map = {'W' : 1, 'WNW' : 2, 'WSW' : 3, 'NE' : 4, 'NNW' : 5, 'N' : 6, 'NNE' : 7, 'SW' : 8, 'ENE' : 9, 'SSE' : 10, 'S' : 11, 'NW' : 12, 'SE' : 13, 'ESE' : 14, 'E' : 15, 'SSW' : 16}
x['WindDir3pm'] = x['WindDir3pm'].map(level_map)

In [None]:
x["WindDir3pm"].unique()

In [None]:
x["RainToday"].unique()

In [None]:
level_map = {'No' : 1, 'Yes' : 0}
x['RainToday'] = x['RainToday'].map(level_map)

In [None]:
x["RainTomorrow"].unique()

In [None]:
level_map = {'No' : 1, 'Yes' : 0}
x['RainTomorrow'] = x['RainTomorrow'].map(level_map)

In [None]:
x.isna().any().unique()

In [None]:
def print_classification_model_metrics(estimator, y_test, y_pred, X_eval=None):
    """Print the confusion matrix, classification report and a score.

    The function only uses its own arguments; it does not read any notebook
    global.

    Parameters
    ----------
    estimator : object
        Fitted model. Only used when ``X_eval`` is given, in which case it
        must expose ``score(X, y)``.
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.
    X_eval : array-like, optional
        Features the labels in ``y_test`` belong to. When given, the last
        line printed is ``estimator.score(X_eval, y_test)``. When omitted,
        it is the accuracy of ``y_pred`` against ``y_test``.
    """
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    if X_eval is None:
        print(accuracy_score(y_test, y_pred))
    else:
        print(estimator.score(X_eval, y_test))

In [None]:
x

In [None]:
x['RainTomorrow'].unique()

In [None]:
y = x.copy().iloc[:, 17].values
X = x.copy().iloc[:, :-1].values

In [None]:
# Stratified 80/20 split. The seed is fixed so that the split, and therefore
# the metrics reported below, are the same on every run.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=RANDOM_STATE
)
# Class counts of each split.
print(Counter(y_train).values())
print(Counter(y_test).values())

In [None]:
cbc = CatBoostClassifier(learning_rate=0.2).fit(X_train, y_train)
print_classification_model_metrics(cbc, y_test, cbc.predict(X_test))

In [None]:
cbc.save_model('../models/cbc')