In [1]:
#!pip3 install xgboost

In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as ply
import seaborn as sns
import xgboost as xgb

import matplotlib

from sklearn.linear_model import LassoCV, Ridge, RidgeCV, ElasticNet, LassoLarsCV, LogisticRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from scipy.stats import skew
from scipy.stats.stats import pearsonr
import xgboost as xgb
from xgboost import XGBRegressor
from collections import Counter

import warnings
warnings.filterwarnings('ignore')

In [3]:
#!pip3 install scikit-learn
#!pip3 install xgboost

In [4]:
data = pd.read_csv('data/weatherAUS.csv')
data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [6]:
data.shape

(145460, 23)

## Drop unrelated features

In [7]:
cols_to_drop = ['Date', 'Location', 'RainTomorrow', 'Rainfall']
data.drop(cols_to_drop, axis=1, inplace=True)
data.shape

(145460, 19)

## Filling NANs

In [8]:
missing_props = data.isna().mean(axis=0)
missing_props

MinTemp          0.010209
MaxTemp          0.008669
Evaporation      0.431665
Sunshine         0.480098
WindGustDir      0.070989
WindGustSpeed    0.070555
WindDir9am       0.072639
WindDir3pm       0.029066
WindSpeed9am     0.012148
WindSpeed3pm     0.021050
Humidity9am      0.018246
Humidity3pm      0.030984
Pressure9am      0.103568
Pressure3pm      0.103314
Cloud9am         0.384216
Cloud3pm         0.408071
Temp9am          0.012148
Temp3pm          0.024811
RainToday        0.022419
dtype: float64

In [9]:
missing_props[missing_props < 0.4].index

Index(['MinTemp', 'MaxTemp', 'WindGustDir', 'WindGustSpeed', 'WindDir9am',
       'WindDir3pm', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
       'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Temp9am',
       'Temp3pm', 'RainToday'],
      dtype='object')

In [10]:
data = data[missing_props[missing_props < 0.4].index] # keep the columns that have less than 40% nan

In [11]:
data.isna().mean(axis=0)

MinTemp          0.010209
MaxTemp          0.008669
WindGustDir      0.070989
WindGustSpeed    0.070555
WindDir9am       0.072639
WindDir3pm       0.029066
WindSpeed9am     0.012148
WindSpeed3pm     0.021050
Humidity9am      0.018246
Humidity3pm      0.030984
Pressure9am      0.103568
Pressure3pm      0.103314
Cloud9am         0.384216
Temp9am          0.012148
Temp3pm          0.024811
RainToday        0.022419
dtype: float64

In [12]:
data.head()

Unnamed: 0,MinTemp,MaxTemp,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Temp9am,Temp3pm,RainToday
0,13.4,22.9,W,44.0,W,WNW,20.0,24.0,71.0,22.0,1007.7,1007.1,8.0,16.9,21.8,No
1,7.4,25.1,WNW,44.0,NNW,WSW,4.0,22.0,44.0,25.0,1010.6,1007.8,,17.2,24.3,No
2,12.9,25.7,WSW,46.0,W,WSW,19.0,26.0,38.0,30.0,1007.6,1008.7,,21.0,23.2,No
3,9.2,28.0,NE,24.0,SE,E,11.0,9.0,45.0,16.0,1017.6,1012.8,,18.1,26.5,No
4,17.5,32.3,W,41.0,ENE,NW,7.0,20.0,82.0,33.0,1010.8,1006.0,7.0,17.8,29.7,No


In [13]:
cols = data.columns
for col in cols:
    if data[col].dtypes == 'object':
        data[col] = data[col].fillna(data[col].mode().values[0])
    else:
        data[col] = data[col].fillna(data[col].mean())
data.head()
    

Unnamed: 0,MinTemp,MaxTemp,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Temp9am,Temp3pm,RainToday
0,13.4,22.9,W,44.0,W,WNW,20.0,24.0,71.0,22.0,1007.7,1007.1,8.0,16.9,21.8,No
1,7.4,25.1,WNW,44.0,NNW,WSW,4.0,22.0,44.0,25.0,1010.6,1007.8,4.447461,17.2,24.3,No
2,12.9,25.7,WSW,46.0,W,WSW,19.0,26.0,38.0,30.0,1007.6,1008.7,4.447461,21.0,23.2,No
3,9.2,28.0,NE,24.0,SE,E,11.0,9.0,45.0,16.0,1017.6,1012.8,4.447461,18.1,26.5,No
4,17.5,32.3,W,41.0,ENE,NW,7.0,20.0,82.0,33.0,1010.8,1006.0,7.0,17.8,29.7,No


## Handle categorical vaiables

In [14]:
X = data.drop('RainToday', axis=1)
y = data['RainToday']

In [15]:
categorical_data = X.select_dtypes(include='object')
numerical_data = X.select_dtypes(exclude='object')

In [16]:
categorical_data

Unnamed: 0,WindGustDir,WindDir9am,WindDir3pm
0,W,W,WNW
1,WNW,NNW,WSW
2,WSW,W,WSW
3,NE,SE,E
4,W,ENE,NW
...,...,...,...
145455,E,SE,ENE
145456,NNW,SE,N
145457,N,SE,WNW
145458,SE,SSE,N


In [17]:
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
cat_num_data = ohe.fit_transform(categorical_data)

In [18]:
cat_num_data = pd.DataFrame(cat_num_data)
cat_num_data.columns = ohe.get_feature_names_out()

In [19]:
cat_num_data.head()

Unnamed: 0,WindGustDir_E,WindGustDir_ENE,WindGustDir_ESE,WindGustDir_N,WindGustDir_NE,WindGustDir_NNE,WindGustDir_NNW,WindGustDir_NW,WindGustDir_S,WindGustDir_SE,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
X = pd.concat([numerical_data, cat_num_data], axis=1)

In [21]:
y.replace({'No':0, 'Yes':1}, inplace=True)

In [22]:
y.value_counts()

0    113580
1     31880
Name: RainToday, dtype: int64

In [23]:
X.describe()

Unnamed: 0,MinTemp,MaxTemp,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
count,145460.0,145460.0,145460.0,145460.0,145460.0,145460.0,145460.0,145460.0,145460.0,145460.0,...,145460.0,145460.0,145460.0,145460.0,145460.0,145460.0,145460.0,145460.0,145460.0,145460.0
mean,12.194034,23.221348,40.03523,14.043426,18.662657,68.880831,51.539116,1017.64994,1015.255889,4.447461,...,0.054104,0.059192,0.068239,0.103575,0.064616,0.05607,0.064306,0.069504,0.061006,0.065434
std,6.36575,7.088124,13.118253,8.861059,8.716581,18.854765,20.471189,6.728467,6.663973,2.265604,...,0.226224,0.235984,0.252156,0.304709,0.245847,0.230058,0.245299,0.25431,0.239343,0.247291
min,-8.5,-4.8,6.0,0.0,0.0,0.0,0.0,980.5,977.1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7.7,18.0,31.0,7.0,13.0,57.0,37.0,1013.5,1011.1,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,12.1,22.7,39.0,13.0,18.662657,69.0,51.539116,1017.64994,1015.255889,4.447461,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,16.8,28.2,46.0,19.0,24.0,83.0,65.0,1021.8,1019.4,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,33.9,48.1,135.0,130.0,87.0,100.0,100.0,1041.0,1039.6,9.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [24]:
scale = StandardScaler()
scaled_X = scale.fit_transform(X)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, stratify=y, random_state=1121218)
# stratify keeps the ratio of the classes in both train and test 

In [26]:
Counter(y_train) 

Counter({0: 85185, 1: 23910})

In [27]:
Counter(y_test)

Counter({0: 28395, 1: 7970})

In [28]:
xgb_clf = xgb.XGBClassifier()
xgb_clf.fit(X_train, y_train)
preds = xgb_clf.predict(X_test)
print(f"Accuracy is: {accuracy_score(preds, y_test)}")
print(f"Classification report is: {classification_report(preds, y_test)}")

Accuracy is: 0.8507080984463082
Classification report is:               precision    recall  f1-score   support

           0       0.94      0.88      0.91     30494
           1       0.53      0.72      0.61      5871

    accuracy                           0.85     36365
   macro avg       0.73      0.80      0.76     36365
weighted avg       0.87      0.85      0.86     36365



In [29]:
import warnings
warnings.filterwarnings('ignore')

param_grid = {
    "max_depth": [3, 4, 5, 7],
    "learning_rate": [0.1, 0.01, 0.05],
    "gamma": [0, 0.25, 1],
    "reg_lambda": [0, 1, 10],
    "scale_pos_weight": [1, 3, 5],
    "subsample": [0.8],
    "colsample_bytree": [0.5],
}
#xgb_cl = xgb.XGBClassifier(objective="binary:logistic")
xgb_cl = xgb.XGBClassifier()

xgb_clf_cv = GridSearchCV(xgb_cl, param_grid, cv=3, scoring='roc_auc').fit(X_train, y_train)



In [31]:
xgb_clf_cv.best_params_

{'colsample_bytree': 0.5,
 'gamma': 1,
 'learning_rate': 0.1,
 'max_depth': 7,
 'reg_lambda': 10,
 'scale_pos_weight': 5,
 'subsample': 0.8}

In [32]:
xgb_clf_cv.best_score_

0.8753347828318748

In [41]:
final_xgb_cl = xgb.XGBClassifier(
    **xgb_clf_cv.best_params_,
    objective="binary:logistic"
)
final_xgb_cl.fit(X_train, y_train)
test_pred = final_xgb_cl.predict_proba(X_test)
roc_auc_score(y_test, test_pred[:,1])



0.8813076860024704