In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the Kaggle input directory,
# then print them one per line (same order as os.walk yields them).
input_files = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load the dataset and explore

In [None]:
# Read the weatherAUS CSV; the first column of the file becomes the row index.
train = pd.read_csv(
    '../input/weather-dataset-rattle-package/weatherAUS.csv',
    index_col=0,
)

In [None]:
# Structural overview, printed in order: column dtypes, (rows, columns),
# and summary statistics for every column regardless of dtype.
for overview in (train.dtypes, train.shape, train.describe(include='all')):
    print(overview)

In [None]:
# Fraction of rows whose target value is missing.
target_missing = train['RainTomorrow'].isna()
print(target_missing.sum()/len(train))

# Rows without a target cannot be used for supervised learning: keep only labelled rows.
train = train[~target_missing]

In [None]:

# (rows, columns) of `train` at this point, i.e. after the rows with a missing target were dropped.
print(train.shape)

# Divide the dataset into X and y

In [None]:
# Features: every column except the target.
X = train.drop(columns=['RainTomorrow'])

# Target: selected with a list of labels, so `y` is a one-column DataFrame rather than a Series.
y = train[['RainTomorrow']]

In [None]:
# (rows, columns) of the feature frame.
print(X.shape)

# Let us perform EDA on the dataset

In [None]:
#Let us define few functions for visualization

import seaborn as sns
import matplotlib.pyplot as plt

def boxplot(df, title='Boxplot'):
    """Draw a box plot of one variable on a new 10x6 inch figure and show it.

    Parameters
    ----------
    df : one-dimensional data
        The values to summarise (the notebook passes a single DataFrame column).
    title : str, optional
        Figure title; defaults to the original fixed title 'Boxplot'.
    """
    plt.figure(figsize=(10,6))
    plt.title(title)
    # Pass the values by keyword: depending on the seaborn release, a bare
    # positional argument is interpreted as `x` or as `data`, so `x=` keeps
    # the plot the same everywhere.
    sns.boxplot(x=df)
    plt.show()

def histogram(df, title='Histogram'):
    """Draw a histogram with a KDE overlay for one variable and show it.

    Parameters
    ----------
    df : one-dimensional data
        The values whose distribution is plotted (the notebook passes a
        single DataFrame column).
    title : str, optional
        Figure title; defaults to the original fixed title 'Histogram'.
    """
    # sns.displot is a figure-level function: it creates its own figure, so a
    # preceding plt.figure(figsize=...) would only leave an empty figure behind.
    # height/aspect give the intended 10x6 inch size (height 6, width 6 * 10/6).
    sns.displot(df, kde=True, height=6, aspect=10/6)
    plt.title(title)
    sns.despine()
    # The original referenced `plt.show` without calling it, so the figure was
    # never explicitly shown.
    plt.show()

def countplot(df, title='Countplot'):
    """Draw a bar chart of category frequencies on a new 10x6 inch figure and show it.

    Parameters
    ----------
    df : one-dimensional categorical data
        The values to count (the notebook passes a single DataFrame column).
    title : str, optional
        Figure title; defaults to the original fixed title 'Countplot'.
    """
    plt.figure(figsize=(10,6))
    # Pass the values by keyword: depending on the seaborn release, a bare
    # positional argument is interpreted as `x` or as `data`; `x=` makes
    # seaborn count the categories of this variable in every release.
    sns.countplot(x=df,palette='spring')
    plt.title(title)
    plt.show()
    

In [None]:
# dtype of every feature column; used below to tell numeric columns from non-numeric ones.
print(X.dtypes)

In [None]:
# Box plot of every numeric feature

# Column labels whose dtype is numeric, first as an Index and then as a plain list.
num_col = X.select_dtypes(include=[np.number]).columns
num_cols = num_col.tolist()

for n in num_cols:
    boxplot(X[n])

From the boxplots we can see that only 4 columns have no outliers: Sunshine, WindSpeed3pm, Cloud9am and Cloud3pm. In the next step we will visualize the distribution of the variables.

In [None]:
# Distribution (histogram + KDE) of every numeric feature, one figure per column.
for n, column_values in X[num_cols].items():
    histogram(column_values)

In [None]:
# Names of the numeric feature columns collected earlier.
print(num_cols)

The columns ['Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Cloud9am', 'Cloud3pm'] are not symmetric in nature. We'll apply preprocessing later. We won't be too concerned about outliers, since we'll use a boosting learner.

In [None]:
# Frequency bar chart of every non-numeric feature

# Column labels whose dtype is not numeric, first as an Index and then as a plain list.
cat_col = X.select_dtypes(exclude=[np.number]).columns
cat_cols = cat_col.tolist()

for c in cat_cols:
    countplot(X[c])


We see that the RainToday column is imbalanced.

# Check for missing values


In [None]:
# Fraction of missing values per column, reported separately for the
# numeric and the non-numeric features.
n_rows = len(X)
numeric_features = X.select_dtypes(include=[np.number])
other_features = X.select_dtypes(exclude=[np.number])

print(numeric_features.isna().sum()/n_rows)
print('************************************************')
print(other_features.isna().sum()/n_rows)

Evaporation and Cloud3pm have 43 and 40 percent missing values, respectively. Sunshine also has 48 percent missing values and Cloud9am has 37 percent missing values. We'll impute these values later. Which imputation strategy to use depends on whether or not outliers exist for that variable.

# Check whether the dependent variable is balanced or not

In [None]:
# Class counts of the target, followed by the same information as a bar chart.
print(y.value_counts())
countplot(y['RainTomorrow'])

The dataset is highly imbalanced, so we'll split the data in a stratified manner.

# Perform Train Test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for validation. Stratifying on the target keeps the
# class proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=.25,
    stratify=y,
    random_state=99,
)

In [None]:
# Shapes of the two feature partitions, then the class counts of the training target.
print(X_train.shape, X_valid.shape)
print(y_train.value_counts())

Clearly the stratification is maintained here also.

# Apply preprocessing 

In [None]:
#Apply preprocessing to the numeric variables

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import QuantileTransformer

# train_test_split returns row-subsets of X. Work on explicit copies so the
# column assignments below act on independent frames (assigning into such
# subsets can trigger pandas' SettingWithCopyWarning).
X_train = X_train.copy()
X_valid = X_valid.copy()

for c in num_cols:
    # Step 1: fill missing values with the median of the *training* column;
    # the validation column reuses that same fitted median.
    im_n = SimpleImputer(strategy='median')
    train_values = im_n.fit_transform(X_train[[c]])
    valid_values = im_n.transform(X_valid[[c]])

    # Step 2: map the imputed values onto a normal distribution. The mapping
    # is fitted on the training column only and then applied to validation.
    qt = QuantileTransformer(output_distribution='normal', random_state=99)
    # The transformers return (n_rows, 1) arrays; flatten before storing as a column.
    X_train[c] = qt.fit_transform(train_values).ravel()
    X_valid[c] = qt.transform(valid_values).ravel()

# Remaining missing values per column (the numeric columns should now show 0).
print(X_train.isna().sum())

Clearly the missing values in the numeric columns have been imputed with the median strategy.

In [None]:
#Preprocessing for categorical variables
from sklearn.preprocessing import OrdinalEncoder

# Work on explicit copies so the column assignments below act on independent
# frames (assigning into row-subsets produced by train_test_split can trigger
# pandas' SettingWithCopyWarning).
X_train = X_train.copy()
X_valid = X_valid.copy()

for c in cat_cols:
    # Step 1: fill missing values with the most frequent category of the
    # *training* column; the validation column reuses that fitted value.
    im_c = SimpleImputer(strategy='most_frequent')
    train_values = im_c.fit_transform(X_train[[c]])
    valid_values = im_c.transform(X_valid[[c]])

    # Step 2: replace each category by an integer code learned from the
    # training column. NOTE(review): with the encoder's default settings a
    # category present only in the validation data makes transform() raise.
    oe = OrdinalEncoder(dtype=int)
    # The transformers return (n_rows, 1) arrays; flatten before storing as a column.
    X_train[c] = oe.fit_transform(train_values).ravel()
    X_valid[c] = oe.transform(valid_values).ravel()

In [None]:
# First six rows of the training features after imputation, transformation and encoding.
print(X_train.head(n=6))

All the preprocessing is completed without any data leakage

# Modeling using CatBoostClassifier

In [None]:
from catboost import CatBoostClassifier,Pool

# Positional indices (within X_train's columns) of the categorical features.
cat_features = [X_train.columns.get_loc(col) for col in cat_cols]
print(cat_features)

# Wrap features, labels and the categorical-column indices for CatBoost.
train_x = Pool(data=X_train, label=y_train, cat_features=cat_features)
valid_x = Pool(data=X_valid, label=y_valid, cat_features=cat_features)

params = dict(
    loss_function='Logloss',   # objective being optimised
    learning_rate=0.03,
    depth=7,
    n_estimators=10000,        # upper bound on the number of trees
    eval_metric='AUC',         # metric reported on the evaluation set
    od_type='Iter',            # overfitting detector, iteration-based ...
    od_wait=1000,              # ... with a patience of 1000 iterations
    verbose=200,               # log every 200 iterations
    one_hot_max_size=0,
    class_weights=(1,3.4),     # presumably offsets the class imbalance; confirm the ratio
    random_state=99,
)

# Fit the CatBoost classifier; the validation pool is used as the evaluation set
# and the iteration that scores best on it is the one kept.
cb = CatBoostClassifier(**params)
cb.fit(train_x, eval_set=valid_x, use_best_model=True, plot=True)

# A simple CatBoost model performs very well on the validation data.