In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect every file path below the Kaggle input directory, then print one per line.
input_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
# Read the weatherAUS CSV into `data` and preview its first 13 rows.
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv'
)
data.head(n=13)

In [None]:
# Summary statistics of the freshly loaded frame.
data.describe()

In [None]:
# Column dtypes and non-null counts, to spot columns with missing values.
data.info()

In [None]:
# Missing-value count per column, turned into a frame whose columns are
# 'index' (the column name) and 0 (the count).
n = data.isna().sum().reset_index()
n

In [None]:
# Express each missing-value count (column 0) as a percentage of all rows in `data`.
total_rows = len(data)
n['%'] = n[0].mul(100).div(total_rows)
n

In [None]:
# Encode RainToday as 0.0 / 1.0, leaving missing values as NaN.
# An explicit mapping replaces the chained `.replace(...)` calls, which depend on
# pandas silently downcasting an object column (deprecated since pandas 2.2).
# The numeric keys make the cell safe to re-run on an already-encoded column.
data['RainToday'] = data['RainToday'].map({"No": 0.0, "Yes": 1.0, 0: 0.0, 1: 1.0})
data['RainToday'].head(13)

In [None]:
# Encode the target RainTomorrow as 0.0 / 1.0, leaving missing values as NaN.
# An explicit mapping replaces the chained `.replace(...)` calls, which depend on
# pandas silently downcasting an object column (deprecated since pandas 2.2).
# The numeric keys make the cell safe to re-run on an already-encoded column.
data['RainTomorrow'] = data['RainTomorrow'].map({"No": 0.0, "Yes": 1.0, 0: 0.0, 1: 1.0})
data['RainTomorrow'].head(13)

# Preprocessing

> The plot below shows that the target variable is imbalanced toward class 0 (i.e. "No"). Upsampling should be applied so that both classes are equally represented.

In [None]:
import seaborn as sns

# Histogram of the encoded target to visualise the class balance.
sns.histplot(data=data, x="RainTomorrow")

**Dropping the rows whose output variable is NaN**

In [None]:
# Remove rows whose target (RainTomorrow) is missing.
data = data.dropna(subset=["RainTomorrow"], axis=0)
# Only the last expression of a cell is displayed, so the shape must be printed
# explicitly; as a bare `data.shape` it was silently discarded.
print(data.shape)
# Remaining missing values per column.
data.isna().sum()

In [None]:
# Keep only the object-dtype columns and show their dtypes / non-null counts.
objects_data = data.select_dtypes(include=["object"])
objects_data.info()

In [None]:
# Number of distinct values in each object-dtype column.
objects_data.nunique()
# Mode imputation of the wind-direction columns is done per column further down,
# after the numeric mean fill.

In [None]:
# Number of rows per Location value.
objects_data.Location.value_counts()

**Imputation**

In [None]:
# Drop the leading column by name instead of by position. The positional
# `data.iloc[:, 1:]` removed one more column every time the cell was re-run.
# NOTE(review): assumes the first column of weatherAUS.csv is "Date" - confirm against data.head().
if 'Date' in data.columns:
    data = data.drop(columns=['Date'])
data.head(40)

In [None]:
# Fill NaN values: mean for continuous measurements, mode for categorical or
# discrete ones.

# RainToday is a binary 0/1 flag, so it falls under the "mode" rule; a mean fill
# would produce a fractional value that is neither class.
data['RainToday'] = data['RainToday'].fillna(data['RainToday'].mode()[0]).astype(int)

# numeric_only=True is required: pandas >= 2.0 raises TypeError when asked for the
# mean of the object columns that are still present (Location, wind directions).
# Those columns are left untouched here and handled separately.
data = data.fillna(data.mean(numeric_only=True))
data.head(13)

In [None]:
# Missing values per column after the mean fill.
data.isna().sum()

In [None]:
# Fill each wind-direction column with its own most frequent value.
for wind_col in ('WindGustDir', 'WindDir9am', 'WindDir3pm'):
    most_common = data[wind_col].mode()[0]
    data[wind_col] = data[wind_col].fillna(most_common)

# Number of missing values left in WindDir3pm.
data['WindDir3pm'].isna().sum()

In [None]:
# Missing values per column after the wind-direction columns were mode-filled.
data.isna().sum()

# **Encoding**

In [None]:
from sklearn import preprocessing

# Integer-encode the categorical columns. Each column gets its own fitted encoder,
# stored in `label_encoders`, so every code -> label mapping stays recoverable.
# Refitting a single shared encoder kept only the mapping of the last column.
label_encoders = {}
for column in ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']:
    label_encoder = preprocessing.LabelEncoder()
    data[column] = label_encoder.fit_transform(data[column])
    label_encoders[column] = label_encoder
# After the loop `label_encoder` is the encoder fitted on WindDir3pm.
data.head()

# **Balance the Data using SMOTE**

In [None]:
# Select the target by name rather than "last column". If the column has already
# been removed from `data`, this fails loudly instead of silently picking a feature.
# Rows with a missing RainTomorrow were dropped earlier, so the int cast is safe;
# it gives SMOTE and the classifiers clean 0/1 class labels.
y = data['RainTomorrow'].astype(int)
y.head()

In [None]:
# Remove the target from the feature frame by name. The positional
# `data.iloc[:, :-1]` stripped one more feature on every re-run of the cell;
# errors='ignore' makes a re-run a no-op instead.
data = data.drop(columns=['RainTomorrow'], errors='ignore')
data.head()

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample so both classes are equally represented. A fixed random_state makes
# the synthetic rows, and therefore every downstream score, reproducible.
# NOTE(review): resampling happens before the train/test split, so the test set
# also contains synthetic rows; splitting first and resampling only the training
# part would give an unbiased evaluation.
oversample = SMOTE(random_state=42)
data, y = oversample.fit_resample(data, y)

In [None]:
from collections import Counter
# Class counts of the target after the SMOTE resampling.
c = Counter(y)
c

In [None]:
# Dtypes and non-null counts of the resampled feature frame.
data.info()

# Correlation

In [None]:
import matplotlib.pyplot as plt

# Pairwise correlation of all features, drawn as an annotated 20x20-inch heatmap.
corr = data.corr()
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr, annot=True, fmt='.1f', ax=ax);

**If we look at the above heatmap it is evident that:**

* Temp9am and Temp3pm are highly correlated with MinTemp and MaxTemp
* Humidity9am and Humidity3pm are highly correlated with each other


In [None]:
# Drop the columns found to be highly correlated with others in the heatmap.
# The list is defined once and reused; it was previously duplicated in the drop
# call, leaving `cols` unused. errors='ignore' keeps the cell re-runnable.
cols = ['Temp9am', 'Temp3pm', 'Humidity9am']
data = data.drop(columns=cols, errors='ignore')

In [None]:
# Feature columns that remain after the correlated ones were dropped.
data.columns

# **Model Training**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Elastic-net logistic regression. The `saga` solver only converges quickly when
# the features share a similar scale, and the default max_iter=100 is too low for
# the raw features, so standardise first and raise the iteration cap.
# `clf` is now a Pipeline; it still exposes the same fit / predict API.
clf = make_pipeline(
    StandardScaler(),
    LogisticRegression(
        penalty='elasticnet',
        solver='saga',
        random_state=0,
        l1_ratio=0.4,
        max_iter=1000,
    ),
)
clf.fit(X_train, y_train)

In [None]:
# Predict test-set labels with the logistic-regression model `clf`.
y_pred = clf.predict(X_test)
y_pred

In [None]:
# Hyper-parameter search space handed to RandomizedSearchCV for the XGBoost model.
params = dict(
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5, 2, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    max_depth=[3, 4, 5],
    learning_rate=[0.01, 0.03, 0.05],
    n_estimators=[100, 300, 600, 700],
)

In [None]:
# XGBoost: baseline classifier with library-default hyper-parameters.

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb

# fit() returns the estimator itself, so construction and training can be chained.
xgb_model = xgb.XGBClassifier().fit(X_train, y_train)
xgb_model

In [None]:
# 5-fold stratified splitter; rows are shuffled with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle = True, random_state = 42)

In [None]:
# Randomised hyper-parameter search for XGBoost, scored by ROC AUC.
# Pass the splitter object itself as `cv`. `skf.split(...)` is a one-shot
# generator, whereas the splitter produces the same seeded folds on demand.
random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=params,
    n_iter=5,
    scoring='roc_auc',
    n_jobs=4,
    cv=skf,
    verbose=3,
    random_state=42,
)

# Fits 5 sampled configurations on each of the 5 folds; this is the slow step.
random_search.fit(X_train, y_train)


In [None]:
# Only the last expression of a cell is displayed; the bare `cv_results_`
# expression was silently discarded. Print the winning parameters and show the
# per-candidate results as a table, best rank first.
print(random_search.best_params_)
pd.DataFrame(random_search.cv_results_).sort_values('rank_test_score').head()

In [None]:
# Predict test-set labels through the finished search object.
# NOTE(review): presumably this uses the refit best estimator (default refit=True) - confirm.
xgb_random_pred = random_search.predict(X_test)
xgb_random_pred

In [None]:
# Predict test-set labels with the default-parameter XGBoost model.
xgb_pred = xgb_model.predict(X_test)
xgb_pred

# **Accuracy & Error Metrics**

In [None]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import precision_recall_fscore_support as score

# Weighted-average precision, recall and F-score of the logistic-regression predictions.
# NOTE(review): per the scikit-learn docs, `support` is None whenever `average` is set - confirm.
precision, recall, fscore, support = score(
    y_test,
    y_pred,
    average='weighted',
)

In [None]:
# Accuracy of the logistic-regression predictions. Arguments now follow the
# documented (y_true, y_pred) order; the value is unchanged because accuracy is symmetric.
accuracy_score(y_test, y_pred)

In [None]:
# Collect the logistic-regression metrics in one dict for display.
lg = dict(
    precision=precision,
    recall=recall,
    fscore=fscore,
    support=support,
)
lg

In [None]:
# Weighted-average precision, recall and F-score of the default XGBoost predictions.
xgb_precision, xgb_recall, xgb_fscore, xgb_support = score(y_test,xgb_pred,average='weighted')

In [None]:
# Accuracy of the default XGBoost predictions. Arguments now follow the
# documented (y_true, y_pred) order; the value is unchanged because accuracy is symmetric.
accuracy_score(y_test, xgb_pred)

In [None]:
# Collect the default-XGBoost metrics in one dict for display.
xgb_ = dict(
    precision=xgb_precision,
    recall=xgb_recall,
    fscore=xgb_fscore,
    support=xgb_support,
)
xgb_

In [None]:
# Weighted-average precision, recall and F-score of the randomised-search XGBoost predictions.
rs_precision, rs_recall, rs_fscore, rs_support = score(y_test,xgb_random_pred,average='weighted')

In [None]:
# Accuracy of the randomised-search XGBoost predictions. Arguments now follow the
# documented (y_true, y_pred) order; the value is unchanged because accuracy is symmetric.
accuracy_score(y_test, xgb_random_pred)

In [None]:
# Collect the randomised-search XGBoost metrics in one dict for display.
rs_ = dict(
    precision=rs_precision,
    recall=rs_recall,
    fscore=rs_fscore,
    support=rs_support,
)
rs_