In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))
        
import warnings
warnings.filterwarnings("ignore")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**DATA PREPROCESSING**

In [None]:
# Load the weatherAUS CSV from the Kaggle input directory into a DataFrame.
df=pd.read_csv('/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv')

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Preview the last rows of the loaded data.
df.tail()

In [None]:
# (number of rows, number of columns) of the loaded data.
df.shape

In [None]:
# Print the per-column summary (non-null counts and dtypes).
df.info()

In [None]:
# Show every field of the first row (position 0) to inspect one complete record.
df.iloc[0,:]

In [None]:
# Count the missing values in each column.
df.isna().sum()

**Dropping the columns that have too many null values**

Drop the columns in which more than 10% of the values are missing.

In [None]:
# Share of missing values per column, expressed as a real percentage (0-100)
# and sorted ascending, so the sparsest columns end up last.
# The values are kept numeric instead of being formatted into strings: the sort
# is then a true numeric sort and the result can be compared against a threshold.
col_empty = df.isnull().mean().mul(100).round(2).sort_values()
col_empty

In [None]:
# Remove, in place, the four columns that col_empty ranks last.
# NOTE(review): the count of 4 is hard-coded; confirm it still corresponds to the
# intended "more than 10% missing" rule whenever the input data changes.
df.drop(columns=col_empty.tail(4).index, inplace=True)

In [None]:
# List the column names that remain in df.
df.columns.to_list()

In [None]:
def get_cols(df) -> list:
    '''
    Partition the column names of ``df`` by dtype.

    Returns a two-item list ``[numbers_cols, categorical_cols]``:
    the first item holds the names of columns whose dtype is not ``object``,
    the second the names of columns whose dtype is ``object``.
    Within each list the names keep the order they have in ``df.columns``.
    '''
    non_object_names, object_names = [], []
    for name, dtype in zip(df.columns.tolist(), df.dtypes.tolist()):
        # object dtype is treated as "categorical"; everything else as "number"
        bucket = object_names if dtype == object else non_object_names
        bucket.append(name)
    return [non_object_names, object_names]

# Column names split by dtype: non-object columns first, object columns second.
num_cols, cat_cols = get_cols(df)

In [None]:
# Fill missing values.
from sklearn.impute import SimpleImputer

# Rows without a recorded target cannot be used for supervised learning:
# imputing the label would invent ground truth, so those rows are removed
# before any feature is filled in.
df = df.dropna(subset=['RainTomorrow']).reset_index(drop=True)

imp_mean_num = SimpleImputer(strategy='mean')           # numeric columns -> column mean
imp_mean_cat = SimpleImputer(strategy='most_frequent')  # all other columns -> most frequent value

# SimpleImputer computes its statistic per column, so each group of columns can
# be imputed with a single fit instead of one fit per column.
numeric_to_fill = [col for col in df.columns if col in num_cols]
other_to_fill = [col for col in df.columns if col not in num_cols]

if numeric_to_fill:
    df[numeric_to_fill] = imp_mean_num.fit_transform(df[numeric_to_fill])
if other_to_fill:
    df[other_to_fill] = imp_mean_cat.fit_transform(df[other_to_fill])

# NOTE(review): the imputers are fitted on the whole dataset, i.e. before the
# train/test split, so test rows influence the fill values. Consider fitting
# them on the training portion only.

In [None]:
# Verify the imputation step: count the missing values left in each column.
df.isnull().sum()

In [None]:
# Absolute correlation of each numeric column with the target, weakest first.
# RainTomorrow is only label-encoded further down in the notebook, so at this
# point it is not part of what DataFrame.corr() can return; give it integer
# codes locally instead of indexing the correlation matrix by its name.
target_codes = pd.Series(pd.factorize(df['RainTomorrow'], sort=True)[0], index=df.index)
df.select_dtypes(include='number').corrwith(target_codes).abs().sort_values()

**Features**
* Date-Date of occurrence
* Location-Where does it rain
* MinTemp-The lowest temperature recorded during the day          
* MaxTemp-The highest temperature recorded during the day          
* Rainfall-The numerics of rainfall in scalable format        
* Evaporation-The Evaporation of water in scalable format      
* Sunshine-The sunshine chances         
* WindGustDir-Wind Direction       
* WindGustSpeed-The speed of wind gust    
* WindDir9am-Wind direction during 9.00 am      
* WindDir3pm-Wind direction during 3.00 pm        
* WindSpeed9am-Wind speed during 9.00 am
* WindSpeed3pm-Wind speed during 3.00 pm
* Humidity9am-Humidity during 9.00 am      
* Humidity3pm-Humidity during 3.00 pm       
* Pressure9am-Air Pressure during 9.00 am      
* Pressure3pm-Air Pressure during 3.00 pm      
* Cloud9am-Cloud visibility during 9.00 am
* Cloud3pm-Cloud visibility during 3.00 pm
* Temp9am-Temperature during 9.00 am
* Temp3pm-Temperature during 3.00 pm
* RainToday-chance of rain during Today         
* RainTomorrow-chance of rain during tomorrow      

**Approaching Categorical Features**

Categorical variables (features) can be classified into two major types:

* Nominal
* Ordinal

Nominal variables are variables that have two or more categories which do not have any kind of order associated with them. For example, if gender is classified into two groups, i.e. male and female, it can be considered as a nominal variable.

Ordinal variables on the other hand, have “levels” or categories with a particular order associated with them. For example, an ordinal categorical variable can be a feature with three different levels: low, medium and high. Order is important.

**Feature Selection**

In [None]:
# Pairwise correlations between the numeric columns.
# Non-numeric columns are filtered out explicitly: pandas >= 2.0 no longer drops
# them silently inside DataFrame.corr() and raises on text columns instead.
cor = df.select_dtypes(include='number').corr()

fig, ax = plt.subplots(figsize=(20, 12))
sns.heatmap(cor, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation between numeric features')
plt.show()



**Label Encoding** means converting labels into numeric form so that they become machine-readable. Machine learning algorithms can then work with those labels directly. It is an important pre-processing step for structured datasets in supervised learning.
![](https://ekababisong.org/assets/seminar_IEEE/LabelEncoder.png)  

Label encoding can be done with scikit-learn's `LabelEncoder`, but the missing values in the data have to be imputed first.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Class distribution of the target before it is encoded.
df['RainTomorrow'].value_counts()

In [None]:
# One shared encoder instance. It is re-fitted for every column it is applied
# to, so it only remembers the classes of the column it was fitted on last.
lbl_enc = LabelEncoder()

# Replace the target's labels with integer codes.
# Plain column assignment is used instead of `df.loc[:, col] = ...`: since
# pandas 2.0 the .loc form writes into the existing column, so the integer codes
# would stay stored with dtype=object, which scikit-learn classifiers reject as
# an "unknown" label type.
df['RainTomorrow'] = lbl_enc.fit_transform(df['RainTomorrow'].values)

df['RainTomorrow'].value_counts()

In [None]:
# Frequency of each Location value before encoding.
df['Location'].value_counts()

In [None]:
# Replace the Location labels with integer codes.
# Plain column assignment (not `df.loc[:, col] = ...`) so the column takes the
# integer dtype of the codes; on pandas >= 2.0 the .loc form writes in place and
# leaves the integers stored with dtype=object.
df['Location'] = lbl_enc.fit_transform(df['Location'].values)

df['Location'].value_counts()

In [None]:
# Frequency of each WindGustDir value before encoding.
df['WindGustDir'].value_counts()

In [None]:
# Replace the WindGustDir labels with integer codes.
# Plain column assignment (not `df.loc[:, col] = ...`) so the column takes the
# integer dtype of the codes; on pandas >= 2.0 the .loc form writes in place and
# leaves the integers stored with dtype=object.
df['WindGustDir'] = lbl_enc.fit_transform(df['WindGustDir'].values)

df['WindGustDir'].value_counts()

In [None]:
# Frequency of each WindDir9am value before encoding.
df['WindDir9am'].value_counts()

In [None]:
# Replace the WindDir9am labels with integer codes.
# Plain column assignment (not `df.loc[:, col] = ...`) so the column takes the
# integer dtype of the codes; on pandas >= 2.0 the .loc form writes in place and
# leaves the integers stored with dtype=object.
df['WindDir9am'] = lbl_enc.fit_transform(df['WindDir9am'].values)

df['WindDir9am'].value_counts()

In [None]:
# Frequency of each WindDir3pm value before encoding.
df['WindDir3pm'].value_counts()

In [None]:
# Replace the WindDir3pm labels with integer codes.
# Plain column assignment (not `df.loc[:, col] = ...`) so the column takes the
# integer dtype of the codes; on pandas >= 2.0 the .loc form writes in place and
# leaves the integers stored with dtype=object.
df['WindDir3pm'] = lbl_enc.fit_transform(df['WindDir3pm'].values)

df['WindDir3pm'].value_counts()

In [None]:
# Frequency of each RainToday value before encoding.
df['RainToday'].value_counts()

In [None]:
# Replace the RainToday labels with integer codes.
# Plain column assignment (not `df.loc[:, col] = ...`) so the column takes the
# integer dtype of the codes; on pandas >= 2.0 the .loc form writes in place and
# leaves the integers stored with dtype=object.
df['RainToday'] = lbl_enc.fit_transform(df['RainToday'].values)

df['RainToday'].value_counts()

In [None]:
# Remove the Date column from df in place.
df.drop(columns='Date', inplace=True)

In [None]:
# Preview the first rows after encoding and dropping Date.
df.head()

Now we split the data into train and test

In [None]:
from sklearn.model_selection import train_test_split

# Features: every column except the target.
X = df.drop(columns=['RainTomorrow'])

# Target as a genuine integer column. The explicit cast guards against the
# encoded labels being stored with dtype=object, which scikit-learn classifiers
# reject as an "unknown" label type.
y = df['RainTomorrow'].astype(int)

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# The default cap of 100 iterations is often too low for the lbfgs solver on
# features that have not been scaled, and the resulting ConvergenceWarning is
# hidden by the notebook-wide warnings filter. A higher cap gives the solver
# room to converge.
# NOTE(review): standardising the features would help convergence further.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

# Hard class predictions for the held-out rows, reused by the evaluation cells.
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Confusion matrix

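In the original run, the result showed 35592+4609 correct predictions (the diagonal of the matrix) and 1775+6026 incorrect predictions (the off-diagonal entries). The exact counts will differ if the preprocessing or the model changes.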

In [None]:

# Confusion matrix of the test-set predictions (rows: true class, columns: predicted class).
# The result gets its own name so that the imported confusion_matrix function is
# not overwritten; otherwise running this cell a second time would fail.
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

**Compute precision, recall, F-measure and support**

To quote from Scikit Learn:

The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. The precision is intuitively the ability of the classifier to not label a sample as positive if it is negative.

The recall is the ratio tp / (tp + fn) where tp is the number of true positives and fn the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples.

The F-beta score can be interpreted as a weighted harmonic mean of the precision and recall, where an F-beta score reaches its best value at 1 and worst score at 0.

The F-beta score weights the recall more than the precision by a factor of beta. beta = 1.0 means recall and precision are equally important.

The support is the number of occurrences of each class in y_test.

In [None]:
# Per-class precision, recall, F-score and support for the test-set predictions.
print(classification_report(y_test, y_pred))

**ROC Curve**

The receiver operating characteristic (ROC) curve is another common tool used with binary classifiers. The dashed red diagonal represents the ROC curve of a purely random classifier; a good classifier stays as far away from that line as possible (toward the top-left corner).

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Predicted probability of the positive class (second column of predict_proba).
positive_scores = logreg.predict_proba(X_test)[:, 1]

# The AUC is computed from the same continuous scores that draw the curve.
# Feeding hard 0/1 predictions to roc_auc_score would measure a single
# operating point, so the legend would not be the area under the plotted curve.
logit_roc_auc = roc_auc_score(y_test, positive_scores)
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
ax.plot([0, 1], [0, 1], 'r--')  # reference diagonal of a random classifier
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic')
ax.legend(loc="lower right")
fig.savefig('Log_ROC')
plt.show()