# Introduction
* This dataset contains about 10 years of daily weather observations from many locations across Australia.
* `RainTomorrow` is the target variable to predict: did it rain the next day (Yes or No)? The column is Yes if the rainfall for that day was 1mm or more.

<font color='blue'>
Content:

1. [Import Libraries](#1)  
2. [Import Dataset](#2)
3. [Exploratory data analysis](#3)
4. [Variable Analysis](#4)
    * [Categorical Variable](#5)
    * [Numeric Variable](#6)
5. [Missing Value](#7)
6. [Outlier Detection](#8)
7. [Feature Engineering](#9)
8. [Feature Selection](#10)
9. [ML Model](#11)

<a id="1"></a>
# Import Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("fivethirtyeight")
import plotly.express as px
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import warnings
warnings.filterwarnings('ignore')
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, name) for name in names_in_folder]
    for full_path in full_paths:
        print(full_path)
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a id="2"></a>
# Import Dataset

In [None]:
data = pd.read_csv("/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv")

<a id="3"></a>
# Exploratory Data Analysis

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data.info()

In [None]:
data.describe().T

<a id="4"></a>
# Variable Analysis

<a id="5"></a>
## Categorical Variable

In [None]:
data.dropna(subset = ["RainTomorrow"], inplace = True)

In [None]:
# Encode the target as 0/1. The result is assigned back to the column rather
# than calling replace(inplace=True) on the selected column: the in-place form
# mutates an intermediate object and is not guaranteed to update `data`.
data["RainTomorrow"] = data["RainTomorrow"].replace({'No': 0, 'Yes': 1})

In [None]:
data["RainTomorrow"] = data.RainTomorrow.astype("int64")

In [None]:
def categorical(columns):
    """Plot the distribution of one categorical column of ``data``.

    Draws a two-panel figure: a pie chart of the value shares on the left and
    a count plot split by ``RainTomorrow`` on the right.

    Parameters
    ----------
    columns : str
        Name of the column of the module-level ``data`` frame to plot.
    """
    # Count once and reuse the result for both the wedge sizes and the labels.
    counts = data[columns].value_counts()

    plt.figure(figsize=(20, 7))
    plt.subplot(1, 2, 1)
    plt.pie(x=counts, labels=counts.index, autopct="%.1f%%", pctdistance=0.7)
    plt.subplot(1, 2, 2)
    sns.countplot(x=columns, data=data, hue="RainTomorrow")
    # Render explicitly, matching the behaviour of `numeric`.
    plt.show()

### WindGustDir

In [None]:
categorical("WindGustDir")

### WindDir9am

In [None]:
categorical("WindDir9am")

### WindDir3pm

In [None]:
categorical("WindDir3pm")

### RainToday

In [None]:
categorical("RainToday")

<a id="6"></a>
## Numeric Variable

In [None]:
# Correlate the numeric columns only: at this point the frame still holds text
# columns (dates, locations, wind directions), which DataFrame.corr cannot
# process and which make it raise on recent pandas versions.
corr_matrix = data.select_dtypes(include="number").corr()
sns.clustermap(corr_matrix, annot = True, fmt = ".2f")
plt.title("Correlation between Features")
plt.show()

In [None]:
def numeric(columns, discrete = False):
    """Plot one numeric column of ``data`` against ``RainTomorrow``.

    Parameters
    ----------
    columns : str
        Name of the column of the module-level ``data`` frame to plot.
    discrete : bool, default False
        When true, draw a count plot of the column split by ``RainTomorrow``;
        otherwise draw a joint plot with a regression fit of ``RainTomorrow``
        on the column.
    """
    if discrete:
        plt.figure(figsize=(15, 7))
        sns.countplot(x=columns, data=data, hue="RainTomorrow")
    else:
        # jointplot creates its own figure, so no plt.figure() call here:
        # one would only leave an extra, empty figure behind.
        sns.jointplot(x=columns, y="RainTomorrow", data=data, kind="reg")
    plt.show()

### Cloud3pm

In [None]:
numeric("Cloud3pm", True)

### Cloud9am

In [None]:
numeric("Cloud9am", True)

### Humidity3pm

In [None]:
numeric("Humidity3pm")

### Sunshine

In [None]:
numeric("Sunshine")

### WindGustSpeed

In [None]:
numeric("WindGustSpeed")

### Humidity9am

In [None]:
numeric("Humidity9am")

### Pressure9am

In [None]:
numeric("Pressure9am")

### Pressure3pm

In [None]:
numeric("Pressure3pm")

### Rainfall

In [None]:
numeric("Rainfall")

<a id="7"></a>
# Missing Value

In [None]:
data.isnull().sum().sort_values()

### MaxTemp

In [None]:
maxtemp = list(data[data["MaxTemp"].isnull()].index)

In [None]:
data["Date"] = data.Date.astype('datetime64[ns]')
data['month'] = pd.DatetimeIndex(data['Date']).month
data.groupby(["month"])["MaxTemp"].mean()

In [None]:
# Fill value for a missing MaxTemp, keyed by calendar month.
# NOTE(review): presumably the rounded per-month MaxTemp means - confirm.
max_temp_by_month = {
    1: 30, 2: 29, 3: 27, 4: 24, 5: 20, 6: 17,
    7: 17, 8: 18, 9: 21, 10: 24, 11: 26,
}
# Any month not listed above (i.e. December) falls back to this value.
max_temp_default = 28

# Write all fills in one .loc assignment. Chained indexing such as
# data["MaxTemp"][i] = value assigns through an intermediate object and is not
# guaranteed to reach `data`; it is also very slow row by row.
if maxtemp:
    data.loc[maxtemp, "MaxTemp"] = data.loc[maxtemp, "month"].map(
        lambda month: max_temp_by_month.get(month, max_temp_default)
    )

### MinTemp

In [None]:
mintemp = list(data[data["MinTemp"].isnull()].index)

In [None]:
data.groupby(["month"])["MinTemp"].mean()

In [None]:
# Fill value for a missing MinTemp, keyed by calendar month.
# NOTE(review): presumably the rounded per-month MinTemp means - confirm.
min_temp_by_month = {
    1: 18, 2: 18, 3: 16, 4: 13, 5: 10, 6: 8,
    7: 7, 8: 7, 9: 9, 10: 12, 11: 14,
}
# Any month not listed above (i.e. December) falls back to this value.
min_temp_default = 16

# Write all fills in one .loc assignment. Chained indexing such as
# data["MinTemp"][i] = value assigns through an intermediate object and is not
# guaranteed to reach `data`; it is also very slow row by row.
if mintemp:
    data.loc[mintemp, "MinTemp"] = data.loc[mintemp, "month"].map(
        lambda month: min_temp_by_month.get(month, min_temp_default)
    )

### The other categorical variables

In [None]:
list_ = ["WindGustDir", "RainToday", "WindDir3pm", "WindDir9am", "Cloud9am", "Cloud3pm"]

In [None]:
# Fill the gaps of each listed column with its most frequent value. The filled
# column is assigned back explicitly; fillna(inplace=True) on a selected column
# is a chained operation that is not guaranteed to modify `data`.
for i in list_:
    data[i] = data[i].fillna(data[i].mode()[0])

### The other numeric variables

In [None]:
numeric_variable = data.select_dtypes(["int64", "float64"])

# Fill the remaining gaps of every int64/float64 column with the column mean.
# The filled column is assigned back explicitly; fillna(inplace=True) on a
# selected column is a chained operation that is not guaranteed to modify `data`.
for i in numeric_variable.columns:
    data[i] = data[i].fillna(data[i].mean())

In [None]:
data.isnull().sum()

<a id="8"></a>
# Outlier Detection

In [None]:
from collections import Counter
def outliers(data, columns, threshold=2, whisker=1.5):
    """Return the index labels of rows that are IQR outliers in several columns.

    For every column in ``columns`` a value is flagged when it lies below
    ``Q1 - whisker * IQR`` or above ``Q3 + whisker * IQR``. A row is reported
    when it was flagged in more than ``threshold`` columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to screen.
    columns : iterable of str
        Names of the columns to screen.
    threshold : int, default 2
        A row is returned only if it is flagged in strictly more than this
        many columns.
    whisker : float, default 1.5
        Multiple of the interquartile range that defines the fences.

    Returns
    -------
    list
        Index labels of the reported rows, in the order they were first
        flagged.
    """
    flag_counts = Counter()
    for column in columns:
        values = data[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_fence = q1 - whisker * iqr
        upper_fence = q3 + whisker * iqr
        is_outlier = (values < lower_fence) | (values > upper_fence)
        # One count per column in which the row's value is outside the fences.
        flag_counts.update(values[is_outlier].index)

    return [label for label, count in flag_counts.items() if count > threshold]

In [None]:
data.shape

In [None]:
# Screen only the feature columns for outliers. The 0/1 target must be left out:
# when fewer than a quarter of its rows are 1, both quartiles are 0, the IQR is
# 0, and every positive row would be flagged - biasing removal against the
# rainy class. The helper "month" column is excluded as well.
x = data.select_dtypes(["float64", "int64"])
outlier_columns = x.columns.drop(["RainTomorrow", "month"], errors="ignore")
delete_index = outliers(data, outlier_columns)
data = data.drop(delete_index,axis=0).reset_index(drop=True)

In [None]:
data.shape

<a id="9"></a>
# Feature Engineering

In [None]:
data.drop(["Date", "month", "Location"], axis = 1, inplace = True)

In [None]:
list_ = ["Cloud9am", "Cloud3pm"]

for i in list_:
    data[i] = data[i].astype(str)

In [None]:
data = pd.get_dummies(data)

In [None]:
data.drop(["RainToday_No"], axis = 1, inplace = True)

In [None]:
data.columns

<a id="10"></a>
# Feature Selection

In [None]:
df = data.copy()

In [None]:
from sklearn import preprocessing
# Fit a min-max scaler on the frame and rebuild it from the scaled values,
# keeping the original row index and column names.
scaler = preprocessing.MinMaxScaler()
scaled_values = scaler.fit(df).transform(df)
df = pd.DataFrame(scaled_values, index = df.index, columns = df.columns)
df

In [None]:
from sklearn.feature_selection import SelectKBest, chi2
x = df.drop(["RainTomorrow"], axis = 1)
y = df[['RainTomorrow']]
selector = SelectKBest(chi2, k=20)
selector.fit(x, y)
x_new = selector.transform(x)
print(x.columns[selector.get_support(indices=True)])

In [None]:
columns = x.columns[selector.get_support(indices=True)]

<a id="11"></a>
# Machine Learning Model

In [None]:
data_new = data[columns]
data_new

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

In [None]:
x = data_new.values
y = data.RainTomorrow.values.reshape(-1,1)

In [None]:
# Let's see whether our dataset is balanced or imbalanced.
# The column is passed by keyword: recent seaborn versions do not accept the
# plotted variable as a positional argument.
sns.countplot(x = "RainTomorrow", data = data)
plt.show()

We can see that our dataset is imbalanced. We are going to use the SMOTE technique to deal with the imbalance.

In [None]:
# Oversample with SMOTE to balance the two classes of the target.
# NOTE(review): the resampling runs on the full data set, before the train/test
# split, so synthetic samples also land in the test set and the reported test
# scores are likely optimistic. Consider splitting first and resampling only
# the training portion - TODO confirm the intended evaluation protocol.
smote = SMOTE(random_state = 42)
x_smote, y_smote = smote.fit_resample(x,y)

In [None]:
# Class counts after resampling. The labels are flattened to 1-D (the target
# was built as a column vector) and passed by keyword, since recent seaborn
# versions do not accept the plotted variable positionally.
sns.countplot(x = np.ravel(y_smote))

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_smote, y_smote, test_size = 0.2 ,random_state = 42)

In [None]:
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

### Light GBM Classification

In [None]:
import lightgbm as lgb
lgbc = lgb.LGBMClassifier(random_state = 42)
model = lgbc.fit(x_train, y_train)

In [None]:
model.score(x_train, y_train)

In [None]:
y_head = model.predict(x_test)

In [None]:
print("Test accuracy :", accuracy_score(y_test, y_head))

In [None]:
print(classification_report(y_test, y_head))