In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load weatherAUS.csv into a DataFrame, parsing the 'Date' column as datetime.
df = pd.read_csv("/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv", parse_dates = ['Date'])
# Show (rows, columns), then preview the first rows.
print(df.shape)
df.head()

In [None]:
# Summary statistics for every column (numeric and non-numeric), one row per column.
df.describe(include = 'all').T

In [None]:
# Column dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Share of missing values per column, expressed as a real percentage (0-100)
# and sorted ascending, so the emptiest columns come last.
# Kept numeric (not formatted strings) so the sort order is by value.
col_empty = df.isnull().mean().mul(100).round(2).sort_values()
col_empty

In [None]:
# Drop the four columns that come last in col_empty, i.e. the ones ranked as
# having the most missing data.
# NOTE(review): the count of 4 is hard-coded; no 10% threshold is actually
# evaluated here - confirm it still matches the data.
df.drop(columns=col_empty.index[-4:].to_list(), inplace=True)

In [None]:
# List the columns that remain after the drop above.
df.columns.to_list()

In [None]:
# Inspect the first row as a column -> value listing.
df.iloc[0,:]

In [None]:
# add new columns 
def get_season(n):
    '''
    Map a month number to a season code.

    Months 12, 1, 2 -> 1; months 3, 4, 5 -> 2; months 6, 7, 8 -> 3;
    any other value -> 4.
    '''
    season_months = (
        (1, (12, 1, 2)),
        (2, (3, 4, 5)),
        (3, (6, 7, 8)),
    )
    for season_code, months in season_months:
        if n in months:
            return season_code
    # Everything that is not listed above (9, 10, 11 and any unexpected value).
    return 4
    

# Calendar features: month number and the season code produced by get_season.
df['month'] = df['Date'].dt.month
df['Season'] = df['month'].map(get_season)

# Daily temperature range.
df['delta_temp'] = df['MaxTemp'].sub(df['MinTemp'])

# Combined features built from the square root of a product of related readings.
# NOTE(review): new_wind multiplies three speeds but still takes a square root
# (a geometric mean of three values would be a cube root) - confirm intended.
df['new_wind'] = np.sqrt(
    df['WindSpeed3pm'].mul(df['WindGustSpeed']).mul(df['WindSpeed9am'])
)
df['new_humidity'] = np.sqrt(df['Humidity9am'].mul(df['Humidity3pm']))
df['new_pressure'] = np.sqrt(df['Pressure9am'].mul(df['Pressure3pm']))
 

In [None]:
# Sanity check: number of columns after adding the derived features.
df.columns.shape

In [None]:
# Count of rows per RainTomorrow value (the column later used as the target).
df[[ 'RainTomorrow']].value_counts()

In [None]:
def get_cols(df) -> list:
    '''
    Split the column names of ``df`` by dtype.

    Returns a two-element list ``[numbers_cols, categorical_cols]``.
    ``categorical_cols`` holds the columns whose dtype is ``object``;
    ``numbers_cols`` holds every other column, so datetime or boolean
    columns end up there as well. Both lists keep the frame's column order.
    '''
    numbers_cols, categorical_cols = [], []
    for name, dtype in df.dtypes.items():
        bucket = categorical_cols if dtype == object else numbers_cols
        bucket.append(name)
    return [numbers_cols, categorical_cols]

# Column-name lists used by the cells below: non-object columns vs. object columns.
num_cols, cat_cols = get_cols(df)

In [None]:
# fill na data
from sklearn.impute import SimpleImputer


# Two imputers: column mean for the columns in num_cols, most frequent value
# for everything else.
imp_mean_num = SimpleImputer(strategy='mean')
imp_mean_cat = SimpleImputer(strategy='most_frequent')

# Each column is imputed on its own; the imputer is re-fit for every column.
# NOTE(review): this loop covers every column, including RainTomorrow (used as
# the target later), and it is fit on the full frame before the train/test split.
for col in df.columns.to_list():
    imputer = imp_mean_num if col in num_cols else imp_mean_cat
    df[col] = imputer.fit_transform(df[[col]])

In [None]:
# Verify the imputation: count of remaining nulls per column.
df.isnull().sum()

In [None]:
from sklearn.preprocessing import LabelEncoder


# Replace the labels of RainToday and RainTomorrow with integer codes.
# The same encoder object is re-fit for each column in turn.
le = LabelEncoder()
for col in ['RainToday', 'RainTomorrow']:
    df[col] = le.fit_transform(df[col])

In [None]:
# Re-check dtypes and non-null counts after imputation and label encoding.
df.info()

In [None]:
# show correlations
import matplotlib.pyplot as plt
import seaborn as sns

# Correlation heatmap of the numeric columns.
# numeric_only=True is required because df still holds object columns here;
# without it, pandas >= 2.0 raises instead of silently skipping them.
plt.figure(figsize = (15,6))
sns.heatmap(df.corr(numeric_only = True), annot = True)


In [None]:
# Print every pair of num_cols columns whose correlation exceeds 0.85.
cols = df[num_cols].columns.to_list()
dt = df[num_cols]

# Compute the correlation matrix once instead of once (twice) per pair.
corr_matrix = dt.corr()

for i, v in enumerate(cols):
    # Only look at columns after v so each pair is reported once and a
    # column is never compared with itself.
    for other in cols[i + 1:]:
        pair_corr = corr_matrix.at[other, v]
        if pair_corr > 0.85:
            print(v, other, pair_corr.round(2))

In [None]:
# Remove these five columns from the frame (in place).
df.drop(
    columns=['Temp9am', 'Temp3pm', 'new_humidity', 'Pressure3pm', 'new_pressure'],
    inplace=True,
)

In [None]:
# Absolute correlation of each numeric column with RainTomorrow, ascending.
# numeric_only=True skips the object columns still present in df, which
# would make corr() raise on pandas >= 2.0.
df.corr(numeric_only = True)['RainTomorrow'].abs().sort_values()

**The new columns correlate well with the target; drop the unnecessary columns (|corr| < 0.1 with the target).**

In [None]:
# Remove these six columns from the frame (in place).
df.drop(
    columns=['month', 'Date', 'Season', 'MinTemp', 'WindSpeed3pm', 'WindSpeed9am'],
    inplace=True,
)

In [None]:
# Preview the frame after the column drops.
df.head()

**Try splitting WindGustDir, WindDir9am and WindDir3pm into one indicator column per compass letter (W, N, S, E)**

In [None]:
# For each wind-direction column, add one 0/1 indicator per compass letter:
# the flag is 1 when the direction string (e.g. 'WNW') contains that letter.
# Built in a loop so every flag tests its own letter; hand-written copies of
# this line make it easy to leave the wrong letter in place.
wind_sources = [
    ('WindGustDir', 'WindGust'),
    ('WindDir9am', 'WindDir9am'),
    ('WindDir3pm', 'WindDir3pm'),
]
for source_col, prefix in wind_sources:
    for letter in 'WNSE':
        df[f'{prefix}_{letter}'] = (
            df[source_col].str.contains(letter, regex=False).astype(int)
        )

In [None]:
# Correlation heatmap including the new wind-direction indicator columns.
# numeric_only=True is required because df still holds object columns here;
# without it, pandas >= 2.0 raises instead of silently skipping them.
plt.figure(figsize = (15,6))
sns.heatmap(df.corr(numeric_only = True), annot = True)

In [None]:
# Refresh the column lists, then print every pair of num_cols columns whose
# correlation exceeds 0.85.
num_cols, cat_cols = get_cols(df)

cols = df[num_cols].columns.to_list()
dt = df[num_cols]

# Compute the correlation matrix once instead of once (twice) per pair.
corr_matrix = dt.corr()

for i, v in enumerate(cols):
    # Only look at columns after v so each pair is reported once and a
    # column is never compared with itself.
    for other in cols[i + 1:]:
        pair_corr = corr_matrix.at[other, v]
        if pair_corr > 0.85:
            print(v, other, pair_corr.round(2))

In [None]:
# Remove three of the WindDir3pm indicator columns (in place).
df.drop(columns=['WindDir3pm_N', 'WindDir3pm_S', 'WindDir3pm_E'], inplace=True)

In [None]:
# Correlation heatmap after removing the WindDir3pm indicator columns.
# numeric_only=True is required because df still holds object columns here;
# without it, pandas >= 2.0 raises instead of silently skipping them.
plt.figure(figsize = (15,6))
sns.heatmap(df.corr(numeric_only = True), annot = True)

In [None]:
# Absolute correlation of each numeric column with RainTomorrow, ascending.
# numeric_only=True skips the object columns still present in df, which
# would make corr() raise on pandas >= 2.0.
df.corr(numeric_only = True)['RainTomorrow'].abs().sort_values()

**Unfortunately, our hypothesis turned out to be incorrect, so we delete the new indicator columns (except WindGust_E).**

In [None]:
# Remove the remaining direction indicator columns (in place); WindGust_E is kept.
unused_wind_flags = [
    'WindGust_S', 'WindGust_N', 'WindDir3pm_W',
    'WindDir9am_S', 'WindDir9am_N', 'WindDir9am_W',
    'WindDir9am_E', 'WindGust_W',
]
df.drop(columns=unused_wind_flags, inplace=True)

In [None]:
# Preview the frame before encoding the remaining object columns.
df.head()

In [None]:
# work with categorical data
num_cols, cat_cols = get_cols(df)

# Replace the values of every object-dtype column with integer codes.
# Use the encoder created in this cell rather than one left over from an
# earlier cell, so the cell does not depend on hidden kernel state.
# The encoder is re-fit for each column in turn.
enc = LabelEncoder()
df[cat_cols] = df[cat_cols].apply(enc.fit_transform)

In [None]:
# work with number data
from sklearn.preprocessing import Normalizer, StandardScaler

# Scale the numeric feature columns to zero mean and unit variance, column by
# column. A LabelEncoder must not be used for this step: it would replace the
# measurements with ordinal codes of their unique values. StandardScaler is
# used instead of Normalizer because Normalizer rescales each row (sample),
# not each column (feature).
# The target is left out so its labels stay untouched.
# NOTE(review): the scaler is fit on the full frame, before the train/test split.
numeric_features = [col for col in num_cols if col != 'RainTomorrow']
scaler = StandardScaler()
df[numeric_features] = scaler.fit_transform(df[numeric_features])

In [None]:
# Preview the fully numeric frame that is fed to the models.
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Target is RainTomorrow; every other column is a feature.
y = df['RainTomorrow']
X = df.drop(columns='RainTomorrow')

# Hold out 33% of the rows for testing, stratified on the target so both
# parts keep the same class ratio; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

# KNeighborsClassifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV


# Fit a 20-neighbour, distance-weighted KNN on the training split and report
# its accuracy on the test split.
model = KNeighborsClassifier(weights='distance', n_neighbors=20).fit(X_train, y_train)

accuracy_score(y_true=y_test, y_pred=model.predict(X_test))

### KNN best score = 81.8%

# LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression with default settings: fit on the training
# split, then report accuracy on the test split.
model = LogisticRegression().fit(X_train, y_train)

accuracy_score(y_true=y_test, y_pred=model.predict(X_test))

In [None]:
# Grid for logistic regression: 10 values of C spaced logarithmically between
# 1e-2 and 1e2, combined with four solvers.
params = dict(
    C=np.logspace(-2, 2, 10),
    solver=['lbfgs', 'liblinear', 'sag', 'saga'],
)

# Cross-validated search on the training split, scored by accuracy.
grid = GridSearchCV(
    estimator=LogisticRegression(random_state=0),
    param_grid=params,
    scoring='accuracy',
)
grid.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the logistic-regression grid search.
grid.best_params_

In [None]:
# Accuracy of the logistic-regression grid-search model on the held-out test split.
accuracy_score(y_test, grid.predict(X_test))

### LogisticRegression best score = 83.9%

# RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier


# Grid for the random forest: entropy criterion and depth 16 are fixed; only
# the number of trees varies (10, 20, ..., 100).
params = dict(
    criterion=['entropy'],
    max_depth=[16],
    n_estimators=range(10, 101, 10),
)

# Cross-validated search on the training split, scored by accuracy.
grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=0),
    param_grid=params,
    scoring='accuracy',
)
grid.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the random-forest grid search.
grid.best_params_

In [None]:
# Accuracy of the random-forest grid-search model on the held-out test split.
accuracy_score(y_test, grid.predict(X_test))

### RandomForestClassifier best score = 84.8%

# Conclusion

# The best model - RandomForestClassifier 