In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest

import warnings

# Silence all warnings unless the interpreter was started with explicit -W options.
# NOTE(review): this also hides scikit-learn warnings (e.g. convergence or
# data-conversion warnings) raised by the cells below.
if not sys.warnoptions:
    warnings.simplefilter("ignore")

### Get path to file

In [None]:
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, name) for name in names]
    for full_path in full_paths:
        print(full_path)

# Data reading

In [None]:
# Path of the weatherAUS CSV inside the Kaggle input directory.
WEATHER_CSV_PATH = '/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv'
df = pd.read_csv(WEATHER_CSV_PATH)

In [None]:
# Summarise column dtypes and non-null counts to spot missing data.
df.info()

**The summary above shows that several columns contain missing values.**  
**That is expected for this dataset; we will handle it below.**

In [None]:
# Convert the 'Date' strings into pandas datetime values.
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates

In [None]:
# Use the parsed dates as the row index.
df = df.set_index('Date')

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Columns whose dtype is not 'object' (the numeric measurements).
float_data = [col for col in df.columns if df[col].dtypes != 'object']
# Columns whose dtype is not 'float64' (the categorical/text columns).
object_data = [col for col in df.columns if df[col].dtypes != 'float64']

## Helper functions used in the rest of the notebook

In [None]:
def get_not_nan_value(table):
    """Return ``table`` with its missing values replaced by ``table.mean()``.

    The input is not modified; ``fillna`` returns a new object.
    """
    mean_value = table.mean()
    return table.fillna(mean_value)


def get_kde_x_not_nan_value(table: pd.Series):
    """Fit a Gaussian KDE to a mean-imputed column.

    Missing values are filled with ``get_not_nan_value`` before fitting.

    Returns:
        A ``(kde, x, filled)`` tuple: the fitted ``gaussian_kde``, 100 evenly
        spaced points from the minimum to the maximum of the filled data,
        and the filled series itself.
    """
    filled = get_not_nan_value(table)
    lower, upper = filled.min(), filled.max()
    grid = np.linspace(lower, upper, 100)
    density = gaussian_kde(filled)
    return density, grid, filled


def draw_hist_and_density(kde, x, value,
                          title_text='', legend_text=None,
                          x_text='', y_text=''):
    """Draw a density curve and a normalised histogram on the current axes.

    Args:
        kde: callable evaluated on ``x`` to obtain the curve heights.
        x: points at which the density curve is drawn.
        value: data for the histogram (plotted with ``density=True``).
        title_text: axes title.
        legend_text: legend labels; no legend is drawn when ``None``.
        x_text: x-axis label.
        y_text: y-axis label.
    """
    axes = plt.gca()
    axes.plot(x, kde(x), color='g')
    axes.hist(value, density=True)
    axes.set_title(title_text)
    if legend_text is not None:
        axes.legend(legend_text)
    axes.set_xlabel(x_text)
    axes.set_ylabel(y_text)


def draw_hist(value, title_text='',
              x_text='', y_text=''):
    """Draw a normalised histogram of ``value`` on the current axes.

    Args:
        value: data for the histogram (plotted with ``density=True``).
        title_text: axes title.
        x_text: x-axis label.
        y_text: y-axis label.
    """
    axes = plt.gca()
    axes.hist(value, density=True)
    axes.set_title(title_text)
    axes.set_xlabel(x_text)
    axes.set_ylabel(y_text)

    
def draw_scatter_plot(x_, y_, alpha_, title_text='',
                      x_text='', y_text=''):
    """Draw a scatter plot of ``y_`` against ``x_`` on the current axes.

    Args:
        x_: values for the horizontal axis.
        y_: values for the vertical axis.
        alpha_: marker transparency passed to ``plt.scatter``.
        title_text: axes title.
        x_text: x-axis label.
        y_text: y-axis label.
    """
    plt.scatter(x=x_, y=y_, alpha=alpha_)
    axes = plt.gca()
    axes.set_title(title_text)
    axes.set_xlabel(x_text)
    axes.set_ylabel(y_text)

# Next, I will use visualization to help decide which features to use for training the model

In [None]:
plt.figure(figsize=(12, 7))

# Fit one KDE per column. The mean-imputed series keep their own names
# because the scatter-plot cells reuse them.
kde_min, x_min, min_temp = get_kde_x_not_nan_value(df['MinTemp'])
kde_max, x_max, max_temp = get_kde_x_not_nan_value(df['MaxTemp'])
kde_rain, x_rain, rain = get_kde_x_not_nan_value(df['Rainfall'])
kde_evp, x_evp, evaporation = get_kde_x_not_nan_value(df['Evaporation'])

# (title, kde, grid, values, histogram legend label), in subplot order.
hist_panels = [
    ('MinTemp', kde_min, x_min, min_temp, '$t_{min}$ distribution'),
    ('MaxTemp', kde_max, x_max, max_temp, '$t_{max}$ distribution'),
    ('Rainfall', kde_rain, x_rain, rain, 'Rainfall distribution'),
    ('Evaporation', kde_evp, x_evp, evaporation, 'Evaporation distribution'),
]
for panel_no, panel in enumerate(hist_panels, start=1):
    panel_title, panel_kde, panel_x, panel_values, hist_label = panel
    plt.subplot(2, 2, panel_no)
    draw_hist_and_density(kde=panel_kde, x=panel_x, value=panel_values,
                          title_text=panel_title,
                          legend_text=['distribution density', hist_label])

In [None]:
plt.figure(figsize=(12,7))

# (x values, title, x-axis label), in subplot order. The top row keeps an empty
# x label, as in the other scatter cells. "Rainfal" was a typo for "Rainfall".
scatter_panels = [
    (min_temp, "Rain's dependence on the MinTemp", ''),
    (max_temp, "Rain's dependence on the MaxTemp", ''),
    (rain, "Rain's dependence on the Rainfall", 'Rainfall'),
    (evaporation, "Rain's dependence on the Evaporation", 'Evaporation'),
]
for panel_no, (feature_values, panel_title, x_label) in enumerate(scatter_panels, start=1):
    plt.subplot(2, 2, panel_no)
    draw_scatter_plot(x_=feature_values, y_=rain, alpha_=0.02,
                      title_text=panel_title,
                      x_text=x_label, y_text='Rainfall')

In [None]:
plt.figure(figsize=(12, 7))

# Fit one KDE per column. The mean-imputed series keep their own names
# because the scatter-plot cell reuses them.
kde_sunshine, x_sunshine, sunshine = get_kde_x_not_nan_value(df['Sunshine'])
kde_wind, x_wind, wind_gust_speed = get_kde_x_not_nan_value(df['WindGustSpeed'])
kde_winds_am, x_winds_am, wind_speed_am = get_kde_x_not_nan_value(df['WindSpeed9am'])
kde_winds_pm, x_winds_pm, wind_speed_pm = get_kde_x_not_nan_value(df['WindSpeed3pm'])

# (title, kde, grid, values), in subplot order.
hist_panels = [
    ('Sunshine', kde_sunshine, x_sunshine, sunshine),
    ('WindGustSpeed', kde_wind, x_wind, wind_gust_speed),
    ('WindSpeed9am', kde_winds_am, x_winds_am, wind_speed_am),
    ('WindSpeed3pm', kde_winds_pm, x_winds_pm, wind_speed_pm),
]
for panel_no, (panel_title, panel_kde, panel_x, panel_values) in enumerate(hist_panels, start=1):
    plt.subplot(2, 2, panel_no)
    draw_hist_and_density(kde=panel_kde, x=panel_x, value=panel_values,
                          title_text=panel_title,
                          legend_text=['distribution density',
                                       f'{panel_title} distribution'])

In [None]:
plt.figure(figsize=(12,7))

# (x values, title, x-axis label), in subplot order.
scatter_panels = [
    (sunshine, "Rain's dependence on the Sunshine", ''),
    (wind_gust_speed, "Rain's dependence on the WindGustSpeed", ''),
    (wind_speed_am, "Rain's dependence on the WindSpeed9am", 'WindSpeed9am'),
    (wind_speed_pm, "Rain's dependence on the WindSpeed3pm", 'WindSpeed3pm'),
]
for panel_no, (feature_values, panel_title, x_label) in enumerate(scatter_panels, start=1):
    plt.subplot(2, 2, panel_no)
    draw_scatter_plot(x_=feature_values, y_=rain, alpha_=0.02,
                      title_text=panel_title,
                      x_text=x_label, y_text='Rainfall')

In [None]:
plt.figure(figsize=(12, 7))

# Fit one KDE per column. The mean-imputed series keep their own names
# because the scatter-plot cell reuses them.
kde_hum_am, x_hum_am, humidity_am = get_kde_x_not_nan_value(df['Humidity9am'])
kde_hum_pm, x_hum_pm, humidity_pm = get_kde_x_not_nan_value(df['Humidity3pm'])
kde_pres_am, x_pres_am, pressure_am = get_kde_x_not_nan_value(df['Pressure9am'])
kde_pres_pm, x_pres_pm, pressure_pm = get_kde_x_not_nan_value(df['Pressure3pm'])

# (title, kde, grid, values), in subplot order.
hist_panels = [
    ('Humidity9am', kde_hum_am, x_hum_am, humidity_am),
    ('Humidity3pm', kde_hum_pm, x_hum_pm, humidity_pm),
    ('Pressure9am', kde_pres_am, x_pres_am, pressure_am),
    ('Pressure3pm', kde_pres_pm, x_pres_pm, pressure_pm),
]
for panel_no, (panel_title, panel_kde, panel_x, panel_values) in enumerate(hist_panels, start=1):
    plt.subplot(2, 2, panel_no)
    # Derive the histogram label from the column name so it cannot drift from
    # the data being plotted (the Humidity3pm panel used to be labelled
    # 'WindSpeed3pm distribution'). The curve label matches the other cells.
    draw_hist_and_density(kde=panel_kde, x=panel_x, value=panel_values,
                          title_text=panel_title,
                          legend_text=['distribution density',
                                       f'{panel_title} distribution'])

In [None]:
plt.figure(figsize=(12,7))

# (x values, title, x-axis label), in subplot order.
scatter_panels = [
    (humidity_am, "Rain's dependence on the Humidity9am", ''),
    (humidity_pm, "Rain's dependence on the Humidity3pm", ''),
    (pressure_am, "Rain's dependence on the Pressure9am", 'Pressure9am'),
    (pressure_pm, "Rain's dependence on the Pressure3pm", 'Pressure3pm'),
]
for panel_no, (feature_values, panel_title, x_label) in enumerate(scatter_panels, start=1):
    plt.subplot(2, 2, panel_no)
    draw_scatter_plot(x_=feature_values, y_=rain, alpha_=0.02,
                      title_text=panel_title,
                      x_text=x_label, y_text='Rainfall')

In [None]:
plt.figure(figsize=(12, 7))

# Fit one KDE per column. The mean-imputed series keep their own names
# because the scatter-plot cell reuses them.
kde_cloud_am, x_cloud_am, cloud_am = get_kde_x_not_nan_value(df['Cloud9am'])
kde_cloud_pm, x_cloud_pm, cloud_pm = get_kde_x_not_nan_value(df['Cloud3pm'])
kde_temp_am, x_temp_am, temp_am = get_kde_x_not_nan_value(df['Temp9am'])
kde_temp_pm, x_temp_pm, temp_pm = get_kde_x_not_nan_value(df['Temp3pm'])

# (title, kde, grid, values), in subplot order.
hist_panels = [
    ('Cloud9am', kde_cloud_am, x_cloud_am, cloud_am),
    ('Cloud3pm', kde_cloud_pm, x_cloud_pm, cloud_pm),
    ('Temp9am', kde_temp_am, x_temp_am, temp_am),
    ('Temp3pm', kde_temp_pm, x_temp_pm, temp_pm),
]
for panel_no, (panel_title, panel_kde, panel_x, panel_values) in enumerate(hist_panels, start=1):
    plt.subplot(2, 2, panel_no)
    draw_hist_and_density(kde=panel_kde, x=panel_x, value=panel_values,
                          title_text=panel_title,
                          legend_text=['distribution density',
                                       f'{panel_title} distribution'])

In [None]:
plt.figure(figsize=(12,7))

# (x values, title, x-axis label), in subplot order.
scatter_panels = [
    (cloud_am, "Rain's dependence on the Cloud9am", ''),
    (cloud_pm, "Rain's dependence on the Cloud3pm", ''),
    (temp_am, "Rain's dependence on the Temp9am", 'Temp9am'),
    (temp_pm, "Rain's dependence on the Temp3pm", 'Temp3pm'),
]
for panel_no, (feature_values, panel_title, x_label) in enumerate(scatter_panels, start=1):
    plt.subplot(2, 2, panel_no)
    draw_scatter_plot(x_=feature_values, y_=rain, alpha_=0.02,
                      title_text=panel_title,
                      x_text=x_label, y_text='Rainfall')

**As you may have noticed, I'm not very good at making charts yet, and I certainly can't derive any benefit from these charts.  
We can say that we will need data such as:**
* MinTemp 
* MaxTemp 
* Rainfall 
* WindGustSpeed
* WindSpeed9am
* WindSpeed3pm 
* Pressure9am
* Pressure3pm
* Temp9am 
* Temp3pm  

**We also need the following categorical (object) columns, which must be converted to numerical data:**
* Location
* RainTomorrow

In [None]:
# Columns used as model inputs.
feature_cols = [
    'Location',
    'MinTemp', 'MaxTemp', 'Rainfall',
    'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
    'Pressure9am', 'Pressure3pm',
    'Temp9am', 'Temp3pm',
]
# Full selection: the inputs followed by the target column.
cols_need = feature_cols + ['RainTomorrow']

In [None]:
# Take an explicit copy so that later column assignments modify new_df only
# and never operate on a view of df (avoids SettingWithCopy issues).
new_df = df[cols_need].copy()

In [None]:
# Column dtypes and non-null counts of the selected columns.
new_df.info()
# Number of missing values per column.
new_df.isna().sum()

In [None]:
# Keep only the rows in which every selected column has a value.
new_df = new_df.dropna(axis=0, how='any')
new_df.info()
new_df.isna().sum()

In [None]:
# Use one encoder per column so each fitted mapping (classes_) stays available
# for inverse_transform. Refitting a single encoder discards the RainTomorrow
# mapping as soon as Location is encoded.
target_le = LabelEncoder()
class_le = LabelEncoder()  # ends up fitted on Location, exactly as before

# assign() returns a new frame with the two columns replaced in place
# (column order is preserved), so no slice of another frame is written to.
new_df = new_df.assign(
    RainTomorrow=target_le.fit_transform(new_df['RainTomorrow'].values),
    Location=class_le.fit_transform(new_df['Location'].values),
)

In [None]:
# Check the dtypes after label encoding.
new_df.info()

In [None]:
# Preview the encoded frame.
new_df.head()

## Split data

In [None]:
# Features: every selected column except the target. Selecting by name keeps
# the split correct even if the column order changes.
X = new_df.drop(columns='RainTomorrow')
# Target as a 1-D Series: scikit-learn classifiers expect y of shape
# (n_samples,); a single-column DataFrame triggers a DataConversionWarning.
y = new_df['RainTomorrow']

In [None]:
# Hold out 30% of the rows for testing.
# random_state makes the split (and every score below) reproducible;
# stratify keeps the RainTomorrow class ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# LogisticRegression

In [None]:
# The features are not scaled, so the default 100 solver iterations are likely
# too few for convergence (the warning would be hidden by the global filter);
# allow more iterations.
LR = LogisticRegression(random_state=42, max_iter=1000)
LR.fit(X_train, y_train)

In [None]:
lr_head = LR.predict(X_test)
# ROC AUC needs a ranking score, not hard labels: use the predicted
# probability of the positive class (second column, i.e. LR.classes_[1]).
lr_proba = LR.predict_proba(X_test)[:, 1]
# Both metrics take (y_true, y_score/y_pred) in that order; ROC AUC is not
# symmetric, so the order matters.
print(f"""
accuracy_score: {accuracy_score(y_test, lr_head)}
roc_auc_score: {roc_auc_score(y_test, lr_proba)}
""")

# DecisionTreeClassifier

In [None]:
# Decision tree with default hyperparameters and a fixed seed.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)

In [None]:
tree_head = tree.predict(X_test)
# ROC AUC needs a ranking score, not hard labels: use the predicted
# probability of the positive class (second column, i.e. tree.classes_[1]).
tree_proba = tree.predict_proba(X_test)[:, 1]
# Both metrics take (y_true, y_score/y_pred) in that order; ROC AUC is not
# symmetric, so the order matters.
print(f"""
accuracy_score: {accuracy_score(y_test, tree_head)}
roc_auc_score: {roc_auc_score(y_test, tree_proba)}
""")

# KNeighborsClassifier

In [None]:
# k-nearest-neighbours classifier with default hyperparameters.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

In [None]:
knn_head = knn.predict(X_test)
# ROC AUC needs a ranking score, not hard labels: use the predicted
# probability of the positive class (second column, i.e. knn.classes_[1]).
knn_proba = knn.predict_proba(X_test)[:, 1]
# Both metrics take (y_true, y_score/y_pred) in that order; ROC AUC is not
# symmetric, so the order matters.
print(f"""
accuracy_score: {accuracy_score(y_test, knn_head)}
roc_auc_score: {roc_auc_score(y_test, knn_proba)}
""")

**Not bad, but as you can see, plain logistic regression gives the best accuracy here.  
I think it is worth writing a function to find the optimal parameters for the decision tree and the nearest-neighbors classifier.**
### Let's start writing the function


In [None]:
def found_good_neighbors_1(n, p):
    """Fit a k-NN classifier and return its accuracy on the test split.

    Args:
        n: number of neighbours (``n_neighbors``).
        p: power parameter of the Minkowski metric.

    Returns:
        ``score(X_test, y_test)`` of the model fitted on ``X_train``/``y_train``.

    Note: the score is measured on the test split, so choosing ``n``/``p`` by
    this value makes the reported test accuracy optimistic.
    """
    model = KNeighborsClassifier(metric='minkowski', p=p, n_neighbors=n)
    fitted = model.fit(X_train, y_train)
    return fitted.score(X_test, y_test)

def found_goot_depth(n, criterion_):
    """Fit a decision tree and return its accuracy on the test split.

    Args:
        n: maximum tree depth (``max_depth``).
        criterion_: split criterion passed to ``DecisionTreeClassifier``.

    Returns:
        ``score(X_test, y_test)`` of the tree fitted on ``X_train``/``y_train``.

    Note: the score is measured on the test split, so choosing the depth by
    this value makes the reported test accuracy optimistic.
    """
    model = DecisionTreeClassifier(random_state=42,
                                   criterion=criterion_,
                                   max_depth=n)
    fitted = model.fit(X_train, y_train)
    return fitted.score(X_test, y_test)

In [None]:
# Test accuracy for odd n_neighbors from 1 to 21, first with p=1, then p=2.
knn_1, knn_2 = (
    [found_good_neighbors_1(n, p) for n in range(1, 22, 2)]
    for p in (1, 2)
)

In [None]:
# Test accuracy for odd max_depth from 1 to 21, first with gini, then entropy.
tree_gini, tree_entropy = (
    [found_goot_depth(n, criterion) for n in range(1, 22, 2)]
    for criterion in ('gini', 'entropy')
)

In [None]:
# Parameter values behind the scores: both sweeps used range(1, 22, 2).
sweep_values = list(range(1, 22, 2))

plt.figure(figsize=(12, 7))
# (title, scores, x-axis label), in subplot order.
sweep_panels = [
    ('tree_gini', tree_gini, 'max_depth'),
    ('tree_entropy', tree_entropy, 'max_depth'),
    ('knn_1', knn_1, 'n_neighbors'),
    ('knn_2', knn_2, 'n_neighbors'),
]
for panel_no, (panel_title, scores, x_label) in enumerate(sweep_panels, start=1):
    plt.subplot(2, 2, panel_no)
    # Plot against the real parameter value; plt.plot(scores) alone would put
    # the list position (0..10) on the x axis, which is easy to misread.
    plt.plot(sweep_values, scores)
    plt.title(panel_title)
    plt.xlabel(x_label)
    plt.legend(['score'])
plt.tight_layout()  # keep the x labels clear of the titles below them
plt.show()

In [None]:
# Best test accuracy reached by each sweep.
first_sweep_scores = {
    'tree_gini': tree_gini,
    'tree_entropy': tree_entropy,
    'knn_1': knn_1,
    'knn_2': knn_2,
}
first_sweep_report = '\n'.join(
    f'{sweep_name}: {max(sweep_scores)}'
    for sweep_name, sweep_scores in first_sweep_scores.items()
)
print(f'\n{first_sweep_report}\n')

**As we can see, the accuracy of the decision trees begins to fall at a depth of 4-5.  
The same cannot be said about the nearest-neighbors method.  
I think we should run more tests for nearest neighbors, from 20 to 50 in increments of 3.**

In [None]:
# Test accuracy for n_neighbors 20, 23, ..., 50, first with p=1, then p=2.
knn_1, knn_2 = (
    [found_good_neighbors_1(n, p) for n in range(20, 51, 3)]
    for p in (1, 2)
)

In [None]:
# Parameter values behind the scores: the second sweep used range(20, 51, 3).
neighbor_values = list(range(20, 51, 3))

plt.figure(figsize=(14, 9))
for panel_no, (panel_title, scores) in enumerate(
        [('knn_1', knn_1), ('knn_2', knn_2)], start=1):
    plt.subplot(2, 2, panel_no)
    # Plot against the real n_neighbors value; plt.plot(scores) alone would
    # put the list position (0..10) on the x axis.
    plt.plot(neighbor_values, scores)
    plt.title(panel_title)
    plt.xlabel('n_neighbors')
    plt.legend(['score'])
plt.show()

In [None]:
# Best test accuracy reached by each k-NN sweep.
second_sweep_scores = {'knn_1': knn_1, 'knn_2': knn_2}
second_sweep_report = '\n'.join(
    f'{sweep_name}: {max(sweep_scores)}'
    for sweep_name, sweep_scores in second_sweep_scores.items()
)
print(f'\n{second_sweep_report}\n')

**The graphs are not very pretty, but knn_1 with 23 neighbors is still suitable here, as it gives good accuracy,  
although that accuracy is only slightly higher than that of logistic regression.**

# RandomForestClassifier
### This method was not part of my original plan, but it is still worth mentioning: as you can see, its accuracy is much higher than that of the previous methods, even without parameter tuning.

In [None]:
# Random forest with default hyperparameters and a fixed seed.
rfc = RandomForestClassifier(random_state=1)
rfc.fit(X_train, y_train)
# Mean accuracy on the held-out test split.
rfc.score(X_test, y_test)

## Use Pipeline, GridSearchCV and SelectKBest

In [None]:
from sklearn.pipeline import Pipeline

# Keep the 5 best-scoring features, then classify with a random forest.
selector = SelectKBest(k=5)
# Fix the seed on the estimator. random_state is not a hyperparameter, and
# searching over seeds only selects the luckiest run while multiplying the
# number of fits by four.
rfc = RandomForestClassifier(random_state=42)
pipe = Pipeline(steps=[('selector', selector), ('rfc', rfc)])

# min_samples_split must be an int >= 2 (or a float in (0, 1]); the value 1
# is rejected by scikit-learn, so every fit that used it failed.
parameters = {'rfc__n_estimators': [13, 25, 50, 102, 124],
              'rfc__max_depth': [5, 7, 18, 47, 100],
              'rfc__min_samples_split': [2, 3]}
g_search = GridSearchCV(pipe, parameters, n_jobs=-1)

In [None]:
# fit() returns the fitted search object, so best_clf is the GridSearchCV itself.
best_clf = g_search.fit(X_train, y_train)
# Score of the best pipeline found, measured on the held-out test split.
best_clf.score(X_test, y_test)

In [None]:
# Show the pipeline with the best parameter combination found by the search.
best_clf.best_estimator_

In [None]:
# Accuracy of the tuned pipeline on the test split.
y_pred = best_clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

# Best accuracy in my testing:
1. RandomForestClassifier
2. KNeighborsClassifier
3. LogisticRegression
4. RandomForestClassifier + SelectKBest + GridSearchCV
5. DecisionTreeClassifier

#### This is my top 5 models.
**Now a little about my research and training. My choice of features may not have been ideal; the models were trained to a reasonably good level, but they are far from perfect. This is my first work on this platform, so I would be glad to hear your opinion of it, and I would also appreciate any advice that will help me grow as a specialist in the future. Good luck!**

# Thanks for reading. Don't forget to upvote the work. Good luck kaggling!