Import all necessary libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import category_encoders as ce
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,accuracy_score
from sklearn.ensemble import RandomForestClassifier




Read the data from the .csv file

In [None]:
# Load the weather observations CSV into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer="../input/weather-dataset-rattle-package/weatherAUS.csv"
)
# Show the first rows as this cell's output.
data.head()

Delete all rows with NaN values in the RainTomorrow column and convert the column to numeric format

In [None]:
# Keep only the rows whose target label is present.
data = data.dropna(subset=['RainTomorrow'])
# Encode the target as integers: 'Yes' -> 1, 'No' -> 0.
d = {'Yes': 1, 'No': 0}
y = data['RainTomorrow'].map(d)
# Separate the target from the feature frame.
data = data.drop(columns='RainTomorrow')
data.info()

Create the correlation matrix and delete columns that are strongly correlated with each other. Also delete the column with a large number of NaN values (Sunshine) and the Date/Location columns.

In [None]:
# Correlation is only defined for numeric data. Select the numeric columns
# explicitly: passing object-dtype columns to DataFrame.corr() raises on
# pandas >= 2.0, and going through set() made the column order arbitrary.
numerical = data.select_dtypes(include='number').columns.tolist()
corr_matrix = data[numerical].corr().unstack()
# Keep pairs with correlation in (0.8, 1.0); the upper bound filters out the
# diagonal, where every column is perfectly correlated with itself.
strong_pairs = corr_matrix[(corr_matrix > 0.8) & (corr_matrix < 1.0)]
# Build the feature frame without the redundant/unused columns listed below.
X = data.drop(
    columns=['MinTemp', 'Pressure9am', 'Temp9am', 'Sunshine', 'Date', 'Location']
)
strong_pairs

Replace NaN values in object columns with the column mode and convert the object columns to numeric using OrdinalEncoder. Also replace NaN values in numeric columns with the column mean.

In [None]:
# Split the feature columns by dtype once; imputation does not change dtypes.
object_columns = X.select_dtypes(include='object').columns.tolist()
numeric_columns = X.select_dtypes(exclude='object').columns.tolist()

# Per-column fill values: mode for object columns, mean for the others.
fill_values = {column: X[column].mode()[0] for column in object_columns}
fill_values.update({column: X[column].mean() for column in numeric_columns})

# Fill through a single non-mutating call. The previous
# `X[column].fillna(..., inplace=True)` is chained in-place assignment, which
# warns on pandas 2.x and silently does nothing under Copy-on-Write.
X = X.fillna(value=fill_values)

# Map each object column's categories to integer codes.
encoder = ce.OrdinalEncoder(cols=object_columns)
X = encoder.fit_transform(X)
X.info()

In [None]:
# Hold out 33% of the rows for the final evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Search grid: max_depth 1..9 combined with min_samples_split 20..29.
params = [{'max_depth': range(1, 10), 'min_samples_split': range(20, 30)}]
rfc = RandomForestClassifier(
    n_estimators=15,
    random_state=42,
    n_jobs=-1,
    oob_score=True,
)

# 5-fold cross-validated grid search over the forest's hyper-parameters.
gcv = GridSearchCV(rfc, params, n_jobs=-1, cv=5, verbose=1)
gcv.fit(X_train, y_train)

# Evaluate the best estimator found by the search on the held-out rows.
best_grid = gcv.best_estimator_
random_predict = best_grid.predict(X_test)
print(mean_squared_error(y_test, random_predict))
accuracy_score(y_test, random_predict)