# Rain prediction (Australia)

Task: binary classification — predict whether it will rain tomorrow (the `RainTomorrow` column).

# 1. Import the libraries and the dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, \
r2_score, classification_report, roc_curve, auc
%matplotlib inline

In [None]:
# Load the weatherAUS dataset, parsing the 'Date' column into datetimes.
df = pd.read_csv(
    "/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv",
    parse_dates=['Date'],
)
# Show the first rows as the cell output for a quick sanity check.
df.head()

In [None]:
# Pairwise scatter plots over the first 1000 rows only, to keep plotting fast.
sns.pairplot(df.head(1000))

# 2. Feature engineering

In [None]:
# Encode the categorical wind-direction and rain columns as numbers.

# Degrees-to-radians conversion factor
r = np.pi/180
# The 16 compass points in clockwise order starting at north. Neighbouring
# points are 22.5 degrees apart, so every bearing follows from its position.
# Deriving the angles instead of typing 16 literals rules out slips such as
# 'ESE' being entered as 112 degrees instead of 112.5.
compass_points = [
    'N', 'NNE', 'NE', 'ENE', 'E', 'ESE', 'SE', 'SSE',
    'S', 'SSW', 'SW', 'WSW', 'W', 'WNW', 'NW', 'NNW',
]
# Create the map for wind direction: compass point -> bearing in radians
wind_dir_map = {
    point: 22.5 * position * r
    for position, point in enumerate(compass_points)
}
# Numericalize 'WindGustDir', 'WindDir9am' and 'WindDir3pm'.
# Values that are not keys of the map (including missing values) become NaN.
for wind_col in ['WindGustDir', 'WindDir9am', 'WindDir3pm']:
    df[wind_col] = df[wind_col].map(wind_dir_map)

# Create the map for rainfall
rain_map = {'Yes': 1, 'No': 0}
# Numericalize 'RainToday' and 'RainTomorrow'
for rain_col in ['RainToday', 'RainTomorrow']:
    df[rain_col] = df[rain_col].map(rain_map)

# Show the first rows to check the encoded columns.
df.head()

# 3. Model evaluation

In [None]:
# Train three classifiers to predict 'RainTomorrow' and compare them with a
# classification report and a shared ROC plot.

# Predictor columns fed to the models
features_israiny = [
    'Pressure9am', 'Pressure3pm', 'Humidity9am',
    'Humidity3pm', 'Temp9am', 'Temp3pm',
]
# Columns removed before the NaN filter, so that gaps in them do not discard
# rows (presumably these are the sparsest columns -- confirm against the data).
negative_features_israiny = [
    'Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm',
]

# Build the cleaned frame once so that X and y are guaranteed to share the
# same rows. dropna() looks at every remaining column, not only the features.
clean_df = df.drop(columns=negative_features_israiny).dropna()
X = clean_df[features_israiny]
y = clean_df['RainTomorrow']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

# Fit the scaler on the training split only, then apply it to both splits,
# so no statistics from the test split leak into training.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

models = {
    'LogisticRegression': LogisticRegression(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'RandomForestClassifier': RandomForestClassifier(random_state=0),
}

# ROC AUC on the test split, keyed by model name
scores = {}
plt.figure(figsize=[7, 7])
for model_name, model in models.items():
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    # Predicted probability of the positive class (second column)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
    auc_value = auc(fpr, tpr)
    scores[model_name] = auc_value

    print(f'Performance of {model_name} is:\n')
    print(classification_report(y_test, y_pred))

    plt.plot(fpr, tpr, label=f'ROC curve of {model_name} (area = {auc_value:.3f})')

# Figure decoration is the same for every model, so it is applied once after
# all curves are drawn instead of on every loop iteration.
plt.plot([0, 1], [0, 1], color='black', linestyle='--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='best')
# End on plt.show() so the notebook does not echo the Legend object's repr.
plt.show()