# Classification of Australian weather data using scikit-learn

### Importing libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

### Import weather data

In [None]:
# Load the Australian weather observations from the CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer='../input/weatherAUS.csv')

In [None]:
# Preview the first rows of the loaded DataFrame to check its columns and values.
data.head()

* **Date** The date of observation
* **Location** The common name of the location of the weather station
* **MinTemp** The minimum temperature in degrees celsius
* **MaxTemp** The maximum temperature in degrees celsius
* **Rainfall** The amount of rainfall recorded for the day in mm
* **Evaporation** The so-called Class A pan evaporation (mm) in the 24 hours to 9am
* **Sunshine** The number of hours of bright sunshine in the day.
* **WindGustDir** The direction of the strongest wind gust in the 24 hours to midnight
* **WindGustSpeed** The speed (km/h) of the strongest wind gust in the 24 hours to midnight
* **WindDir9am** Direction of the wind at 9am
* **WindDir3pm** Direction of the wind at 3pm
* **WindSpeed9am** Wind speed (km/hr) averaged over 10 minutes prior to 9am
* **WindSpeed3pm** Wind speed (km/hr) averaged over 10 minutes prior to 3pm
* **Humidity9am** Humidity (percent) at 9am
* **Humidity3pm** Humidity (percent) at 3pm
* **Pressure9am** Atmospheric pressure (hpa) reduced to mean sea level at 9am
* **Pressure3pm** Atmospheric pressure (hpa) reduced to mean sea level at 3pm
* **Cloud9am** Fraction of sky obscured by cloud at 9am. This is measured in "oktas", which are a unit of eighths. It records how many eighths of the sky are obscured by cloud. A 0 measure indicates completely clear sky whilst an 8 indicates that it is completely overcast.
* **Cloud3pm** Fraction of sky obscured by cloud (in "oktas": eighths) at 3pm. See Cloud9am for a description of the values.
* **Temp9am** Temperature (degrees C) at 9am
* **Temp3pm** Temperature (degrees C) at 3pm
* **RainToday** Boolean: 'Yes' if precipitation (mm) in the 24 hours to 9am exceeds 1mm, otherwise 'No' (mapped to 1/0 during data cleaning)
* **RISK_MM** The amount of rain. A kind of measure of the "risk".
* **RainTomorrow** The target variable: did it rain the next day? ('Yes'/'No', mapped to 1/0 during data cleaning)

### Check for null values

In [None]:
# Display every row that has a missing value in at least one column.
data.loc[data.isna().any(axis='columns')]

### Data Cleaning

In [None]:
# Remove the Date column in place; it is not among the model features selected later.
data.drop(columns=['Date'], inplace=True)

In [None]:
# Remove the Evaporation column in place; it is not among the model features selected later.
data.drop(columns=['Evaporation'], inplace=True)

In [None]:
# Remove the Sunshine column in place; it is not among the model features selected later.
data.drop(columns=['Sunshine'], inplace=True)

In [None]:
# Preview the data again after removing the Date, Evaporation and Sunshine columns.
data.head()

In [None]:
# Remove the Location column in place; it is not among the model features selected later.
data.drop(columns=['Location'], inplace=True)

In [None]:
# Count the rows, discard every row that contains at least one missing value,
# then count again so the number of dropped rows can be reported.
before_rows = len(data.index)
data = data.dropna(axis=0, how='any')
after_rows = len(data.index)

In [None]:
# Row count recorded before the rows with missing values were dropped.
before_rows

In [None]:
# Row count remaining after dropna().
after_rows

### How many rows were dropped?

In [None]:
# Number of rows removed by dropna(): the difference between the two counts.
before_rows - after_rows

In [None]:
# Build a separate frame in which the two 'Yes'/'No' rain columns are encoded
# as integers (No -> 0, Yes -> 1) so they can be used by the classifier.
clean_data = data.assign(
    RainTomorrow=data['RainTomorrow'].map({'No':0, 'Yes':1}),
    RainToday=data['RainToday'].map({'No':0, 'Yes':1}),
)
# Show the first ten rows to verify the encoding.
clean_data.head(10)

### Using morning (9am) features and today's rain to predict next-day rain

In [None]:
# Predictor columns: the 9am measurements plus whether it rained today.
features = [
    'WindSpeed9am',
    'Humidity9am',
    'Pressure9am',
    'Cloud9am',
    'Temp9am',
    'RainToday',
]

In [None]:
# Feature matrix: an independent copy of the selected predictor columns.
X = clean_data.loc[:, features].copy()

In [None]:
# Target vector: an independent copy of the encoded RainTomorrow column.
y = clean_data.loc[:, 'RainTomorrow'].copy()

### Test and train split

In [None]:
# Hold out 33% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=324,
)

In [None]:
# Summary statistics of the training labels; because the labels were encoded
# as 0/1, the mean is the fraction of training rows with RainTomorrow == 1.
y_train.describe()

### Fit on training set

In [None]:
# A small decision tree (at most 8 leaves) with a fixed seed for reproducibility.
rain_classifier = DecisionTreeClassifier(random_state=0, max_leaf_nodes=8)
# Train the tree on the training portion of the data.
rain_classifier.fit(X=X_train, y=y_train)

### Predict on Test Set

In [None]:
# Predict the RainTomorrow label for every row of the held-out test set.
predictions = rain_classifier.predict(X=X_test)

In [None]:
# First ten predicted labels (1 = rain tomorrow, 0 = no rain).
predictions[:10]

In [None]:
# First ten true labels from the test set, for a quick visual comparison
# with the predictions.
y_test[:10]

### Measure accuracy of the classifier

In [None]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=predictions)