In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
# Load the data
data=pd.read_csv('weather_classification_data.csv')

In [3]:
# View Dimensions of Dataset
data.shape

(13200, 11)

In [4]:
# Preview the dataset
data.head()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type
0,14.0,73,9.5,82.0,partly cloudy,1010.82,2,Winter,3.5,inland,Rainy
1,39.0,96,8.5,71.0,partly cloudy,1011.43,7,Spring,10.0,inland,Cloudy
2,30.0,64,7.0,16.0,clear,1018.72,5,Spring,5.5,mountain,Sunny
3,38.0,83,1.5,82.0,clear,1026.25,7,Spring,1.0,coastal,Sunny
4,27.0,74,17.0,66.0,overcast,990.67,1,Winter,2.5,mountain,Rainy


In [5]:
# Count of labels for targeted column
data['Weather Type'].value_counts()

Weather Type
Rainy     3300
Cloudy    3300
Sunny     3300
Snowy     3300
Name: count, dtype: int64

In [7]:
# The target column
X = data.drop(columns=['Weather Type'])  
y = data['Weather Type']

In [11]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [12]:
# View dimensions of train and test set
X_train.shape, X_test.shape

((9240, 10), (3960, 10))

In [13]:
# Check data types in X_train
X_train.dtypes

Temperature             float64
Humidity                  int64
Wind Speed              float64
Precipitation (%)       float64
Cloud Cover              object
Atmospheric Pressure    float64
UV Index                  int64
Season                   object
Visibility (km)         float64
Location                 object
dtype: object

In [15]:
# Encode features
columns_list = data.columns.tolist()
columns_list.remove('Weather Type')
encoder = ce.OrdinalEncoder(cols=columns_list)
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

In [16]:
X_train.head()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location
10163,1,1,1,1,1,1,1,1,1,1
12929,2,2,2,2,2,2,2,2,2,2
5735,1,3,3,3,3,3,3,2,3,2
440,3,4,4,4,1,4,4,3,4,1
2018,4,3,1,5,3,5,5,1,5,2


In [17]:
X_test.head()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location
4111,11.0,86,29.0,13,4,2183.0,12,4,20,2
10607,104.0,56,24.0,69,4,804.0,10,1,4,2
7372,59.0,52,6.0,10,3,1709.0,5,2,5,1
11786,20.0,30,15.0,4,1,-1.0,6,3,3,2
12227,7.0,56,36.0,92,3,3173.0,8,2,23,1


In [23]:
# Model 
# Instantiate the classifier
rfc = RandomForestClassifier(random_state=0)
# Fit the model
rfc.fit(X_train, y_train)
# Predict the test set results
y_pred = rfc.predict(X_test)
# Check accuracy score 
print('Model accuracy score with 10 decision-trees : {0:0.4f}'. format(accuracy_score(y_test, y_pred)))

Model accuracy score with 10 decision-trees : 0.8316


In [22]:
# With n_estimators = 100
# Instantiate the classifier 
rfc_100 = RandomForestClassifier(n_estimators=100, random_state=0)
# Fit the model
rfc_100.fit(X_train, y_train)
# Predict on the test set results
y_pred_100 = rfc_100.predict(X_test)
# Check accuracy score 
print('Model accuracy score with 100 decision-trees : {0:0.4f}'. format(accuracy_score(y_test, y_pred_100)))

Model accuracy score with 100 decision-trees : 0.8316


In [25]:
# View the feature scores
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
feature_scores = pd.Series(clf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
feature_scores

Cloud Cover             0.207335
UV Index                0.158289
Season                  0.114914
Visibility (km)         0.110891
Temperature             0.081451
Precipitation (%)       0.075752
Humidity                0.075195
Wind Speed              0.065431
Atmospheric Pressure    0.062796
Location                0.047945
dtype: float64

In [30]:
# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix\n\n', cm)

Confusion matrix

 [[736 151  21  47]
 [138 751  67  26]
 [ 52  31 928  22]
 [ 73  21  18 878]]


In [29]:
# Precision, recall, f1score, support
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      Cloudy       0.74      0.77      0.75       955
       Rainy       0.79      0.76      0.78       982
       Snowy       0.90      0.90      0.90      1033
       Sunny       0.90      0.89      0.89       990

    accuracy                           0.83      3960
   macro avg       0.83      0.83      0.83      3960
weighted avg       0.83      0.83      0.83      3960

