## Testing the project

### 1. Importing the modules and setting parameters

In [1]:
# Testing the project
import weather_project as we

In [2]:
# Show the module-level settings used below: the two data paths and the
# name of the target column, one value per line.
print(we.ACTUAL_DATA, we.FUTURE_DATA, we.TARGET_NAME, sep='\n')

data/actual_data.csv
data/future_data.csv
RainTomorrow


### 2. Loading raw data, cleaning and splitting it

In [3]:
# Load the dataset from the path stored in we.ACTUAL_DATA.
data = we.get_data(we.ACTUAL_DATA)

In [4]:
# Display the loaded table for a quick visual check of columns and values.
data

Unnamed: 0,Month,Region,MinTemp,MaxMin_Temp,Temp3pm,AmPm_Temp,Rainfall_YesNo,Sunshine_Clean,Sunshine_Types,WindDir3pm,WindGustSpeed,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure3pm,Pressure_Diff,Cloud_YesNo,RainToday,RainTomorrow
0,December,4,13.4,9.5,21.8,4.9,1.0,10.0,normal,WNW,44.0,24.0,71.0,22.0,1007.1,0.6,1,0.0,0.0
1,December,4,7.4,17.7,24.3,7.1,0.0,10.0,normal,WSW,44.0,22.0,44.0,25.0,1007.8,2.8,0,0.0,0.0
2,December,4,12.9,12.8,23.2,2.2,0.0,10.0,normal,WSW,46.0,26.0,38.0,30.0,1008.7,1.1,1,0.0,0.0
3,December,4,9.2,18.8,26.5,8.4,0.0,10.0,normal,E,24.0,9.0,45.0,16.0,1012.8,4.8,0,0.0,0.0
4,December,4,17.5,14.8,29.7,11.9,1.0,10.0,normal,NW,41.0,20.0,82.0,33.0,1006.0,4.8,1,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118898,December,1,20.5,14.2,33.0,8.7,0.0,10.0,normal,E,52.0,20.0,23.0,12.0,1010.1,3.1,0,0.0,0.0
118899,December,1,18.0,18.4,35.0,8.3,0.0,10.0,normal,ESE,54.0,31.0,17.0,7.0,1010.9,3.8,0,0.0,0.0
118900,December,1,17.5,19.6,34.7,6.6,0.0,10.0,normal,SE,56.0,22.0,12.0,7.0,1007.5,5.0,0,0.0,0.0
118901,December,1,20.0,18.9,38.4,7.1,0.0,10.0,normal,SSE,59.0,17.0,12.0,12.0,1002.6,4.6,1,0.0,0.0


In [5]:
# Filter the frame on the target column. Presumably this drops rows whose
# target value is missing (inferred from the helper's name — confirm in
# weather_project).
# NOTE(review): `data` is rebound here, so the unfiltered frame is no longer
# reachable under this name after the cell runs.
data = we.get_notna_target(data, we.TARGET_NAME)

In [6]:
# Row/column count of the frame after the target filter.
data.shape

(116219, 19)

In [7]:
# Split the filtered frame into a training part and a test part.
# NOTE(review): in the saved output the training part (29311 rows) is much
# smaller than the test part (86908 rows) — confirm this ratio is intended
# and the two return values are not swapped.
# NOTE(review): both parts have one column fewer than `data`; the feature
# listing further down has no `Region` column, so the helper presumably
# drops it — verify in weather_project.
train_data, test_data = we.get_train_test(data)

In [8]:
# Report the size of each split: training first, then test, one per line.
print(train_data.shape, test_data.shape, sep='\n')

(29311, 18)
(86908, 18)


In [9]:
# For each split, separate the inputs (X) from the target column (y) named
# by we.TARGET_NAME.
X_train, y_train = we.get_x_y_data(train_data, we.TARGET_NAME)
X_test, y_test = we.get_x_y_data(test_data, we.TARGET_NAME)

In [10]:
# Sanity check: training features and training target shapes, one per line.
print(X_train.shape, y_train.shape, sep='\n')

(29311, 17)
(29311,)


In [11]:
# Display the training features to inspect columns and missing values.
X_train

Unnamed: 0,Month,MinTemp,MaxMin_Temp,Temp3pm,AmPm_Temp,Rainfall_YesNo,Sunshine_Clean,Sunshine_Types,WindDir3pm,WindGustSpeed,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure3pm,Pressure_Diff,Cloud_YesNo,RainToday
12366,December,13.2,,30.6,8.8,0.0,10.0,normal,NW,,19.0,50.0,24.0,,3.5,1,0.0
12367,December,,,25.4,,0.0,10.0,normal,NE,,19.0,,59.0,,3.5,0,0.0
12368,December,16.2,16.0,32.0,9.8,0.0,10.0,normal,SE,,2.0,74.0,26.0,,3.5,1,0.0
12369,December,19.2,4.8,23.0,2.6,0.0,10.0,normal,SE,,22.0,66.0,61.0,,3.5,1,0.0
12370,December,18.8,11.4,,,0.0,10.0,normal,,,,85.0,,,3.5,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116826,December,23.4,7.6,29.8,5.6,1.0,1.1,small,W,69.0,30.0,94.0,77.0,1005.0,1.9,1,1.0
116827,December,24.2,7.5,31.3,5.1,1.0,5.5,normal,WNW,52.0,24.0,79.0,68.0,1006.8,2.9,1,0.0
116828,December,24.1,6.3,24.8,4.3,0.0,3.1,normal,SE,48.0,19.0,76.0,91.0,1007.4,1.4,1,0.0
116829,December,24.4,7.9,31.7,1.7,1.0,11.6,big,W,50.0,33.0,73.0,61.0,1005.9,1.4,1,1.0


### 3. Modelling and searching for optimal hyperparameters

In [12]:
# Partition the feature columns into groups and print one group per line.
# NOTE(review): judging by the saved output the three groups look like
# yes/no flags, numeric columns and categorical columns — confirm the
# grouping rule in weather_project.
features = we.get_3group_features(X_train)
print(*features, sep='\n')

['Rainfall_YesNo', 'Cloud_YesNo', 'RainToday']
['MinTemp', 'MaxMin_Temp', 'Temp3pm', 'AmPm_Temp', 'Sunshine_Clean', 'WindGustSpeed', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure3pm', 'Pressure_Diff']
['Month', 'Sunshine_Types', 'WindDir3pm']


In [13]:
# Model key; the same key selects both the estimator and its parameter grid.
model_type = "logreg"
# Build the estimator for this model key, given the feature groups.
estimator = we.get_estimator(model_type, features)
# Hyperparameter candidates for this model key.
param_grid = we.get_param_grid(model_type)
# Combine estimator and grid into a search object; it is fitted in the
# next cell.
search = we.get_search(estimator, param_grid)

In [14]:
# Run the hyperparameter search on the training split, then report the
# selected parameters and the corresponding best score on separate lines.
search.fit(X_train, y_train)
print(search.best_params_, search.best_score_, sep='\n')

{'logreg__C': 0.1}
0.6102652375892398


In [15]:
# Predict the target for the test features with the fitted search object.
predict = search.predict(X_test)

In [16]:
# Compare the true test labels with the predictions; the helper prints a
# confusion matrix and a per-class precision/recall/f1 table.
we.print_stats(y_test, predict)


Confusion matrix:
[[58982  9513]
 [ 5798 12615]]

              precision    recall  f1-score   support

         0.0      0.910     0.861     0.885     68495
         1.0      0.570     0.685     0.622     18413

    accuracy                          0.824     86908
   macro avg      0.740     0.773     0.754     86908
weighted avg      0.838     0.824     0.829     86908



### 4. Predicting on new data

In [17]:
# Load the dataset at we.FUTURE_DATA and apply the same target filter used
# for the training data, in a single assignment.
new_data = we.get_notna_target(
    we.get_data(we.FUTURE_DATA),
    we.TARGET_NAME,
)

In [18]:
# Row/column count of the new dataset after the target filter.
print(new_data.shape)

(25974, 19)


In [19]:
# Separate the inputs (X) from the target column (y) for the new dataset.
X_new, y_new = we.get_x_y_data(new_data, we.TARGET_NAME)

In [20]:
# Sanity check: new-data features and target shapes, one per line.
print(X_new.shape, y_new.shape, sep='\n')

(25974, 17)
(25974,)


In [21]:
# Predict the target for the new dataset with the search object fitted on
# the training split above (no refitting here).
new_predict = search.predict(X_new)

In [22]:
# Compare the true labels of the new dataset with the predictions; the
# helper prints a confusion matrix and a per-class metrics table.
we.print_stats(y_new, new_predict)


Confusion matrix:
[[16712  3316]
 [ 1858  4088]]

              precision    recall  f1-score   support

         0.0      0.900     0.834     0.866     20028
         1.0      0.552     0.688     0.612      5946

    accuracy                          0.801     25974
   macro avg      0.726     0.761     0.739     25974
weighted avg      0.820     0.801     0.808     25974

