### This file shows the complete process of developing the model from preprocessing to training and finding suitable parameters

## Preprocessing

First, read the csv file and drop unused columns.

The following function drops unused columns, calculates the mean of collinear variables and adds a new column "month" describing in which month the observation was measured.

In [221]:
import numpy as np
import pandas as pd
import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

def transform_data_median(data):
    def mean_by_height(substring):
        return data.filter(like = substring).filter(like = ".mean").median(axis = 1)
    
    def std_by_height(substring):
        return data.filter(like = substring).filter(like = ".std").median(axis = 1)
    
    data = data.drop(["id", "partlybad"], axis = 1)
    class2 = np.array(["nonevent", "event"])
    data.insert(1, "class2", class2[(data["class4"] != "nonevent").astype(int)])

    new_data = pd.DataFrame()
    new_data[['date', 'class2']] = data[['date', 'class2']]
    new_data["CO.mean"] = mean_by_height("CO")
    new_data["CO.std"] = std_by_height("CO")
    new_data["H2O.mean"] = mean_by_height("H2O")
    new_data["H2O.std"] = std_by_height("H2O")
    new_data["RHIRGA.mean"] = mean_by_height("RHIRGA")
    new_data["RHIRGA.std"] = std_by_height("RHIRGA")
    new_data["NOx.mean"] = mean_by_height("NOx")
    new_data["NOx.std"] = std_by_height("NOx")
    new_data["NET.mean"] = mean_by_height("NET")
    new_data["NET.std"] = std_by_height("NET")
    new_data["NO.mean"] = data.iloc[:,27:39].filter(like = ".mean").median(axis = 1)
    new_data["NO.std"] = data.iloc[:,27:39].filter(like = ".std").median(axis = 1)
    new_data["O3.mean"] = data.iloc[:,51:61].filter(like = ".mean").median(axis = 1)
    new_data["O3.std"] = data.iloc[:, 51:61].filter(like = ".std").median(axis = 1)
    new_data[['Pamb0.mean', 'Pamb0.std', 'PAR.mean', 'PAR.std', 'PTG.mean', 'PTG.std', 'RGlob.mean', 'RGlob.std']] = data[['Pamb0.mean', 'Pamb0.std', 'PAR.mean', 'PAR.std', 'PTG.mean', 'PTG.std', 'RGlob.mean', 'RGlob.std']]
    new_data[['RPAR.mean', 'RPAR.std', 'SO2168.mean', 'SO2168.std', 'SWS.mean', 'SWS.std']] = data[['RPAR.mean', 'RPAR.std', 'SO2168.mean', 'SO2168.std', 'SWS.mean', 'SWS.std']]
    new_data["T.mean"] = data.filter(like = "T").iloc[:,4:].filter(like = ".mean").median(axis = 1)
    new_data["T.std"] = data.filter(like = "T").iloc[:,4:].filter(like = ".std").median(axis = 1)
    new_data[['UV_A.mean', 'UV_A.std', 'UV_B.mean', 'UV_B.std', 'CS.mean', 'CS.std']] = data[['UV_A.mean', 'UV_A.std', 'UV_B.mean', 'UV_B.std', 'CS.mean', 'CS.std']]
    new_data.set_index('date')

    return new_data

Now, transform the train data

In [222]:
train_data = transform_data_median(pd.read_csv("npf_train.csv"))

train_data.head()

Unnamed: 0,date,class2,CO.mean,CO.std,H2O.mean,H2O.std,RHIRGA.mean,RHIRGA.std,NOx.mean,NOx.std,...,SWS.mean,SWS.std,T.mean,T.std,UV_A.mean,UV_A.std,UV_B.mean,UV_B.std,CS.mean,CS.std
0,2000-01-17,event,368.718684,0.307718,4.370921,0.145338,74.526311,3.099995,0.630081,0.084554,...,937.88,0.6,-0.996623,0.260461,2.492491,1.31088,0.031587,0.018122,0.000243,3.5e-05
1,2000-02-28,nonevent,378.140192,1.019733,7.195852,0.196708,99.990803,0.930678,3.833197,0.833653,...,936.0,0.707107,1.911915,0.279675,0.295937,0.177836,0.00514,0.003552,0.003658,0.00094
2,2000-03-24,event,372.986612,0.739138,3.564013,0.25488,57.290658,13.988569,0.909836,0.196336,...,923.745098,2.16188,0.54153,2.226999,14.434789,8.627312,0.353743,0.272472,0.000591,0.000191
3,2000-03-30,event,375.596225,0.549797,6.51246,0.511737,68.153041,8.804628,2.297301,0.485434,...,925.622642,1.389887,6.339887,2.650631,16.077513,9.984686,0.568242,0.45183,0.002493,0.000466
4,2000-04-04,nonevent,377.635303,0.420603,6.223485,0.202019,88.604606,8.204318,3.069394,0.87808,...,921.727273,2.578074,1.885328,1.551858,9.710422,7.054069,0.339135,0.291457,0.004715,0.000679


In [271]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

def transform_X(data):

    scaler = StandardScaler()
    X_scaled = pd.DataFrame(data = scaler.fit_transform(data), columns = data.columns)

    pca = PCA(n_components = 12)
    X_scaled = pd.DataFrame(data = pca.fit_transform(X_scaled))

    return X_scaled

# Binary classification

After preprocessing, we can start to develop the model. From previous testing, it was found that logistic regression is the best performer. But before that, we need to scale our data with standard scaler.

In [276]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

X_train = train_data.iloc[:,2:]
y_train = (train_data.iloc[:,1] == 'event').astype(float)

X_train_scaled = transform_X(X_train)

### Estimating accuracy for binary classification

To avoid overfitting, we reduce the dimensions with PCA. We will choose number of components with k-fold cross-validation on logistic regression. L2 regularization is used to lower the variance to avoid overfitting.

In [277]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate


clf = LogisticRegression(penalty = 'l2', C = 1/4, solver = "saga", max_iter = 5000)

scores = cross_validate(
    clf, X_train_scaled, y_train, cv = 5, scoring = ('accuracy', 'neg_mean_squared_error')
)

est_acc = scores['test_accuracy'].mean()
mean_squared = -scores['test_neg_mean_squared_error'].mean()
print(f'The estimated accuracy of the model is: {est_acc}')
print(f'The estimated MSE is {mean_squared}')

The estimated accuracy of the model is: 0.8685133239831696
The estimated MSE is 0.13148667601683028


# Predictions on the test data

In [278]:
test_data = transform_data_median(pd.read_csv("npf_test_hidden.csv"))

X_test = test_data.iloc[:,2:]

X_test_scaled = transform_X(X_test)

predictions = pd.DataFrame(clf.fit(X_train_scaled, y_train).predict(X_test_scaled), columns = ["class"])
predictions = predictions["class"].map({1.0: 'event', 0.0:'nonevent'})
predictions.to_csv("answers.csv")

predictions.head()

0    nonevent
1       event
2    nonevent
3    nonevent
4    nonevent
Name: class, dtype: object

### Multi-class

Transform the data in the same way, but keep the original classes. User random forest.

In [300]:
from sklearn.ensemble import RandomForestClassifier

clf_randomForest = RandomForestClassifier()
y_multiclass, uniques = pd.factorize(pd.read_csv("npf_train.csv")["class4"])

scores = cross_validate(
    clf_randomForest, X_train, y_multiclass, cv = 10, scoring = ('accuracy', 'neg_mean_squared_error')
)

random_forest_predictions = pd.DataFrame(clf_randomForest.fit(X_train, y_multiclass).predict(X_test), columns = ["class"])
random_forest_predictions = random_forest_predictions["class"].map({0: 'Ib', 1:'nonevent', 2:'II', 3:'Ia'})
random_forest_predictions.to_csv("multiclass_answers.csv")

scores["test_accuracy"].mean()

0.651017576318224

In [299]:
uniques

Index(['Ib', 'nonevent', 'II', 'Ia'], dtype='object')