In [1]:
import pandas as pd
import numpy as np
from delai.ml_logic.preprocessing import preprocess_X, preprocess_y
from delai.data.local_disk import get_pandas_chunk

In [2]:
df_X = pd.read_csv('../raw_data/initial_X_train.csv')
df_y = pd.read_csv('../raw_data/initial_y_train.csv')

X = preprocess_X(df_X)
y = preprocess_y(df_y)

✅ preprocess_X() done
✅ preprocess_y() done


In [42]:
print(np.unique(y, return_counts=True)[1] *100 / len(y))
print(np.unique(y))

[0 1 2 3]


## Baseline

Assume that the model just returns OnTime_Early for every prediction, what is the F1?

I.e This now becomes a binary score?

In [56]:
value_counts = np.unique(y, return_counts=True)[1]
on_time_early_true = value_counts[2]
tp = on_time_early_true
fn = 0 # we aren't predicting anything other than positives
fp = len(y)- on_time_early_true
tn = 0 # we don't have any true negatives as no negatives are predicted

accuracy = (tp + tn) / (tp + tn + fp + fn)
print('accuracy:', accuracy)

recall = tp / (tp + fn)
print('recall:',recall)

precision = tp / (tp + fp)
print('precision:', precision)

f1 = 2 * (precision * recall) / (precision + recall)
print('f1 score:', f1)

accuracy: 0.6529
recall: 1.0
precision: 0.6529
f1 score: 0.7900054449754976


In [57]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [58]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier

sgd_model = SGDClassifier(loss='log_loss')
sgd_model.fit(X_train,y_train)
cross_val_score(sgd_model, X_train, y_train, cv=5, scoring='accuracy')


array([0.21142857, 0.65642857, 0.48928571, 0.65642857, 0.65428571])

In [59]:
y_pred =sgd_model.predict(X_test)
pd.DataFrame(y_pred).value_counts()/len(y_pred)

2    0.998333
3    0.001667
dtype: float64

# Altering the inputs

The idea here is to change the distribution of data so that a lower proportion are on_time and see the impact on the model 

In [74]:
new_df_y = pd.DataFrame(y)
new_y_2 = new_df_y.loc[new_df_y[0] == 2].sample(2000)
new_y_not2 = new_df_y.loc[new_df_y[0] != 2]
new_y = new_y_2.append(new_y_not2)
new_y.value_counts()/len(new_y)





  new_y = new_y_2.append(new_y_not2)


3    0.388594
2    0.365564
0    0.194114
1    0.051727
dtype: float64

In [75]:
new_X = X.iloc[new_y.index]
new_X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,689,690,691,692,693,694,695,696,697,698
1457,0.078372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.101168,-0.994869,1.000000e+00,6.123234e-17,1.000000e+00,6.123234e-17,0.002618,0.999997,0.002618,0.999997
9281,0.078979,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.937752,0.347305,8.660254e-01,5.000000e-01,1.000000e+00,6.123234e-17,0.002618,0.999997,0.002618,0.999997
6351,0.151681,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.101168,-0.994869,-5.000000e-01,-8.660254e-01,-1.000000e+00,-1.836970e-16,0.007854,0.999969,0.007854,0.999969
5771,0.327055,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.394356,0.918958,-2.449294e-16,1.000000e+00,-2.449294e-16,1.000000e+00,0.010472,0.999945,0.010472,0.999945
13,0.426083,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.724793,0.688967,5.000000e-01,8.660254e-01,1.000000e+00,6.123234e-17,0.002618,0.999997,0.002618,0.999997
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9984,0.034022,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.485302,-0.874347,-5.000000e-01,-8.660254e-01,-1.000000e+00,-1.836970e-16,0.007854,0.999969,0.007854,0.999969
9985,0.222357,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.485302,-0.874347,5.000000e-01,8.660254e-01,1.000000e+00,6.123234e-17,0.002618,0.999997,0.002618,0.999997
9992,0.121709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.299363,-0.954139,1.224647e-16,-1.000000e+00,1.224647e-16,-1.000000e+00,0.005236,0.999986,0.005236,0.999986
9993,0.036047,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.968077,-0.250653,5.000000e-01,-8.660254e-01,1.224647e-16,-1.000000e+00,0.005236,0.999986,0.005236,0.999986


In [76]:
sgd_model_2 = SGDClassifier(loss='log_loss')
sgd_model_2.fit(new_X,new_y[0])
cross_val_score(sgd_model, new_X, new_y[0], cv=5, scoring='accuracy')

array([0.20182648, 0.21755027, 0.06032907, 0.19835466, 0.19378428])