In [1]:
import re
import json
import os
import pandas as pd
import pickle
import numpy as np

In [2]:
import itertools


In [3]:
# One inner list of (label, path) tuples per class folder under data/.
# The folder name doubles as the class label; only entries whose name
# contains '.json' are kept.
filenames = [
    [
        (folder, f'data/{folder}/{file}')
        for file in os.listdir(f'data/{folder}')
        if '.json' in file
    ]
    for folder in os.listdir('data')
    # Skip stray non-directory entries (e.g. .DS_Store); calling os.listdir
    # on a plain file would raise NotADirectoryError.
    if os.path.isdir(f'data/{folder}')
]


In [4]:
# Flatten the per-folder lists into a single table: one row per JSON file,
# with the folder name as `label` and the file path as `filename`.
data = pd.DataFrame(data=itertools.chain.from_iterable(filenames))
data.columns = ['label', 'filename']

In [5]:
# Display the (label, filename) table; the rendered output below is truncated in the middle.
data

Unnamed: 0,label,filename
0,summer-with-grandkids,data/summer-with-grandkids/Image_32.jpg.json
1,summer-with-grandkids,data/summer-with-grandkids/Image_33.jpg.json
2,summer-with-grandkids,data/summer-with-grandkids/Image_104.jpg.json
3,summer-with-grandkids,data/summer-with-grandkids/Image_105.jpg.json
4,summer-with-grandkids,data/summer-with-grandkids/Image_45.jpg.json
...,...,...
308,day-at-beach,data/day-at-beach/Image_14.jpg.json
309,day-at-beach,data/day-at-beach/Image_21.jpg.json
310,day-at-beach,data/day-at-beach/Image_20.jpg.json
311,day-at-beach,data/day-at-beach/Image_56.jpg.json


# define functions for preprocessing

To extract information from the label files provided for the JPEGs, we simply attach each label to a datapoint in binary fashion:

- is an image associated with label x: yes or no
- also add a "confidence filter", so that a label is only included if its confidence is above a certain threshold

In [14]:
def read_img_json(filename):
    """Read one label file and return its parsed JSON content.

    Decoding goes through ``UnquotedPropertyDecoder`` so that files whose
    property names are not quoted can still be parsed.

    Parameters
    ----------
    filename : path of the file to open (text mode).

    Returns
    -------
    The value produced by ``json.load`` for the file.
    """
    with open(filename, 'r') as handle:
        return json.load(handle, cls=UnquotedPropertyDecoder)

class UnquotedPropertyDecoder(json.JSONDecoder):
    """JSON decoder that tolerates unquoted property names.

    Before delegating to the standard decoder, every bare ``name:`` that
    follows ``{`` or ``,`` is rewritten as ``"name":``.
    """

    # Alternative 1 matches a complete double-quoted string (with escapes) so
    # that its content is skipped; alternative 2 matches an unquoted key.
    # Without the first alternative, text such as "a, b: c" inside a string
    # value would be rewritten and the document would become invalid JSON.
    _TOKEN = re.compile(r'("(?:[^"\\]|\\.)*")|([{,]\s*)(\w+)(\s*:)')

    @staticmethod
    def _quote_key(match):
        """Return string literals unchanged; wrap bare keys in double quotes."""
        if match.group(1) is not None:
            return match.group(1)
        return f'{match.group(2)}"{match.group(3)}"{match.group(4)}'

    def decode(self, s, *args, **kwargs):
        """Quote unquoted property names in ``s``, then decode it as JSON."""
        s = self._TOKEN.sub(self._quote_key, s)
        return super().decode(s, *args, **kwargs)




def get_all_labels(confidence):
    """Collect every label name whose confidence exceeds ``confidence``.

    Iterates over the ``filename`` column of the module-level ``data`` frame,
    reads each file and gathers the ``Name`` of every entry under ``Labels``
    whose ``Confidence`` is strictly greater than the threshold.

    NOTE(review): the threshold is compared directly with the stored
    ``Confidence`` values — confirm which scale (0-1 or 0-100) the files use.

    Parameters
    ----------
    confidence : threshold a label's ``Confidence`` must exceed.

    Returns
    -------
    list of label names, in file order, with repeats across files kept.
    """
    collected = []
    for path in data.filename:
        payload = read_img_json(os.path.join(path))
        for entry in payload['Labels']:
            if entry.get('Confidence') > confidence:
                collected.append(entry.get('Name'))
    return collected

In [16]:


def extract_labels(X, confidence = 0.5):
    """Turn a frame of label-file paths into a 0/1 label-indicator matrix.

    The column vocabulary is the sorted set of names returned by
    ``get_all_labels(confidence)``. For each row of ``X`` the file at
    ``row.filename`` is read and each vocabulary label is marked 1 if the
    file lists it with a ``Confidence`` above the threshold, else 0.

    Parameters
    ----------
    X : DataFrame with a ``filename`` column.
    confidence : threshold a label's ``Confidence`` must exceed.

    Returns
    -------
    DataFrame with one row per row of ``X`` and one lower-cased column per
    vocabulary label.
    """
    vocabulary = sorted(set(get_all_labels(confidence)))

    def row_extract(row):
        # Labels present in this row's file that pass the confidence filter.
        payload = read_img_json(row.filename)
        present = [
            entry.get('Name')
            for entry in payload['Labels']
            if entry.get('Confidence') > confidence
        ]
        flags = [int(name in present) for name in vocabulary]
        return pd.Series(flags, vocabulary)

    encoded = X.apply(row_extract, axis = 1)
    encoded.columns = [name.lower() for name in vocabulary]
    return encoded


In [17]:
%%timeit

extract_labels(data.drop(columns=['label']))

147 ms ± 1.31 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
# Label-indicator matrix for every file at the default confidence.
# feature_select() below reads this module-level `out` for its correlation matrix.
out = extract_labels(data.drop(columns=['label']))

Next, we perform some feature selection.

We must remove highly correlated features to avoid multicollinearity and improve interpretability.  

With this data this is especially important where we have similar labels such as "accessory" vs "accessories"

The way we select which variable to keep here is by selecting the most common one.

In [22]:
def feature_select(X, correlation_limit = 0.5, confidence = 0.5):
    """Drop redundant label columns, keeping the most common of each correlated group.

    For every column of the correlation matrix, the labels whose correlation
    with it exceeds ``correlation_limit`` are ranked by how often they occur
    in ``get_all_labels(confidence)``; the most frequent one is kept.

    NOTE(review): the correlation matrix comes from the module-level ``out``
    frame, not from ``X``. This keeps the selected columns the same whichever
    rows are passed in, but ``out`` must already exist and the correlations
    do not follow the ``confidence`` argument. Confirm this is intended.

    Parameters
    ----------
    X : DataFrame of label-indicator columns (lower-cased label names).
    correlation_limit : correlation above which two labels count as redundant.
    confidence : threshold passed to ``get_all_labels`` for the label counts.

    Returns
    -------
    ``X`` restricted to the kept columns, in sorted order.

    Raises
    ------
    KeyError
        If a kept label is not a column of ``X``.
    """
    count_col = '_label_count'

    label_counts = pd.Series(get_all_labels(confidence)).value_counts()
    label_counts.index = [i.lower() for i in label_counts.index.values]
    # Name the counts column explicitly. The name of a value_counts() result
    # differs between pandas versions, so sorting by column ``0`` is fragile.
    counts_frame = label_counts.rename(count_col).to_frame()

    corrs = out.corr()
    keep_vars = []
    for i in corrs:
        highs = corrs[i][corrs[i] > correlation_limit]
        ranked = pd.merge(highs, counts_frame, left_index=True, right_index=True)
        if ranked.empty:
            # Nothing to rank, e.g. a constant column whose correlations are
            # all NaN, or labels that have no count at this confidence.
            continue
        keep_vars.append(ranked.sort_values(count_col, ascending=False).iloc[0].name)

    return X[sorted(set(keep_vars))]

In [23]:
%%timeit
feature_select(

    extract_labels(data.drop(columns=['label']))
)

409 ms ± 1.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [24]:
def preprocess(X, correlation_limit = 0.5, confidence = 0.5):
    """Run the full preprocessing chain: label extraction, then feature selection.

    Parameters
    ----------
    X : DataFrame with a ``filename`` column.
    correlation_limit : forwarded to ``feature_select``.
    confidence : forwarded to both ``extract_labels`` and ``feature_select``.

    Returns
    -------
    The frame returned by ``feature_select``.
    """
    label_matrix = extract_labels(X, confidence)
    return feature_select(label_matrix, correlation_limit, confidence)

In [25]:
%%timeit

preprocess(data.drop(columns=['label']))

415 ms ± 3.55 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Modelling

In [26]:
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from imblearn.over_sampling import RandomOverSampler 
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

## Logistic regression

First, we explore Logistic regression

If we can find a simple model that performs well, there is no need for the added complexity.

In [27]:


# preprocess -> oversample -> one-vs-rest L1-penalised logistic regression
pipeline = Pipeline([
    ('preprocess', FunctionTransformer(preprocess)),
    # Seed the resampler so repeated runs draw the same resampled rows.
    ('oversampler', RandomOverSampler(random_state=42)),
    ('classifier', OneVsRestClassifier(LogisticRegression(penalty='l1',
                                                          random_state=42,
                                                          solver ='liblinear')))
])

In [28]:
# Fit on every row of `data`; no hold-out split has been made at this point.
pipeline.fit(data.drop(columns=['label']), data.label)


In [29]:
# Scored on the same rows the pipeline was just fitted on, so this is a
# training score rather than an estimate of generalisation.
pipeline.score(data.drop(columns=['label']), data.label)


0.9968051118210862

In [30]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows as a test set; random_state=1 fixes the shuffle.
# NOTE(review): no `stratify=` is passed, so class proportions in the small
# test split are not controlled — consider stratifying on the label.
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=['label']), data.label, test_size=0.1, random_state=1)


In [31]:

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [32]:
# Search space: two regularisation strengths crossed with every
# (confidence, correlation_limit) pair drawn from {0.5, 0.7, 0.9}.
param_grid = {
    'classifier__estimator__C': [1, 0.1],
    'preprocess__kw_args': [
        {'confidence': conf_level, 'correlation_limit': corr_level}
        for conf_level, corr_level in itertools.product((0.5, 0.7, 0.9), repeat=2)
    ],
}

# 5-fold cross-validated grid search over param_grid, fitted on the training split
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, verbose=2)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination and its mean cross-validated score
print(f"Best hyperparameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] END classifier__estimator__C=1, preprocess__kw_args={'confidence': 0.5, 'correlation_limit': 0.5}; total time=   0.7s
[CV] END classifier__estimator__C=1, preprocess__kw_args={'confidence': 0.5, 'correlation_limit': 0.5}; total time=   0.7s
[CV] END classifier__estimator__C=1, preprocess__kw_args={'confidence': 0.5, 'correlation_limit': 0.5}; total time=   0.7s
[CV] END classifier__estimator__C=1, preprocess__kw_args={'confidence': 0.5, 'correlation_limit': 0.5}; total time=   0.7s
[CV] END classifier__estimator__C=1, preprocess__kw_args={'confidence': 0.5, 'correlation_limit': 0.5}; total time=   0.7s
[CV] END classifier__estimator__C=1, preprocess__kw_args={'confidence': 0.5, 'correlation_limit': 0.7}; total time=   0.7s
[CV] END classifier__estimator__C=1, preprocess__kw_args={'confidence': 0.5, 'correlation_limit': 0.7}; total time=   0.7s
[CV] END classifier__estimator__C=1, preprocess__kw_args={'confidence': 0.5, '

In [33]:
# Logistic-regression pipeline with explicit preprocessing settings
# (confidence=0.7, correlation_limit=0.9), fitted on the training split only.
pipeline = Pipeline([
    ('preprocess', FunctionTransformer(preprocess, kw_args={'confidence': 0.7, 'correlation_limit': 0.9})),
    # Seed the resampler so repeated runs draw the same resampled rows.
    ('oversampler', RandomOverSampler(random_state=42)),
    ('classifier', OneVsRestClassifier(LogisticRegression(penalty='l1',
                                                          random_state=42,
                                                          solver ='liblinear')))
])


pipeline.fit(X_train, y_train)


In [34]:
# Score on the held-out test split (rows not used in pipeline.fit above).
pipeline.score(X_test, y_test)

0.96875

### error analysis

In [35]:
# List the files of the test rows whose predicted label differs from the true one.
preds = pipeline.predict(X_test)
X_test.loc[preds != y_test, 'filename']

234    data/skiing-holiday/Image_46.jpg.json
Name: filename, dtype: object

When we look into the json file of this, we see that there are no labels related to obvious skiing activities, hence why the model is unable to pick up a signal

### Pipeline refinement

In [36]:
# Last pipeline step's estimator object, i.e. the ('classifier', ...) entry.
clf = pipeline.steps[-1][1]

In [37]:
# Keep every feature whose coefficient is non-zero in at least one of the
# one-vs-rest sub-models. Reducing over all of clf.estimators_ avoids
# hard-coding the number of classes.
features = clf.feature_names_in_[
    np.logical_or.reduce([np.abs(est.coef_) > 0 for est in clf.estimators_])[0]
]

In [38]:
# Show the names of the features selected above.
features

array(['accessories', 'apparel', 'art', 'beverage', 'birthday party',
       'boat', 'bread', 'cable car', 'cake', 'canine', 'cap', 'child',
       'coat', 'couch', 'crowd', 'dating', 'denim', 'eating', 'exercise',
       'face', 'family', 'female', 'finger', 'food', 'footwear',
       'furniture', 'hat', 'head', 'helmet', 'human', 'ikebana',
       'indoors', 'kite', 'leisure activities', 'long sleeve', 'mountain',
       'ocean', 'outdoors', 'pants', 'party hat', 'people', 'photo',
       'plant', 'shorts', 'skin', 'sleeve', 'slope', 'snow', 'sport',
       'sunglasses', 'swimwear', 'thumbs up', 'transportation',
       'vacation', 'water', 'woman'], dtype=object)

The above features are those that have non-0 coefficients in at least 1 of the sub models and are necessary in the overall model.

We can refine the whole pipeline to use just these features if LR is selected.

# RF

In [39]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest variant of the pipeline, using the same preprocessing settings.
pipeline = Pipeline([
    ('preprocess', FunctionTransformer(preprocess, kw_args={'confidence': 0.7, 'correlation_limit': 0.9})),
    # Seed both stochastic steps so the grid-search results are reproducible.
    ('oversampler', RandomOverSampler(random_state=42)),
    ('classifier', RandomForestClassifier(random_state=42))
])
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_features': ['sqrt', 'log2'],
    'classifier__max_depth': [3, 6, 9],
    'classifier__max_leaf_nodes': [3, 6, 9],
}


# Grid search with GridSearchCV's default cross-validation (no `cv` passed).
grid_search = GridSearchCV(pipeline,
                           param_grid=param_grid, verbose = 2)
grid_search.fit(X_train, y_train)
print(grid_search.best_estimator_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV] END classifier__max_depth=3, classifier__max_features=sqrt, classifier__max_leaf_nodes=3, classifier__n_estimators=50; total time=   0.8s
[CV] END classifier__max_depth=3, classifier__max_features=sqrt, classifier__max_leaf_nodes=3, classifier__n_estimators=50; total time=   0.7s
[CV] END classifier__max_depth=3, classifier__max_features=sqrt, classifier__max_leaf_nodes=3, classifier__n_estimators=50; total time=   0.7s
[CV] END classifier__max_depth=3, classifier__max_features=sqrt, classifier__max_leaf_nodes=3, classifier__n_estimators=50; total time=   0.7s
[CV] END classifier__max_depth=3, classifier__max_features=sqrt, classifier__max_leaf_nodes=3, classifier__n_estimators=50; total time=   0.8s
[CV] END classifier__max_depth=3, classifier__max_features=sqrt, classifier__max_leaf_nodes=3, classifier__n_estimators=100; total time=   0.8s
[CV] END classifier__max_depth=3, classifier__max_features=sqrt, classifier__ma

In [40]:
# Report the best parameter combination and its mean cross-validated score
print(f"Best hyperparameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

Best hyperparameters: {'classifier__max_depth': 9, 'classifier__max_features': 'log2', 'classifier__max_leaf_nodes': 9, 'classifier__n_estimators': 100}
Best score: 0.9323308270676692




More data might improve RF performance; the model is currently overfitting.


We need a more reliable way of scoring: the test sets are currently very small.


# final model

In [41]:
# Final model: the logistic-regression pipeline refitted on every row of `data`.
pipeline = Pipeline([
    ('preprocess', FunctionTransformer(preprocess, kw_args={'confidence': 0.7, 'correlation_limit': 0.9})),
    # Seed the resampler so the final fit is reproducible.
    ('oversampler', RandomOverSampler(random_state=42)),
    ('classifier', OneVsRestClassifier(LogisticRegression(penalty='l1',
                                                          random_state=42,
                                                          solver ='liblinear')))
])


pipeline.fit(data.drop(columns=['label']), data.label)


In [42]:
# Scored on the same full dataset the final pipeline was fitted on, so this
# is a training score rather than an estimate of generalisation.
pipeline.score(data.drop(columns=['label']), data.label)

0.9904153354632588

# future work

improve correlation filter:


In [43]:
# 
# # - we can apply a chi squared test on each highly correlated feature with the target variable (since both are categorical)
# # - keep the variable that has best associated with target.
# 
# # from scipy.stats import chi2_contingency


# corrs = out.corr()
# keep_vars = []
# for i in corrs:
#     highs = corrs[i][corrs[i]>0.5]
    
#     min_p = 1
#     chosen_i = None
#     for i in highs.index:
#         chi2_stat, p_value, dof, expected = chi2_contingency(pd.crosstab(out[i], data.label))
#         print(p_value)
#         if p_value < min_p:
#             min_p = p_value
#             chosen_i = i
        


### More features

Latitudes and longitudes - map to countries - Andorra / France for skiing, for example
    
time stamps - time of year could be indicative - summer vs skiing.  Could also combine with country since different countries have different seasons.

    - this could help with where labels are not great

More work on features, combinations of features - to aid the  linear nature of LR

# productionising

- refine pipeline to only use final selected features from l1
- deploy in cloud
    - either run in batch to classify users' photos or process 'online'
- monitor performance
    - distribution of predictions
    - periodically perform manual checks
- input monitoring
    - monitor distributions of input features
- periodically retrain

# suitable for running in an offline-mobile environment


Running this model should be no issue on a mobile device; the processing required is minimal.

The high-intensity part, I assume, would be generating the label confidences in the first place. If this is the case, then we can potentially apply these in batch, while the classification of the images themselves can be done at runtime on the user's device.