In [1]:
from utilities import read_s3_image_into_opencv, load_images, load_target_images, comparison_results_to_dataframe, smote_generator
from modeling import rf_model
from compare import ImageComparison
import pandas as pd
import numpy as np
from skimage.measure import compare_ssim as ssim
from scipy.spatial.distance import cosine, hamming
import pandas as pd
import cv2
import pickle
import numpy as np
import boto3
import botocore
import io
import math
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import GridSearchCV
    
%load_ext autoreload
%autoreload 2

In [2]:
#Establish S3 connection

client = boto3.client('s3')
resource = boto3.resource('s3')
my_bucket = resource.Bucket('capstonedatag58')

In [3]:
#Set parameters for image processing and model

params = {'batch_size': 1000, 'sample_size': 1000, 'model_transform': {'grayscale': True, 'size': (96, 96),
    'equalize': 'clahe'}, 'model_comparison': {'histogram': True, 'ssim': True, 'tweet_match': True, 'gradient_similarity': True}}


In [4]:
#Retrieve target images and fit them to the comparison class

targets, target_ids = load_target_images(my_bucket, 'harvey_scrape/images/target/', tweets=False)
comp_obj = ImageComparison(**params['model_transform'], **params['model_comparison'])
comp_obj.fit(targets)

In [5]:
#Collect images from the web scrape and compare them to the targets
#Store the comparison results and their associated image ids
tot_image_ids = []
comparisons = []

for images, image_ids in load_images(my_bucket, 'harvey_scrape/images/full/', params['batch_size'], params['sample_size']):
    comp = comp_obj.compare(images)
    tot_image_ids.extend(image_ids)
    comparisons.extend(comp)

tot_image_ids_per_comparison = [tot_image_ids[math.floor(i / len(target_ids))] for i in range(len(comparisons))]

print('Comparison results (sample):', comparisons[0:5])
print('Comparison results image IDs (sample):', tot_image_ids_per_comparison[0:5])

Comparison results (sample): [(0.0029664426699144843, 0.053507480133024858, 0.10408548915347926, 0.23673420266454484), (0.3354133159123572, 0.081918019888014074, 0.22369025405752832, 0.090164747405265061), (0.4663453284016294, 0.025881410065369984, 0.10702851923084566, 0.32810080870703773), (-0.23331694102998718, 0.020202093788756224, 0.12397121585173046, 0.025338349724860078), (-0.19697545339778924, 0.013298164025733767, 0.1039577304161191, 0.19939154476481913)]
Comparison results image IDs (sample): ['bcf7e41e1ca491b7c576e212851f92398f29844f', 'bcf7e41e1ca491b7c576e212851f92398f29844f', 'bcf7e41e1ca491b7c576e212851f92398f29844f', 'bcf7e41e1ca491b7c576e212851f92398f29844f', 'bcf7e41e1ca491b7c576e212851f92398f29844f']


In [6]:
#Collect the true positive test images and compare them to trargets
#Store the comparison results and the associated image ids.

tps, tp_ids = load_target_images(my_bucket, 'harvey_scrape/images/test/', tweets=False)
tp_comparisons = comp_obj.compare(tps)

tp_comparisons_ids = []
for item in tp_ids:
    tp_comparisons_ids.extend([item] * len(targets))

tp_comparisons_target_ids = []
for i in range(int(len(tp_comparisons) / len(target_ids))):
    tp_comparisons_target_ids.extend(target_ids)
    
print('True positive image comparison results (sample):', comparisons[0:5])
print('True positive image comparison results image IDs (sample):', tot_image_ids_per_comparison[0:5])

True positive image comparison results (sample): [(0.0029664426699144843, 0.053507480133024858, 0.10408548915347926, 0.23673420266454484), (0.3354133159123572, 0.081918019888014074, 0.22369025405752832, 0.090164747405265061), (0.4663453284016294, 0.025881410065369984, 0.10702851923084566, 0.32810080870703773), (-0.23331694102998718, 0.020202093788756224, 0.12397121585173046, 0.025338349724860078), (-0.19697545339778924, 0.013298164025733767, 0.1039577304161191, 0.19939154476481913)]
True positive image comparison results image IDs (sample): ['bcf7e41e1ca491b7c576e212851f92398f29844f', 'bcf7e41e1ca491b7c576e212851f92398f29844f', 'bcf7e41e1ca491b7c576e212851f92398f29844f', 'bcf7e41e1ca491b7c576e212851f92398f29844f', 'bcf7e41e1ca491b7c576e212851f92398f29844f']


In [7]:
#Select the true positive comparison results that are true matches using the image ids/labels
#Store them in a dataframe and run SMOTE process to generate new synthetic observations

tp_matches = [[x, y, z] for x, y, z in zip(tp_comparisons_ids, tp_comparisons_target_ids, tp_comparisons) if '_'.join(x.split('_')[0:2]) == '_'.join(y.split('_')[0:2])]

tp_df = pd.DataFrame(data=tp_matches, columns=['image', 'target', 'comparison'])

smote_comparisons, smote_image_ids, smote_target_ids = smote_generator(comparisons, tp_df, threshold=.1)

target_ids_full = []
for i in range(len(tot_image_ids)):
    target_ids_full.extend(target_ids)

print('SMOTE Comparisons (sample):', smote_comparisons[0:5])
print('SMOTE Image IDs (sample):', smote_image_ids[0:5])
print('SMOTE Target IDs (sample):', smote_target_ids[0:5])
print('Target IDs (sample):', target_ids[0:5])

  obs = tp_comparisons[np.random.randint(len(tp_comparisons), size=1)]


SMOTE Comparisons (sample): [[0.9078402928810405, 0.2366236515172559, 0.59241052506270364, 0.13609456431095321], [0.8468269704380365, 0.17294197855905119, 0.71305598612014465, 0.49509115122965469], [0.8525218236236112, 0.16021747086935867, 0.52375570037537722, 0.21530606708857739], [0.9106417536142489, 0.18334070269330527, 0.5680446976159651, 0.17637399187569591], [0.8830379435914336, 0.14190880754039342, 0.12682255484738378, 0.55128924206542529]]
SMOTE Image IDs (sample): ['hh_gator_test3_smote_0', 'hh_obama_test4_smote_1', 'hh_dinosaur_test2_smote_2', 'hh_gator_test1_smote_3', 'hh_gator_test7_smote_4']
SMOTE Target IDs (sample): ['hh_gator', 'hh_obama', 'hh_dinosaur', 'hh_gator', 'hh_gator_v2']
Target IDs (sample): ['hh_airport', 'hh_cajun_navy', 'hh_dinosaur', 'hh_gator', 'hh_gator_v2']


In [8]:
#Put image and true positive image comparison results into a pandas dataframe

df = comparison_results_to_dataframe(comparisons, tot_image_ids_per_comparison, target_ids_full)
df_smote = comparison_results_to_dataframe(smote_comparisons, smote_image_ids, smote_target_ids)
df = df.append(df_smote)
df.head()

Unnamed: 0,image,target,histogram,ssim,tweet_match,gradient_similarity,matches
0,bcf7e41e1ca491b7c576e212851f92398f29844f,hh_airport,0.002966,0.053507,0.104085,0.236734,0
1,bcf7e41e1ca491b7c576e212851f92398f29844f,hh_cajun_navy,0.335413,0.081918,0.22369,0.090165,0
2,bcf7e41e1ca491b7c576e212851f92398f29844f,hh_dinosaur,0.466345,0.025881,0.107029,0.328101,0
3,bcf7e41e1ca491b7c576e212851f92398f29844f,hh_gator,-0.233317,0.020202,0.123971,0.025338,0
4,bcf7e41e1ca491b7c576e212851f92398f29844f,hh_gator_v2,-0.196975,0.013298,0.103958,0.199392,0


In [9]:
#Run the Random Forest model to predict classifications

rf_pred, rf_recall, rf_precision, rf_feature_importances = rf_model(df, 96, 3, 4)

print('Model recall:', rf_recall)
print('Model precision:', rf_precision)
print('Model feature importances:', rf_feature_importances)

Model recall: 0.933333333333
Model precision: 0.973913043478
Model feature importances: [ 0.599908    0.14003671  0.22607083  0.03398446]
