# Necessary Installs

# Imports

In [None]:
from pathlib import Path
from fastcore.all import *
import numpy as np
import pandas as pd
from pprint import pprint
from PIL import Image
import matplotlib.pyplot as plt
import pdb
from collections import OrderedDict, defaultdict

In [None]:
#Display the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell  
InteractiveShell.ast_node_interactivity = "all"

In [None]:
#Dataset root; both benchmark parts repeat their folder name one level down
main = Path("../input/ads16-dataset")
part1 = main.joinpath("ADS16_Benchmark_part1", "ADS16_Benchmark_part1")
part2 = main.joinpath("ADS16_Benchmark_part2", "ADS16_Benchmark_part2")

#Inside each part, "Ads" and "Corpus" are nested the same way
ads_p1, corpus_p1 = part1.joinpath("Ads", "Ads"), part1.joinpath("Corpus", "Corpus")
ads_p2, corpus_p2 = part2.joinpath("Ads", "Ads"), part2.joinpath("Corpus", "Corpus")

#Merge the listings of both parts into one sorted list per kind
ads_l = sorted([*ads_p1.ls(), *ads_p2.ls()])
corpus_l = sorted([*corpus_p1.ls(), *corpus_p2.ls()])

In [None]:
#Print contents of Ads directory
print("[INFO] Contents of the \"Ads\" directory:")
#Bare expression: relies on the notebook to display the list of folder paths
ads_l

In [None]:
#Print contents of Corpus directory
print("[INFO] Contents of the \"Corpus\" directory:")
#pprint puts each folder path on its own line
pprint(corpus_l)

# Exploring Directory Contents

In [None]:
def image_grid(image_l, im_height, im_width, n_rows, n_cols):
    """
    Paste images into an n_rows x n_cols contact sheet and display it with matplotlib.

    Cells are filled column by column (top to bottom, then left to right). Each
    image is shrunk, keeping its aspect ratio, to fit inside an
    im_width x im_height cell. When there are fewer images than cells the
    remaining cells keep the canvas background; extra images are ignored.

    image_l   : sequence of image file paths
    im_height : cell height in pixels
    im_width  : cell width in pixels
    n_rows    : number of grid rows
    n_cols    : number of grid columns

    Returns nothing; the figure is drawn as a side effect.
    """

    #Define grid dimensions
    grid_height = im_height * n_rows
    grid_width = im_width * n_cols

    #Define figure size dimensions
    fig_height = int(5 * n_rows)
    fig_width = int(5 * n_cols)

    #Create a new image canvas
    comp_img = Image.new('RGB', (grid_width, grid_height))

    #Top-left corner of every cell, in column-major order
    cell_origins = [
        (x, y)
        for x in range(0, grid_width, im_width)
        for y in range(0, grid_height, im_height)
    ]

    #zip stops at the shorter sequence, so a short image list does not raise IndexError
    for image_path, origin in zip(image_l, cell_origins):

        #The context manager closes the file once its pixels have been pasted
        with Image.open(image_path) as im:

            #PIL sizes are (width, height)
            im.thumbnail((im_width, im_height))

            #Paste it on the canvas
            comp_img.paste(im, origin)

    #Display the canvas
    fig, ax = plt.subplots(figsize=(fig_width, fig_height))
    ax.imshow(np.asarray(comp_img))

In [None]:
#Use the first Ads folder and the first Corpus folder of the sorted lists as samples
sample_ad = ads_l[0]
sample_corpus = corpus_l[0]
print("[INFO] Contents of sample \"Ads\" folder:")
#Bare expressions: rely on the notebook to display the folder listings
sample_ad.ls()
print("[INFO] Contents of sample \"Corpus\" folder:")
sample_corpus.ls()

In [None]:
#Show the first three entries of the sample Ads folder in a 1x3 grid of 300px cells
image_grid(image_l = sample_ad.ls(), im_height = 300, im_width = 300, n_rows = 1, n_cols = 3)

In [None]:
#Preview every CSV file of the sample Corpus folder (at most 25 rows each)
for file_path in sample_corpus.ls():

    #Anything that is not a CSV file (e.g. the image folders) is skipped
    if file_path.suffix != ".csv":
        continue

    #The files are semicolon-separated with a single header row
    df_temp = pd.read_csv(file_path, sep=";", header=[0], nrows=25)
    print(f"\n[INFO] Printing some contents of {file_path.name}: \n")
    df_temp.head(25)

In [None]:
#Inspect the contents of the -IM-NEG & -IM-POS folders
#NOTE(review): the "U0001" prefix is hard-coded, so these paths presumably only exist
#while sample_corpus is user U0001's folder - confirm
im_neg = sample_corpus/"U0001-IM-NEG"
im_pos = sample_corpus/"U0001-IM-POS"

print(f"\n[INFO] Printing contents of folder {im_neg.name}\n")
#Lay the folder's images out in a 1x5 grid of 200px cells
image_grid(image_l = im_neg.ls(), im_height = 200, im_width = 200, n_rows = 1, n_cols = 5)

In [None]:
#Same 1x5 grid of 200px cells, this time for the -IM-POS folder
print(f"\n[INFO] Printing contents of folder {im_pos.name}\n")
image_grid(image_l = im_pos.ls(), im_height = 200, im_width = 200, n_rows = 1, n_cols = 5)

# Interlude: Thoughts on Data Folder Structure

1. There are 20 Ad categories —> 300 Ads and 120 Users in total.
2. Each Ad Category folder contains a collection of 15 image files. Paths to these 20 folders are contained in `ads_l` in the notebook. An Ad Category is considered to be "clicked" if it contains an advert that is "clicked" (Refer Point 9)
3. 300 Ads —> 100 Rich Media Ads, 100 Image Ads and 100 Text Ads.
4. Each User folder (e.g. U0001) contains 6 CSV files and 2 folders. Paths to the user folders are contained in `corpus_l`.
5. (CSV #1) U0001-INF.csv (14 cols, 1 row) contains personal information (e.g. Gender, Age, Income...).
6. (CSV #2) U0001-PREF.csv (5 cols, 1 row) contains preferences (e.g. Most visited websites, most read books...). Each field is a comma-separated list of categories (e.g. Comedy, Horror, Mystery...).
7. (CSV #3) U0001-B5.csv (3 cols, 10 rows) contains answers to the Big Five Inventory-10 personality test.
8. (CSV #4 & #5) Both U0001-IM-POS.csv & U0001-IM-NEG.csv (5 cols, 2 rows each) refer to the contents of the respective folders and the user's reactions to them (e.g. "my cats" for an image in POS and "violence" for an image in NEG).
9. (CSV #6) U0001-RT.csv (20 cols, 2 rows) contains the user's rating for each ad in each ad category on a Likert scale ranging from +1 to +5. Ratings of +4 and +5 correspond to a "click" in the paper.

# Collecting Functions

Each function corresponds to a particular CSV file in a User Folder.

For a given user, each function accepts a DataFrame and returns a pandas Series.

In [None]:
#For INF, PREF
def df_to_series(df):

    """
    Collapse a one-row dataframe into a Pandas Series indexed by its column names.

    A frame with more than one row has nothing to squeeze along the row axis
    and comes back as a (copied) dataframe.
    """

    #Squeeze a copy along the row axis; the caller's frame is never touched
    return df.copy().squeeze(axis=0)

In [None]:
#For B5
def b5_df_to_ocean_series(df_b5):

    """
    Turn a BFI-10 answer sheet into a Pandas Series of Big Five scale scores.

    BFI-10 scoring key (R = the item is reverse-scored):
    Extraversion: 1R, 6; Agreeableness: 2, 7R; Conscientiousness: 3R, 8;
    Neuroticism: 4R, 9; Openness: 5R, 10

    The input frame is not modified. Rows are addressed by position, so the
    questions are assumed to appear in questionnaire order, with answers stored
    on a -2..+2 scale - TODO confirm against the raw CSV.
    """

    #Keep the answers only (drop returns a new frame) and shift them by 3 onto the 1..5 range
    answers = df_b5.drop(columns=["Question#"]) + 3

    #Zero-based rows of the reverse-scored items (1, 3, 4, 5 and 7): mirror them within 1..5
    reversed_rows = [0, 2, 3, 4, 6]
    answers.iloc[reversed_rows, :] = 6 - answers.iloc[reversed_rows, :]

    #Zero-based rows of the two items that make up each scale
    scale_rows = (
        ("E_score", [0, 5]),
        ("A_score", [1, 6]),
        ("C_score", [2, 7]),
        ("N_score", [3, 8]),
        ("O_score", [4, 9]),
    )

    #Each scale score is the sum of its two item scores
    ocean_scores = {}
    for scale, rows in scale_rows:
        ocean_scores[scale] = answers.iloc[rows, :].squeeze(1).sum()

    return pd.Series(ocean_scores)

In [None]:
#For IM-POS & IM-NEG
def pos_neg_df_to_series(df):

    """
    Flatten a two-row dataframe into a Pandas Series with two entries per column.

    For every column <col>, the value in the row labelled 0 is stored under
    "<col>_location" and the value in the row labelled 1 under "<col>_reason".
    """

    flattened = {}

    #Row 0 —> Paths, Row 1 —> Some strings
    for col in df.columns.tolist():
        flattened[col + "_location"] = df.loc[0, col]
        flattened[col + "_reason"] = df.loc[1, col]

    return pd.Series(flattened)

In [None]:
#For RT
def rt_df_to_series(df):

    """
    Flatten a ratings dataframe into a numeric Pandas Series.

    Only the row labelled 1 is read. Each of its cells is expected to hold the
    comma-separated ratings of one category; the n-th rating found in the m-th
    column becomes the entry "Cat<m>_<n>" (both counted from 1).

    Returns a Series indexed by those names and converted with pd.to_numeric.
    A Series is returned even when there is only a single rating. The input
    frame is not modified.
    """

    ratings = {}

    for cat_num, cat in enumerate(df.columns, start=1):

        #str() keeps the split working when pandas already parsed the cell as a number
        responses = str(df.loc[1, cat]).split(",")

        for ad_num, response in enumerate(responses, start=1):
            ratings[f"Cat{cat_num}_{ad_num}"] = response.strip()

    #Build the Series in one go instead of growing a dataframe one column at a time
    rt_series = pd.to_numeric(pd.Series(ratings, dtype=object))

    return rt_series

In [None]:
#Build one wide row per Corpus folder by flattening each of its CSV files
user_rows = []

for corpus in corpus_l:

    result_d = OrderedDict()

    for file_path in corpus.ls():

        #Only the CSV files are parsed; everything else in the folder is skipped
        if file_path.suffix != ".csv":
            continue

        df_temp = pd.read_csv(file_path, sep=";", header=[0])

        #Pick the parser from the tag found in the file name (substring match)
        if ("INF" in file_path.name) or ("PREF" in file_path.name):
            result = df_to_series(df_temp)
        elif "B5" in file_path.name:
            result = b5_df_to_ocean_series(df_temp)
        elif "IM" in file_path.name:
            #NOTE(review): both IM CSVs go through the same function; if they share
            #column names the later file overwrites the earlier one's keys - confirm
            result = pos_neg_df_to_series(df_temp)
        elif "RT" in file_path.name:
            result = rt_df_to_series(df_temp)
        else:
            #No parser for this file: skip it rather than reuse the previous file's result
            print(f"{file_path.name} not recognized")
            continue

        result_d.update(result.to_dict())

    user_rows.append(result_d)

#One constructor call replaces the per-row DataFrame.append (removed in pandas 2.0);
#the dicts are aligned by key and the rows are numbered 0..n-1
df = pd.DataFrame(user_rows)

In [None]:
#List every column of the combined frame, then preview its first rows
df.columns.tolist()
df.head()