<b> Run the cells below from the folder that contains the unzipped ads16-dataset/ directory </b>

In [296]:
import pandas as pd
import glob
import pathlib
import re

In [297]:
# Show full cell contents instead of truncating long strings.
# pandas >= 1.0 spells "no limit" as None (-1 is deprecated there and rejected
# by newer releases); older releases only accept an int, so fall back to -1
# when None is refused.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

In [298]:
# Global constants
# Both benchmark parts share the same directory layout; only the part number differs.
_g_partRootTemplate = "./ads16-dataset/ADS16_Benchmark_part{0}/ADS16_Benchmark_part{0}/"

# Per-user folders (U0001, U0002, ...) live under Corpus/Corpus/.
g_userPart1PathPrefix = _g_partRootTemplate.format(1) + "Corpus/Corpus/"
g_userPart2PathPrefix = _g_partRootTemplate.format(2) + "Corpus/Corpus/"
# User ids are this prefix followed by a zero-padded 3-digit number.
g_userIdPrefix = "U0"

# Per-category ad image folders live under Ads/Ads/.
g_adsPart1PathPrefix = _g_partRootTemplate.format(1) + "Ads/Ads/"
g_adsPart2PathPrefix = _g_partRootTemplate.format(2) + "Ads/Ads/"

# UDFs

## UDFs for generating Users Dataset

In [299]:
def generate_data_User( pathPrefix, userId ):
    """Merge one user's INF, PREF, IM-POS and IM-NEG CSV files side by side.

    Args:
        pathPrefix: Directory (with trailing separator) that contains the
            per-user folders.
        userId: User identifier; names both the folder and the file prefix.

    Returns:
        DataFrame whose first column is ``UserId`` (filled with ``userId``),
        followed by the columns of the INF, PREF, IM-POS and IM-NEG files in
        that order. The first parsed row of the IM-POS and IM-NEG files is
        dropped before merging.
    """
    userDir = pathPrefix + userId + "/"

    def _read(suffix, skipFirstRow=False):
        # All per-user files are ';'-separated and named <userId><suffix>.
        frame = pd.read_csv(userDir + userId + suffix, delimiter=";")
        if skipFirstRow:
            # Re-number from 0 so the remaining rows line up with INF/PREF.
            frame = frame.iloc[1:].reset_index(drop=True)
        return frame

    user_df = pd.concat([_read("-INF.csv"), _read("-PREF.csv")], axis=1)
    for suffix in ("-IM-POS.csv", "-IM-NEG.csv"):
        user_df = pd.concat([user_df, _read(suffix, skipFirstRow=True)], axis=1)

    # Final positional argument is allow_duplicates=True.
    user_df.insert(0, "UserId", userId, True)
    return user_df

In [300]:
def generate_data_partUsers( usersPartPathPrefix, startRange, endRange ):
    """Build the users table for one benchmark part.

    Args:
        usersPartPathPrefix: Directory (with trailing separator) containing the
            per-user folders of this part.
        startRange: First user number to load (inclusive).
        endRange: User number to stop at (exclusive).

    Returns:
        DataFrame with the rows returned by ``generate_data_User`` for every
        user id in the range, columns in sorted order. ``UserId`` stays a
        regular column. An empty DataFrame is returned for an empty range.
    """
    # Collect the per-user frames first and concatenate once: DataFrame.append
    # was removed in pandas 2.0 and re-copied the whole table on every iteration.
    userFrames = [
        generate_data_User(usersPartPathPrefix, g_userIdPrefix + str(i).zfill(3))
        for i in range(startRange, endRange)
    ]
    if not userFrames:
        # pd.concat refuses an empty list; the old loop returned an empty frame.
        return pd.DataFrame()

    partUsers_df = pd.concat(userFrames, sort=True)
    # concat only sorts columns when the inputs disagree; sort explicitly so the
    # column order matches what append(sort=True) used to produce.
    return partUsers_df.sort_index(axis=1)

In [301]:
def generate_data_allUsers():
    """Build the users table across both benchmark parts.

    User numbers 1-60 are read from the part-1 corpus and 61-120 from the
    part-2 corpus (the range end is exclusive).

    Returns:
        DataFrame with the rows of part 1 followed by the rows of part 2.
    """
    part1Users_df = generate_data_partUsers(g_userPart1PathPrefix, 1, 61)
    part2Users_df = generate_data_partUsers(g_userPart2PathPrefix, 61, 121)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    # sort=True sorts the column union if the two parts' columns differ.
    return pd.concat([part1Users_df, part2Users_df], sort=True)

## UDFs for generating Ads Dataset

In [302]:
def generate_data_adCats():
    """Return the fixed table of the 20 ad categories.

    Returns:
        DataFrame with columns ``AdCatId`` (zero-padded two-character string,
        '01' to '20'), ``AdCatName`` and ``AdCatNumAds``, one row per category
        in id order.
    """
    # (name, number of ads) in category-id order; the id is the 1-based position.
    categories = (
        ("Clothing & Shoes", 16),
        ("Automotive", 15),
        ("Baby Products", 15),
        ("Health & Beauty", 15),
        ("Media (BMVD)", 15),
        ("Consumer Electronics", 15),
        ("Console & Video Games", 15),
        ("DIY & Tools", 15),
        ("Garden & Outdoor living", 15),
        ("Grocery", 15),
        ("Kitchen & Home", 15),
        ("Betting", 15),
        ("Jewellery & Watches", 15),
        ("Musical Instruments", 15),
        ("Office Products", 15),
        ("Pet Supplies", 15),
        ("Computer Software", 15),
        ("Sports & Outdoors", 15),
        ("Toys & Games", 15),
        ("Dating Sites", 15),
    )
    rows = [
        [str(position).zfill(2), name, numAds]
        for position, (name, numAds) in enumerate(categories, start=1)
    ]
    return pd.DataFrame(rows, columns=['AdCatId', 'AdCatName', 'AdCatNumAds'])

In [303]:
import re

def atoi(text):
    """Return ``int(text)`` when ``text`` is all digits, otherwise ``text`` unchanged."""
    if text.isdigit():
        return int(text)
    return text

def natural_keys(text):
    """Sort key that orders file paths by the numbers in their base name.

    Takes the part of ``text`` after the last '/', keeps what precedes the
    first '.', and splits it into digit and non-digit runs; digit runs are
    converted with ``atoi`` so that e.g. '2' sorts before '10'.
    """
    fileName = text.split('/')[-1]
    baseName = fileName.split('.')[0]
    chunks = re.split(r'(\d+)', baseName)
    return [atoi(chunk) for chunk in chunks]

def generate_data_partAds( adsPartPathPrefix, startRange, endRange ):
    """List the ad images of one benchmark part.

    Args:
        adsPartPathPrefix: Directory (with trailing separator) containing one
            numbered sub-folder per ad category.
        startRange: First category number to scan (inclusive).
        endRange: Category number to stop at (exclusive).

    Returns:
        DataFrame with columns ``AdId`` and ``AdFilePath``, one row per
        ``*.png`` file, ordered by category and then naturally by file name.
        ``AdId`` is ``"A" + <2-digit category> + "_" + <2-digit file name>``.
        ``AdId`` is left as a regular column.
    """
    partAdsRows = []

    for i in range(startRange, endRange):
        iStr = str(i)
        # Key on Path.name instead of splitting str(path) on '/', so sorting and
        # the id below do not depend on the OS path separator.
        adsFiles = sorted(
            pathlib.Path(adsPartPathPrefix + iStr + "/").glob("*.png"),
            key=lambda adsFile: natural_keys(adsFile.name),
        )

        for adsFile in adsFiles:
            # Part of the file name before the first '.', e.g. "3" for "3.png".
            adNum = adsFile.name.split('.')[0]
            adId = "A" + iStr.zfill(2) + "_" + adNum.zfill(2)
            partAdsRows.append([adId, str(adsFile)])

    return pd.DataFrame(partAdsRows, columns=['AdId', 'AdFilePath'])


In [304]:
# DEBUG

def generate_data_allAds():
    """Build the ads table across both benchmark parts.

    Categories 1-10 are read from the part-1 ads folder and 11-20 from the
    part-2 ads folder (the range end is exclusive).

    Returns:
        DataFrame indexed by ``AdId`` with the remaining columns of the
        per-part tables, part 1 rows first.
    """
    part1Ads_df = generate_data_partAds(g_adsPart1PathPrefix, 1, 11)
    part2Ads_df = generate_data_partAds(g_adsPart2PathPrefix, 11, 21)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    allAds_df = pd.concat([part1Ads_df, part2Ads_df], sort=True)
    return allAds_df.set_index('AdId')


# Generate datasets

## Generate Users dataset

In [305]:
# Build the users table and key it by UserId in one step.
allUsers_df = generate_data_allUsers().set_index('UserId')

In [306]:
# Preview the first rows to spot-check the merged user columns.
allUsers_df.head()

Unnamed: 0_level_0,Age,Cap/Zip-Code,Countries visited,Fave Sports,Gender,Home country,Home town,Income,Last Name,Most listened musics,...,fave6,fave7,fave8,fave9,unfave1,unfave2,unfave3,unfave4,unfave5,unfave6
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U0001,62,15613,United States of America,I do not like Sports,F,United States of America,Apollo,1,Hidden,"Classical Music, Easy Listening, Jazz",...,,,,,news headlines,homelessness,violence,war,human rights,
U0002,26,60638,"Poland, Mexico, United States of America",Nothing,F,United States of America,Chicago,1,Hidden,"Dance Music, Electronic Music, Indie Pop, Pop (Popular music), Rock",...,,,,,seafood fish,pizza,bacon,bad odor,doglicks,
U0003,22,54942,France,"Team sports (Footbal, Baseball, Rugby, ...)",M,United States of America,Greenville,1,Hidden,"Hip Hop - Rap, Dance Music, Electronic Music, Rock",...,,,,,Ew,Useless,Ice Cream,Play Dog,More Sweets,
U0004,24,NW1 1EU,"United States of America, United Kingdom, Italy, Germany, France, Spain, Netherlands (Holland, Europe), Mexico, Aruba, Bermuda, Portugal","Individual sports‎ (Tennis, Archery, ...)",F,Great Britain,London,1,Hidden,"Alternative Music, Pop (Popular music), Rock",...,,,,,The word Moist,The word Crusty,Gross Toenails,Eye Crust,Nose Picking,
U0005,34,10000,"Russia, Slovakia, Slovenia, China, India, France, Hungary, Italy, Great Britain, Ukraine","Individual sports‎ (Tennis, Archery, ...)",F,Czech Republic,Prague,1,Hidden,"Electronic Music, Asian Pop (J-Pop or K-pop), Rock",...,,,,,Fat people,Hairy bodies,Dirty hair,"Nicab, paranja",Junkies,


In [307]:
# Summarise row count, column dtypes and non-null counts of the users table.
allUsers_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 120 entries, U0001 to U0120
Data columns (total 35 columns):
Age                           120 non-null int64
Cap/Zip-Code                  120 non-null object
Countries visited             120 non-null object
Fave Sports                   120 non-null object
Gender                        120 non-null object
Home country                  120 non-null object
Home town                     120 non-null object
Income                        120 non-null int64
Last Name                     120 non-null object
Most listened musics          120 non-null object
Most read books               120 non-null object
Most visited websites         120 non-null object
Most watched movies           120 non-null object
Most watched tv programmes    120 non-null object
Name                          120 non-null object
Paypal                        120 non-null object
Timepass                      120 non-null object
Type of Job                   120 non-null obj

In [308]:
# Write the users table to the current working directory; index=True keeps the
# UserId index as the first CSV column.
allUsers_df.to_csv("AllUsers.csv", index=True)

## Generate Ads Categories Dataset

In [309]:
# Build the ad-categories table and key it by AdCatId in one step.
adCats_df = generate_data_adCats().set_index('AdCatId')

In [310]:
# Preview the first categories.
adCats_df.head()

Unnamed: 0_level_0,AdCatName,AdCatNumAds
AdCatId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Clothing & Shoes,16
2,Automotive,15
3,Baby Products,15
4,Health & Beauty,15
5,Media (BMVD),15


In [311]:
# Summarise row count and column dtypes of the categories table.
adCats_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20 entries, 01 to 20
Data columns (total 2 columns):
AdCatName      20 non-null object
AdCatNumAds    20 non-null int64
dtypes: int64(1), object(1)
memory usage: 480.0+ bytes


In [312]:
# Write the categories table to the current working directory; index=True keeps
# the AdCatId index as the first CSV column.
adCats_df.to_csv("AdCats.csv", index=True)

## Generate Ads Dataset

In [313]:
# Build the ads table; generate_data_allAds already returns it indexed by AdId.
allAds_df = generate_data_allAds()

In [314]:
# Summarise row count and column dtypes of the ads table.
allAds_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 301 entries, A01_01 to A20_15
Data columns (total 1 columns):
AdFilePath    301 non-null object
dtypes: object(1)
memory usage: 4.7+ KB


In [315]:
# Preview the first ads to spot-check the AdId -> file path mapping.
allAds_df.head()

Unnamed: 0_level_0,AdFilePath
AdId,Unnamed: 1_level_1
A01_01,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/1.png
A01_02,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/2.png
A01_03,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/3.png
A01_04,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/4.png
A01_05,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/5.png


In [316]:
# Write the ads table to the current working directory; index=True keeps the
# AdId index as the first CSV column.
allAds_df.to_csv("AllAds.csv", index=True)

## Generate Ratings Dataset - TODO

In [205]:
# X1 Users => [UserId, .....] --> 36 cols
# X2 Ads => [AdId, .....] --> 2 cols
# Y Ratings => ~40 cols [UserId, ......, AdId, ......., Rating] ==> 120 * 300 rows

# Scratchpad

## RT

In [117]:
# Ratings file of user U0001. The paths are built from the shared part-1 prefix
# so this cell runs from the same working directory as the rest of the notebook
# (the folder that contains ads16-dataset/).
rtDir = g_userPart1PathPrefix + "U0001/"
rtRawPath = rtDir + "U0001-RT.csv"
rtCleanPath = rtDir + "U0001-RT-NEW.csv"

# Strip every double quote and save the result next to the original file.
with open(rtRawPath) as file:
    data = file.read().replace("\"", "")

with open(rtCleanPath, "w") as file:
    file.write(data)

# Column labels "0".."299" for the parsed values.
my_cols = [str(i) for i in range(300)]
# The regex separator (';' or ',') requires the python parsing engine.
data3 = pd.read_csv(rtCleanPath,
                    sep=";|,",
                    names=my_cols,
                    header=None,
                    engine="python")

In [119]:
# Drop the first two parsed rows (positional slice).
# NOTE(review): presumably these are non-rating header lines of the RT file — confirm.
# Not idempotent: re-running this cell drops two more rows.
data3 = data3.iloc[2:]

In [120]:
# Re-number the rows from 0, discarding the old index.
data3 = data3.reset_index(drop=True)

In [98]:
# Remove the column display limit so wide frames are rendered with every column.
pd.set_option('display.max_columns', None)


In [121]:
# Cast columns "0".."19" to float64 in a single astype call.
# NOTE(review): only the first 20 of the 300 columns are cast — confirm this is intended.
data3 = data3.astype({str(colNum): 'float64' for colNum in range(20)})

In [122]:
# Preview the parsed rows.
data3.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,1.0,3.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,3.0,4.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,3.0,3.0,2.0,3.0,3.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,2.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,3.0,1.0,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [124]:
# Swap rows and columns so each original column label becomes a row.
# Not idempotent: re-running this cell flips the frame back.
data3 = data3.T

In [131]:
# Display the transposed frame (pandas abbreviates long frames in the output).
data3

Unnamed: 0,0
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0
...,...
295,1.0
296,1.0
297,1.0
298,1.0
