<b> Run the cells below from the folder that contains the unzipped ads16-dataset/ directory </b>

In [1]:
import pandas as pd
import glob
import pathlib
import re

In [2]:
# Show full cell contents (long free-text answers, ad file paths) instead of truncating them.
# None means "no limit"; the legacy -1 sentinel is deprecated since pandas 1.0 and no longer
# accepted by pandas 2.x.
pd.set_option('display.max_colwidth', None)

In [3]:
# Global constants
# All locations are relative to the directory the notebook is run from and end with "/",
# because the loaders below build file paths by plain string concatenation.

# Per-user folders (one sub-folder per user id) for each half of the benchmark.
g_userPart1PathPrefix = (
    "./ads16-dataset/"
    "ADS16_Benchmark_part1/ADS16_Benchmark_part1/"
    "Corpus/Corpus/"
)
g_userPart2PathPrefix = (
    "./ads16-dataset/"
    "ADS16_Benchmark_part2/ADS16_Benchmark_part2/"
    "Corpus/Corpus/"
)

# Ad image folders (one numbered sub-folder per ad category) for each half of the benchmark.
g_adsPart1PathPrefix = (
    "./ads16-dataset/"
    "ADS16_Benchmark_part1/ADS16_Benchmark_part1/"
    "Ads/Ads/"
)
g_adsPart2PathPrefix = (
    "./ads16-dataset/"
    "ADS16_Benchmark_part2/ADS16_Benchmark_part2/"
    "Ads/Ads/"
)

# User ids are this prefix followed by the user number zero-padded to 3 digits (e.g. "U0001").
g_userIdPrefix = "U0"

# UDFs

## UDFs for generating Users Dataset

In [4]:
def generate_data_User(pathPrefix, userId):
    """Combine one user's four ';'-delimited CSV files into a single frame.

    Reads ``<userId>-INF.csv``, ``<userId>-PREF.csv``, ``<userId>-IM-POS.csv`` and
    ``<userId>-IM-NEG.csv`` from ``pathPrefix + userId + "/"`` and places their
    columns side by side (aligned on row position).

    Parameters
    ----------
    pathPrefix : str
        Folder containing the per-user sub-folders; must end with "/".
    userId : str
        User folder name, also used as the file-name prefix (e.g. "U0001").

    Returns
    -------
    pandas.DataFrame
        Columns of INF, PREF, IM-POS, IM-NEG in that order, preceded by a
        constant ``UserId`` column.
    """
    userDir = pathPrefix + userId + "/"

    def readUserCsv(suffix):
        # Every per-user file is named <userId><suffix> and uses ';' as delimiter.
        return pd.read_csv(userDir + userId + suffix, delimiter=";")

    # INF and PREF are used exactly as read.
    user_df = pd.concat([readUserCsv("-INF.csv"), readUserCsv("-PREF.csv")], axis=1)

    # IM-POS then IM-NEG: the first data row is discarded and the remaining rows are
    # renumbered from 0 so they line up with the INF/PREF rows in the column-wise concat.
    # NOTE(review): presumably that first row is not real data - confirm against the files.
    for suffix in ("-IM-POS.csv", "-IM-NEG.csv"):
        extra_df = readUserCsv(suffix).iloc[1:].reset_index(drop=True)
        user_df = pd.concat([user_df, extra_df], axis=1)

    # UserId goes first; allow_duplicates=True tolerates an existing column of that name.
    user_df.insert(0, "UserId", userId, allow_duplicates=True)

    return user_df

In [5]:
def generate_data_partUsers( usersPartPathPrefix, startRange, endRange ):
    """Load and stack the user frames for user numbers ``startRange .. endRange - 1``.

    Each user id is ``g_userIdPrefix`` followed by the number zero-padded to three
    digits; the per-user frame comes from ``generate_data_User``.

    Parameters
    ----------
    usersPartPathPrefix : str
        Folder holding the per-user sub-folders for this part of the benchmark.
    startRange, endRange : int
        Half-open range of user numbers (``endRange`` is excluded).

    Returns
    -------
    pandas.DataFrame
        All user frames stacked row-wise with columns sorted by name. Row labels
        are kept from the per-user frames (not renumbered) and ``UserId`` stays
        an ordinary column. An empty range yields an empty frame.
    """
    # Collect first and concatenate once: DataFrame.append was removed in pandas 2.0,
    # and growing a frame inside the loop copies all previous rows on every pass.
    userFrames = [
        generate_data_User(usersPartPathPrefix, g_userIdPrefix + str(i).zfill(3))
        for i in range(startRange, endRange)
    ]

    # pd.concat raises on an empty list; keep the historical "empty frame" result.
    if not userFrames:
        return pd.DataFrame()

    return pd.concat(userFrames, sort=True)

In [6]:
def generate_data_allUsers():
    """Load every user of the benchmark into one frame.

    Users 1-60 are read from ``g_userPart1PathPrefix`` and users 61-120 from
    ``g_userPart2PathPrefix`` (the range end passed to ``generate_data_partUsers``
    is exclusive).

    Returns
    -------
    pandas.DataFrame
        Part 1 rows followed by part 2 rows, columns sorted by name.
    """
    # pd.concat replaces the chained DataFrame.append calls (removed in pandas 2.0).
    partFrames = [
        generate_data_partUsers(g_userPart1PathPrefix, 1, 61),
        generate_data_partUsers(g_userPart2PathPrefix, 61, 121),
    ]
    return pd.concat(partFrames, sort=True)

## UDFs for generating Ads Dataset

In [7]:
def generate_data_adCats():
    """Build the static ad-category lookup table.

    Returns
    -------
    pandas.DataFrame
        One row per category with columns ``AdCatId`` (two-character string),
        ``AdCatName`` and ``AdCatNumAds`` (integer), in id order.
    """
    # (id, name, number of ads) - hard-coded, nothing is read from disk here.
    categories = (
        ('01', "Clothing & Shoes", 16),
        ('02', "Automotive", 15),
        ('03', "Baby Products", 15),
        ('04', "Health & Beauty", 15),
        ('05', "Media (BMVD)", 15),
        ('06', "Consumer Electronics", 15),
        ('07', "Console & Video Games", 15),
        ('08', "DIY & Tools", 15),
        ('09', "Garden & Outdoor living", 15),
        ('10', "Grocery", 15),
        ('11', "Kitchen & Home", 15),
        ('12', "Betting", 15),
        ('13', "Jewellery & Watches", 15),
        ('14', "Musical Instruments", 15),
        ('15', "Office Products", 15),
        ('16', "Pet Supplies", 15),
        ('17', "Computer Software", 15),
        ('18', "Sports & Outdoors", 15),
        ('19', "Toys & Games", 15),
        ('20', "Dating Sites", 15),
    )

    # Turn the row tuples into three parallel columns.
    catIds, catNames, catNumAds = zip(*categories)
    return pd.DataFrame({
        'AdCatId': list(catIds),
        'AdCatName': list(catNames),
        'AdCatNumAds': list(catNumAds),
    })

In [8]:
import re

def atoi(text):
    """Return ``int(text)`` when ``text`` consists only of decimal digits, else ``text`` unchanged.

    ``str.isdecimal`` is used rather than ``str.isdigit``: the latter is also true for
    characters such as superscript digits ('\u00b2'), which ``int()`` rejects with
    ``ValueError``. The empty string is returned as-is.
    """
    return int(text) if text.isdecimal() else text

def natural_keys(text):
    """Sort key that orders file paths by file name with embedded numbers compared numerically.

    Only the last path component, truncated at its first '.', is considered. That
    stem is split into alternating non-digit / digit runs and the digit runs are
    converted with ``atoi``, so "2.png" sorts before "10.png".

    Both '/' and '\\' are treated as path separators so that native Windows paths
    (as produced by ``str(pathlib.Path(...))``) are reduced to the file name too.
    """
    fileName = re.split(r'[\\/]', text)[-1]
    stem = fileName.split('.')[0]
    return [ atoi(piece) for piece in re.split(r'(\d+)', stem) ]

def generate_data_partAds( adsPartPathPrefix, startRange, endRange ):
    """List the ``*.png`` ads found in category folders ``startRange .. endRange - 1``.

    For each category number ``i`` the folder ``adsPartPathPrefix + str(i) + "/"``
    is scanned; files are ordered naturally by file name (2.png before 10.png).

    Parameters
    ----------
    adsPartPathPrefix : str
        Folder that contains the numbered category sub-folders; must end with "/".
    startRange, endRange : int
        Half-open range of category numbers (``endRange`` is excluded).

    Returns
    -------
    pandas.DataFrame
        Columns ``AdId`` ("A<category, 2 digits>_<file stem, padded to 2>") and
        ``AdFilePath`` (``str`` of the matched path), with a default integer index.
    """
    partAdsRows = []

    for i in range(startRange, endRange):
        catNum = str(i)
        # Sort on Path.name (the bare file name) so ordering and the id below do not
        # depend on which path separator the operating system uses.
        adsFiles = sorted(
            pathlib.Path(adsPartPathPrefix + catNum + "/").glob("*.png"),
            key=lambda adsFile: natural_keys(adsFile.name),
        )

        for adsFile in adsFiles:
            # File stem up to the first '.', e.g. "7" for "7.png".
            adNum = adsFile.name.split('.')[0]
            adId = "A" + catNum.zfill(2) + "_" + adNum.zfill(2)
            partAdsRows.append([adId, str(adsFile)])

    return pd.DataFrame(partAdsRows, columns=['AdId', 'AdFilePath'])


In [9]:
# DEBUG

def generate_data_allAds():
    """Load every ad of the benchmark into one frame indexed by ``AdId``.

    Category folders 1-10 are read from ``g_adsPart1PathPrefix`` and 11-20 from
    ``g_adsPart2PathPrefix`` (the range end passed to ``generate_data_partAds``
    is exclusive).

    Returns
    -------
    pandas.DataFrame
        Part 1 rows followed by part 2 rows, with ``AdId`` as the index.
    """
    # pd.concat replaces the chained DataFrame.append calls (removed in pandas 2.0).
    partFrames = [
        generate_data_partAds(g_adsPart1PathPrefix, 1, 11),
        generate_data_partAds(g_adsPart2PathPrefix, 11, 21),
    ]
    return pd.concat(partFrames, sort=True).set_index('AdId')


## UDFs for generating Ratings Dataset

In [10]:
def df_crossjoin(df1, df2):
    """Return the Cartesian product of the rows of ``df1`` and ``df2``.

    Every row of ``df1`` is paired with every row of ``df2`` (``df1`` order is the
    outer loop). Column names present in both inputs get pandas' default
    ``_x`` / ``_y`` merge suffixes.

    The inputs are not modified: the constant join key is added to temporary
    copies only. This also makes ``df_crossjoin(df, df)`` work, and nothing is
    left behind on the caller's frames if the merge raises.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to combine. A column literally named ``_tmpkey`` in either input
        is used as the join key and therefore does not appear in the result.

    Returns
    -------
    pandas.DataFrame
        ``len(df1) * len(df2)`` rows, indexed by a two-level MultiIndex built from
        ``df1.index`` x ``df2.index``.
    """
    # Same constant key on both sides turns an ordinary merge into a cross join.
    left = df1.assign(_tmpkey=1)
    right = df2.assign(_tmpkey=1)

    res = pd.merge(left, right, on='_tmpkey').drop('_tmpkey', axis=1)
    # The merge result is numbered 0..n-1; relabel rows with the (df1, df2) index pairs.
    res.index = pd.MultiIndex.from_product((df1.index, df2.index))

    return res

In [11]:
# TODO: Move this UDF to top UDFs section
def generate_data_RatingsPerUser( pathPrefix, userId ):
    """Read one user's ``<userId>-RT.csv`` and return it transposed (one row per file column).

    Processing steps, in order:

    1. read the file as text and strip every double quote;
    2. parse it with both ';' and ',' as separators into 300 columns named
       "0" .. "299" (shorter lines are padded with NaN);
    3. drop the first two parsed rows and renumber the rest from 0
       (NOTE(review): presumably header lines - confirm against the dataset);
    4. cast columns "0" .. "19" to float64;
    5. transpose, so the column names "0" .. "299" become the row labels.

    The cleaned text is parsed from memory; no ``<userId>-RT-NEW.csv`` scratch file
    is written into the dataset folder any more, so the loader also works on a
    read-only dataset and leaves no artifacts behind.

    Parameters
    ----------
    pathPrefix : str
        Folder containing the per-user sub-folders; must end with "/".
    userId : str
        User folder name, also used as the file-name prefix.

    Returns
    -------
    pandas.DataFrame
        300 rows labelled "0" .. "299"; one column per remaining input row.
    """
    import io  # local import: only this function needs it

    rtPath = pathPrefix + userId + "/" + userId + "-RT.csv"

    # Quotes are removed up front so a single regex separator can split every field.
    with open(rtPath) as file:
        data = file.read().replace("\"", "")

    my_cols = [str(i) for i in range(300)]
    # A regex separator requires the python parsing engine.
    ratings_df = pd.read_csv(io.StringIO(data), sep=";|,", names=my_cols, header=None, engine="python")
    ratings_df = ratings_df.iloc[2:].reset_index(drop=True)

    # Cast in one call instead of assigning column by column into a sliced frame.
    ratings_df = ratings_df.astype({str(i): 'float64' for i in range(20)})

    return ratings_df.transpose()

In [12]:
def generate_data_RatingsPartUsers( usersPartPathPrefix, startRange, endRange ):
    """Load and stack the rating frames for user numbers ``startRange .. endRange - 1``.

    Each user id is ``g_userIdPrefix`` followed by the number zero-padded to three
    digits; the per-user frame comes from ``generate_data_RatingsPerUser``.

    Parameters
    ----------
    usersPartPathPrefix : str
        Folder holding the per-user sub-folders for this part of the benchmark.
    startRange, endRange : int
        Half-open range of user numbers (``endRange`` is excluded).

    Returns
    -------
    pandas.DataFrame
        Per-user frames stacked row-wise in user order, columns sorted. Row labels
        are kept from the per-user frames, so they repeat for every user. An empty
        range yields an empty frame.
    """
    # Collect first and concatenate once: DataFrame.append was removed in pandas 2.0,
    # and growing a frame inside the loop copies all previous rows on every pass.
    ratingFrames = [
        generate_data_RatingsPerUser(usersPartPathPrefix, g_userIdPrefix + str(i).zfill(3))
        for i in range(startRange, endRange)
    ]

    # pd.concat raises on an empty list; keep the historical "empty frame" result.
    if not ratingFrames:
        return pd.DataFrame()

    return pd.concat(ratingFrames, sort=True)

In [13]:
def generate_data_RatingsAllUsers():
    """Load the ratings of every user of the benchmark into one frame.

    Users 1-60 are read from ``g_userPart1PathPrefix`` and users 61-120 from
    ``g_userPart2PathPrefix`` (the range end passed to
    ``generate_data_RatingsPartUsers`` is exclusive).

    Returns
    -------
    pandas.DataFrame
        Part 1 rows followed by part 2 rows, columns sorted; row labels are kept
        from the per-part frames.
    """
    # pd.concat replaces the chained DataFrame.append calls (removed in pandas 2.0).
    partFrames = [
        generate_data_RatingsPartUsers(g_userPart1PathPrefix, 1, 61),
        generate_data_RatingsPartUsers(g_userPart2PathPrefix, 61, 121),
    ]
    return pd.concat(partFrames, sort=True)

# Generate datasets

## Generate Users dataset

In [14]:
# Load all users and key the table by UserId in one step.
allUsers_df = generate_data_allUsers().set_index('UserId')

In [15]:
allUsers_df.head()

Unnamed: 0_level_0,Age,Cap/Zip-Code,Countries visited,Fave Sports,Gender,Home country,Home town,Income,Last Name,Most listened musics,...,fave6,fave7,fave8,fave9,unfave1,unfave2,unfave3,unfave4,unfave5,unfave6
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U0001,62,15613,United States of America,I do not like Sports,F,United States of America,Apollo,1,Hidden,"Classical Music, Easy Listening, Jazz",...,,,,,news headlines,homelessness,violence,war,human rights,
U0002,26,60638,"Poland, Mexico, United States of America",Nothing,F,United States of America,Chicago,1,Hidden,"Dance Music, Electronic Music, Indie Pop, Pop (Popular music), Rock",...,,,,,seafood fish,pizza,bacon,bad odor,doglicks,
U0003,22,54942,France,"Team sports (Footbal, Baseball, Rugby, ...)",M,United States of America,Greenville,1,Hidden,"Hip Hop - Rap, Dance Music, Electronic Music, Rock",...,,,,,Ew,Useless,Ice Cream,Play Dog,More Sweets,
U0004,24,NW1 1EU,"United States of America, United Kingdom, Italy, Germany, France, Spain, Netherlands (Holland, Europe), Mexico, Aruba, Bermuda, Portugal","Individual sports‎ (Tennis, Archery, ...)",F,Great Britain,London,1,Hidden,"Alternative Music, Pop (Popular music), Rock",...,,,,,The word Moist,The word Crusty,Gross Toenails,Eye Crust,Nose Picking,
U0005,34,10000,"Russia, Slovakia, Slovenia, China, India, France, Hungary, Italy, Great Britain, Ukraine","Individual sports‎ (Tennis, Archery, ...)",F,Czech Republic,Prague,1,Hidden,"Electronic Music, Asian Pop (J-Pop or K-pop), Rock",...,,,,,Fat people,Hairy bodies,Dirty hair,"Nicab, paranja",Junkies,


In [16]:
allUsers_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 120 entries, U0001 to U0120
Data columns (total 35 columns):
Age                           120 non-null int64
Cap/Zip-Code                  120 non-null object
Countries visited             120 non-null object
Fave Sports                   120 non-null object
Gender                        120 non-null object
Home country                  120 non-null object
Home town                     120 non-null object
Income                        120 non-null int64
Last Name                     120 non-null object
Most listened musics          120 non-null object
Most read books               120 non-null object
Most visited websites         120 non-null object
Most watched movies           120 non-null object
Most watched tv programmes    120 non-null object
Name                          120 non-null object
Paypal                        120 non-null object
Timepass                      120 non-null object
Type of Job                   120 non-null obj

In [17]:
allUsers_df.to_csv("AllUsers.csv", index=True)

## Generate Ads Categories Dataset

In [18]:
# Build the category lookup table and key it by AdCatId in one step.
adCats_df = generate_data_adCats().set_index('AdCatId')

In [19]:
adCats_df.head()

Unnamed: 0_level_0,AdCatName,AdCatNumAds
AdCatId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Clothing & Shoes,16
2,Automotive,15
3,Baby Products,15
4,Health & Beauty,15
5,Media (BMVD),15


In [20]:
adCats_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20 entries, 01 to 20
Data columns (total 2 columns):
AdCatName      20 non-null object
AdCatNumAds    20 non-null int64
dtypes: int64(1), object(1)
memory usage: 480.0+ bytes


In [21]:
adCats_df.to_csv("AdCats.csv", index=True)

## Generate Ads Dataset

In [22]:
allAds_df = generate_data_allAds()

In [23]:
allAds_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 300 entries, A01_01 to A20_15
Data columns (total 1 columns):
AdFilePath    300 non-null object
dtypes: object(1)
memory usage: 4.7+ KB


In [24]:
allAds_df.head()

Unnamed: 0_level_0,AdFilePath
AdId,Unnamed: 1_level_1
A01_01,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/1.png
A01_02,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/2.png
A01_03,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/3.png
A01_04,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/4.png
A01_05,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/5.png


In [25]:
allAds_df.to_csv("AllAds.csv", index=True)

## Generate Users\*Ads Dataset

In [26]:
allUsers_And_Ads_df = df_crossjoin(allUsers_df, allAds_df)

In [27]:
allUsers_And_Ads_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 36000 entries, (U0001, A01_01) to (U0120, A20_15)
Data columns (total 36 columns):
Age                           36000 non-null int64
Cap/Zip-Code                  36000 non-null object
Countries visited             36000 non-null object
Fave Sports                   36000 non-null object
Gender                        36000 non-null object
Home country                  36000 non-null object
Home town                     36000 non-null object
Income                        36000 non-null int64
Last Name                     36000 non-null object
Most listened musics          36000 non-null object
Most read books               36000 non-null object
Most visited websites         36000 non-null object
Most watched movies           36000 non-null object
Most watched tv programmes    36000 non-null object
Name                          36000 non-null object
Paypal                        36000 non-null object
Timepass                      36000 n

In [28]:
# Turn the two index levels of the cross join into ordinary columns. When the levels
# are unnamed, reset_index calls them level_0 / level_1; give them their real names.
allUsers_And_Ads_df = allUsers_And_Ads_df.reset_index().rename(
    columns={'level_0': 'UserId', 'level_1': 'AdId'}
)
# NOTE(review): 302 rows presumably chosen to show where the first user's ads end
# and the second user's begin - confirm.
allUsers_And_Ads_df.head(302)

Unnamed: 0,UserId,AdId,Age,Cap/Zip-Code,Countries visited,Fave Sports,Gender,Home country,Home town,Income,...,fave7,fave8,fave9,unfave1,unfave2,unfave3,unfave4,unfave5,unfave6,AdFilePath
0,U0001,A01_01,62,15613,United States of America,I do not like Sports,F,United States of America,Apollo,1,...,,,,news headlines,homelessness,violence,war,human rights,,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/1.png
1,U0001,A01_02,62,15613,United States of America,I do not like Sports,F,United States of America,Apollo,1,...,,,,news headlines,homelessness,violence,war,human rights,,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/2.png
2,U0001,A01_03,62,15613,United States of America,I do not like Sports,F,United States of America,Apollo,1,...,,,,news headlines,homelessness,violence,war,human rights,,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/3.png
3,U0001,A01_04,62,15613,United States of America,I do not like Sports,F,United States of America,Apollo,1,...,,,,news headlines,homelessness,violence,war,human rights,,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/4.png
4,U0001,A01_05,62,15613,United States of America,I do not like Sports,F,United States of America,Apollo,1,...,,,,news headlines,homelessness,violence,war,human rights,,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/5.png
5,U0001,A01_06,62,15613,United States of America,I do not like Sports,F,United States of America,Apollo,1,...,,,,news headlines,homelessness,violence,war,human rights,,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/6.png
6,U0001,A01_07,62,15613,United States of America,I do not like Sports,F,United States of America,Apollo,1,...,,,,news headlines,homelessness,violence,war,human rights,,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/7.png
7,U0001,A01_08,62,15613,United States of America,I do not like Sports,F,United States of America,Apollo,1,...,,,,news headlines,homelessness,violence,war,human rights,,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/8.png
8,U0001,A01_09,62,15613,United States of America,I do not like Sports,F,United States of America,Apollo,1,...,,,,news headlines,homelessness,violence,war,human rights,,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/9.png
9,U0001,A01_10,62,15613,United States of America,I do not like Sports,F,United States of America,Apollo,1,...,,,,news headlines,homelessness,violence,war,human rights,,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/10.png


In [29]:
allUsers_And_Ads_df.to_csv("AllUsers_And_Ads.csv", index=False)

## Generate UsersRatings Dataset

In [30]:
allUsersRatings_df = generate_data_RatingsAllUsers()

In [31]:
allUsersRatings_df.rename(columns={0:'Rating'}, inplace=True)

In [32]:
allUsersRatings_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36000 entries, 0 to 299
Data columns (total 1 columns):
Rating    36000 non-null float64
dtypes: float64(1)
memory usage: 562.5+ KB


In [33]:
allUsersRatings_df.head(301)

Unnamed: 0,Rating
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0
5,3.0
6,1.0
7,1.0
8,1.0
9,1.0


## Generate Final Dataset

In [34]:
allUsers_And_Ads_df.head()

Unnamed: 0,UserId,AdId,Age,Cap/Zip-Code,Countries visited,Fave Sports,Gender,Home country,Home town,Income,...,fave7,fave8,fave9,unfave1,unfave2,unfave3,unfave4,unfave5,unfave6,AdFilePath
0,U0001,A01_01,62,15613,United States of America,I do not like Sports,F,United States of America,Apollo,1,...,,,,news headlines,homelessness,violence,war,human rights,,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/1.png
1,U0001,A01_02,62,15613,United States of America,I do not like Sports,F,United States of America,Apollo,1,...,,,,news headlines,homelessness,violence,war,human rights,,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/2.png
2,U0001,A01_03,62,15613,United States of America,I do not like Sports,F,United States of America,Apollo,1,...,,,,news headlines,homelessness,violence,war,human rights,,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/3.png
3,U0001,A01_04,62,15613,United States of America,I do not like Sports,F,United States of America,Apollo,1,...,,,,news headlines,homelessness,violence,war,human rights,,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/4.png
4,U0001,A01_05,62,15613,United States of America,I do not like Sports,F,United States of America,Apollo,1,...,,,,news headlines,homelessness,violence,war,human rights,,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/5.png


In [35]:
allUsersRatings_df.head()

Unnamed: 0,Rating
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0


In [36]:
# The ratings frame has no UserId/AdId columns, so ratings can only be attached to the
# user x ad rows by position. Both sides are renumbered 0..n-1 on copies, leaving the
# frames displayed in earlier cells untouched.
# NOTE(review): this assumes ratings are stored user by user and, within a user, in the
# same ad order as the cross join - confirm against the dataset documentation.
usersAdsRows_df = allUsers_And_Ads_df.reset_index(drop=True)
ratingsRows_df = allUsersRatings_df.reset_index(drop=True)

# A length mismatch would otherwise be padded with NaN silently by the column-wise concat.
assert len(usersAdsRows_df) == len(ratingsRows_df), (
    "row count mismatch: %d user-ad rows vs %d rating rows"
    % (len(usersAdsRows_df), len(ratingsRows_df))
)

allUsers_Ads_Ratings_df = pd.concat([usersAdsRows_df, ratingsRows_df], axis=1)

In [37]:
allUsers_Ads_Ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36000 entries, 0 to 35999
Data columns (total 39 columns):
UserId                        36000 non-null object
AdId                          36000 non-null object
Age                           36000 non-null int64
Cap/Zip-Code                  36000 non-null object
Countries visited             36000 non-null object
Fave Sports                   36000 non-null object
Gender                        36000 non-null object
Home country                  36000 non-null object
Home town                     36000 non-null object
Income                        36000 non-null int64
Last Name                     36000 non-null object
Most listened musics          36000 non-null object
Most read books               36000 non-null object
Most visited websites         36000 non-null object
Most watched movies           36000 non-null object
Most watched tv programmes    36000 non-null object
Name                          36000 non-null object
Paypal   

In [38]:
allUsers_Ads_Ratings_df.head(5)

Unnamed: 0,UserId,AdId,Age,Cap/Zip-Code,Countries visited,Fave Sports,Gender,Home country,Home town,Income,...,fave8,fave9,unfave1,unfave2,unfave3,unfave4,unfave5,unfave6,AdFilePath,Rating
0,U0001,A01_01,62,15613,United States of America,I do not like Sports,F,United States of America,Apollo,1,...,,,news headlines,homelessness,violence,war,human rights,,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/1.png,1.0
1,U0001,A01_02,62,15613,United States of America,I do not like Sports,F,United States of America,Apollo,1,...,,,news headlines,homelessness,violence,war,human rights,,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/2.png,1.0
2,U0001,A01_03,62,15613,United States of America,I do not like Sports,F,United States of America,Apollo,1,...,,,news headlines,homelessness,violence,war,human rights,,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/3.png,1.0
3,U0001,A01_04,62,15613,United States of America,I do not like Sports,F,United States of America,Apollo,1,...,,,news headlines,homelessness,violence,war,human rights,,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/4.png,1.0
4,U0001,A01_05,62,15613,United States of America,I do not like Sports,F,United States of America,Apollo,1,...,,,news headlines,homelessness,violence,war,human rights,,ads16-dataset/ADS16_Benchmark_part1/ADS16_Benchmark_part1/Ads/Ads/1/5.png,1.0


In [39]:
allUsers_Ads_Ratings_df.to_csv("AllUsers_Ads_Ratings_df.csv", index=False)

# Scratchpad