In [1]:
# !pip install PyMuPDF
import fitz 
import utilities

In [2]:
# Directory that is passed to utilities.get_files_in_a_directory to find the input files.
# NOTE(review): this is an absolute, machine-specific path; point it at your own
# copy of the data (or make it configurable) before re-running the notebook.
DATA_DIR = '/media/umar/Data/work/voice-assistant-central/data/inferred-interests-week-later/'

In [3]:
def get_text_from_pdf(path, skip_pages=3):
    """Extract the text of a PDF and return it as a list of lines.

    Args:
        path: Location of the PDF; handed unchanged to ``fitz.open``.
        skip_pages: Number of leading pages to ignore. Defaults to 3, the
            value that used to be hard-coded (presumably cover/summary
            pages of the report -- TODO confirm).

    Returns:
        list[str]: The text of the remaining pages, concatenated in page
        order and split on newline characters. When no page remains the
        result is a list holding one empty string.
    """
    with fitz.open(path) as doc:
        page_texts = [
            page.get_text()
            for idx, page in enumerate(doc)
            if idx >= skip_pages
        ]
    # Join once rather than growing a string page by page.
    return "".join(page_texts).split('\n')


def interests_per_persona(interest_files):
    """Build a mapping from persona name to its set of interest labels.

    Each file is read with ``get_text_from_pdf``. Boilerplate lines
    ('Segment Detail View', lines starting with 'Generated: ', blank lines)
    are ignored. Every other line is treated as a '>'-separated category
    path and reduced to its last two components, e.g.
    'Root > Brand > Apple' becomes 'Brand > Apple'. Lines that contain no
    '>' carry no category path and are skipped.

    Args:
        interest_files: Iterable of PDF paths, one per persona.

    Returns:
        dict[str, set[str]]: Keyed by the file's base name without its
        '.pdf' extension. Files sharing a base name overwrite each other.
    """
    import os  # local import so this definition does not depend on another cell

    boilerplate = {'Segment Detail View', ''}
    interests = {}

    for interest_file in interest_files:
        processed_content = set()

        for line in get_text_from_pdf(interest_file):
            stripped = line.strip()
            if stripped in boilerplate or stripped.startswith('Generated: '):
                continue

            # Explicit check instead of swallowing the IndexError that a
            # separator-less line used to raise.
            parts = stripped.split('>')
            if len(parts) < 2:
                continue

            processed_content.add(parts[-2].strip() + ' > ' + parts[-1].strip())

        # Strip only a trailing '.pdf' so a '.pdf' inside the name survives.
        file_name = os.path.basename(interest_file)
        stem, ext = os.path.splitext(file_name)
        persona = stem if ext.lower() == '.pdf' else file_name
        interests[persona] = processed_content

    return interests


def get_common_inerests(interests):
    """Return the interests that every persona has.

    The function name keeps its original (misspelled) form because the
    notebook calls it under that name.

    Args:
        interests: Mapping of persona name to an iterable of interests.

    Returns:
        set: A new set with the interests present for all personas; empty
        when the mapping is empty or the personas share nothing.
    """
    persona_sets = [set(items) for items in interests.values()]
    if not persona_sets:
        return set()

    # Intersect all personas in one go. An "is the running result empty?"
    # test cannot double as a first-iteration flag: an intersection that
    # legitimately becomes empty would be re-seeded from the next persona.
    return set.intersection(*persona_sets)


def get_unique_interests(interests, common_interests):
    """Drop the shared interests from every persona.

    Args:
        interests: Mapping of persona name to an iterable of interests.
        common_interests: Container of interests to leave out.

    Returns:
        dict: Persona name mapped to the set of its interests that are not
        in ``common_interests``. Every persona gets an entry, possibly empty.
    """
    return {
        name: {item for item in items if item not in common_interests}
        for name, items in interests.items()
    }


def get_exclusive_interests(interests):
    """Find, per persona, the interests that no other persona has.

    Args:
        interests: Mapping of persona name to an iterable of interests.

    Returns:
        dict: Persona name mapped to a new set holding the interests that
        occur for that persona only. The input mapping is left untouched.
    """
    result = {}
    for name, own_items in interests.items():
        # Interests of every persona except the current one.
        others = (
            items
            for other_name, items in interests.items()
            if other_name != name
        )
        result[name] = set(own_items).difference(*others)
    return result

In [22]:
# Scratch check: an in-place set difference removes the listed members.
j = set(range(1, 10))
j.difference_update({5, 6, 7, 8})
j

{1, 2, 3, 4, 9}

In [4]:
# Gather the input files found under DATA_DIR.
# NOTE(review): the helper lives in the local `utilities` module; presumably it
# returns full file paths, since they are later opened as PDFs -- confirm.
interest_files = utilities.get_files_in_a_directory(DATA_DIR)

In [5]:
# Parse every file into {persona name: set of interest labels}.
interests = interests_per_persona(interest_files)

In [6]:
# Intersect the interest sets across the personas.
common_interests = get_common_inerests(interests)

In [7]:
# Show the interests computed as common to the personas.
common_interests

{'AdAdvisor > Targus GET - 2831',
 'Audiences by Oracle > Business (B2B)',
 'Audiences by Oracle > Media and Entertainment',
 'Audiences by Oracle > Retail',
 'Audiences by Oracle > Technology and Computing',
 'Audiences by Ziff Davis > PCMag',
 'Audiences by Ziff Davis > Tech - B2B',
 'BlueKai Internal - Private > Data Center',
 'Brand > Apple',
 'Brand > CyberLink',
 'Brand > DELL',
 'Brand > GOOGLE',
 'Brand > MOTOROLA',
 'Brand > Nokia',
 'Brand > QUALCOMM',
 'Brand > SONY',
 'Brand > SYMANTEC',
 'Branded Data > AcquireWeb - Claritas',
 'Branded Data > Audiences by Ziff Davis',
 'Browser > Firefox',
 'Computers > Networking',
 'Computers > Networking & Communication',
 'Computers > Networking and Wireless',
 'Data Center > Phoenix',
 'Demographics > Age',
 'Demographics > Gender',
 'Device Type > Computers',
 'Hardware > Computers',
 'IP based > United States',
 'In-Market > Brand',
 'In-Market > Device Type',
 'In-Market > Retail',
 'Interest (Affinity) > Computers',
 'Interest > 

In [8]:
# unique_interests: each persona's interests minus the common ones.
# exclusive_interests: each persona's interests minus those of every other persona.
unique_interests = get_unique_interests(interests, common_interests)
exclusive_interests = get_exclusive_interests(interests)

In [9]:
# Interests found for the 'Religion-Spirituality' persona and for no other persona.
exclusive_interests['Religion-Spirituality']

{'A/B Test Groups > Group 11',
 'Age > 40 - 44',
 'Arts & Entertainment > Movies',
 'Arts & Entertainment > TV',
 'Arts, Entertainment & Media > Broadcasting',
 'Audio & Video > MP3 Players',
 'Audio and Visual > Music Players',
 'B2B > Industries & Occupations',
 'Books & Magazines > Audio Books',
 'Brand > KitchenAid',
 'Brand > NBC',
 'Brand > Netflix',
 'Brand > PokÃ©mon',
 'Brand > Samsung',
 'Brand > Walmart',
 'Cell Phones & Mobile Carriers > Samsung',
 'Cell Phones and Plans > Accessories',
 'Cell Phones and Plans > Devices',
 'Cell Phones and Plans > Smartphones',
 'Computers > Handheld',
 'Custom Categories > Graduation Gift Buyers',
 'Devices > Apple (iPhone)',
 'Devices > Apple iPhone',
 'Devices > Samsung',
 'Electronics & Gadgets > Music Players',
 'Electronics > Audio & Video',
 'Electronics > Handhelds & Tablets',
 'Estimated Discretionary Income Percent > 40 - 49%',
 'Exercise & Fitness > Cardio Equipment',
 'Exercise & Fitness > Jump Rope',
 'Genres > Action',
 'In-Ma

In [10]:
# Interests found for the 'Wine-Beverages' persona and for no other persona.
exclusive_interests['Wine-Beverages']

{'Ability to Pay > Moderate Ability to Pay',
 'Active - 24 Months > Home Improvement',
 'Age > 18 - 24',
 'Age > 55 - 59',
 'Age Range > 65-74',
 'Age of Children in Household > 13-18',
 'All Categories Frequency > Number of Credit Purchases Last 24 Months: 11+',
 'All Categories Frequency > Number of Credit Purchases Total: 16+',
 'All Categories Frequency > Number of Purchases Last 13-18 Months: 5+',
 'All Categories Frequency > Number of Purchases Last 4-6 Months: 5+',
 'All Categories Frequency > Number of Purchases Total: 26+',
 'All Categories Recency > Total Dollars Last 10-12 Months: Dollars Spent: $31 - $154',
 'Alliant > CPG',
 'Alliant > Education',
 'Alliant > Family Interests',
 'Apparel > Ashley Stewart Buyer Propensity',
 'Apparel > Belk Buyer Propensity',
 'Apparel > Brooks Brothers Buyer Propensity',
 'Apparel > Crocs Buyer Propensity',
 'Apparel > J.JILL Buyer Propensity',
 'Apparel > Jordan Buyer Propensity',
 'Apparel > Lord & Taylor Buyer Propensity',
 'Apparel > N

In [27]:
# Interests found for the 'SmartHome' persona and for no other persona.
exclusive_interests['SmartHome']

{'Age > 65 - 69',
 'Estimated Discretionary Income Percent > 30 - 39%',
 'Estimated Household Investable Assets > $1MM +',
 'Family Position > Female HOH',
 'Genre > Thriller Movie Streamers',
 'Net Worth > $1M+',
 'United States > Gifts/holiday items',
 'United States > Not For Profit',
 'United States > Young Family'}

In [12]:
# Interests found for the 'Health-Fitness' persona and for no other persona.
exclusive_interests['Health-Fitness']

{'35-44 > 35-39',
 'AFS Products > Millenials - High',
 'Ability to Pay > Millennials - High Ability to Pay',
 'Active & Influential Social Media Users > Facebook Influencers',
 'AdAdvisor by Neustar > Style & Fashion',
 'Adstra (formerly ALC) > Consumer In-Market',
 'Advanced Demographics > Spanish Speakers',
 'Age > 0-2 Years',
 'Age of Children in Household (Kids) > Parents of Toddlers (3-5)',
 'Age of Children in Household (Kids) > Parents of Toddlers (Ages 3-5)',
 'Age of Children in Household > 3-5',
 'AmeriLINK Online Behavioral Data > Cosmetology & Beauty Professionals',
 'AmeribaseDigital > OTT & CTV',
 'AmeribaseDigital > Technology & Software Buyers By Brand',
 'Apparel > Jewelry',
 'Auto Insurance > Geico',
 'Auto Parts/Auto Repair > Auto Zone',
 'Auto Parts/Auto Repair > Jiffy Lube',
 'Auto, Cars and Trucks > Parts, Service and Tires',
 'Automotive > Auto Insurance',
 'Automotive > Auto Parts/Auto Repair',
 'Autos > Aftermarket',
 'Back To School > Consumer with Pre-School

In [13]:
# Interests found for the 'Navigation-TripPlanners' persona and for no other persona.
exclusive_interests['Navigation-TripPlanners']

{'25-29 > Age 26',
 '25-34 > 25-29',
 '50-54 > Age 53',
 'AFS Buckets > 5',
 'Activities & Interests > Guitar',
 'Activities & Interests > National News',
 'Activities and Entertainment > PGA Tour Enthusiasts',
 'Activity & Event Tickets > Live Theater',
 'Age > 18-24',
 'Age > 6-10 Years',
 'Age of Children in Household (Kids) > Parents of Infants (0-2)',
 'Age of Children in Household (Kids) > Parents of Infants (Ages 0-2)',
 'Annual Discretionary Spending on Donation > $2,500 to $3,499',
 'Apparel > $2,500 to $2,999',
 'Apparel > ZARA Buyer Propensity',
 'Auto > Auto Loan 36 Months+',
 'Automotive > American Auto Association (AAA) Member',
 'Automotive Owners > Own a Dodge',
 'Back To School > Affluent Consumer',
 'Back To School > Consumer with Middle School Aged Kids',
 'Banking Channel Preference > In Branch Banking',
 'Big Spending Travel Intenders > Cruise Travel',
 'Brands > Hallmark/Hallmark Crown',
 'Brands > Kikkoman',
 "Brands > Macy's",
 'Brands > Quaker',
 'Brands > Safe

In [14]:
# Interests found for the 'Pets-Animals' persona and for no other persona.
exclusive_interests['Pets-Animals']

{'AFS Buckets > 8',
 'AcquireWeb - Claritas > Sports and Entertainment',
 'Active/Interest In > Outdoor',
 'Affinity Answers > Brands',
 'Affluence Segments > Climbers',
 'Age 18-29 > Age 25-29 years',
 'Age > 19-24',
 'Age > 3-5 Years',
 'Agents and Brokers > Insurance',
 'Agriculture & Natural Resources > Mining, Oil & Gas',
 'All Categories Frequency > Number of Credit Purchases Total: 3-15',
 'All Categories Frequency > Number of Purchases Last 3 Months: 5+',
 'All Categories Recency > Ordered in Last 0-3 Months',
 'All Categories Recency > Total Dollars 0-3 Months: $352+',
 'All Categories Recency > Total Dollars 0-3 Months: Dollars Spent: $51-$351',
 'All Categories Spend > Total Dollars: $1,435+',
 'All Categories Spend > Total Dollars: $1-$179',
 'Alliant > Emerging Consumers',
 'Annual Credit Card Retail Spending > $1,500 - $2,000',
 'Annual Discretionary Donations > $2,500 - $3,499',
 'Apparel > JustFab Buyer Propensity',
 'Apparel > Shoe Carnival Buyer Propensity',
 'Apparel

In [15]:
# Interests found for the 'Fashion-Style' persona and for no other persona.
exclusive_interests['Fashion-Style']

{'25-34 > 30-34',
 '45-54 > 45-49',
 '50-54 > Age 50',
 'AFS Buckets > 2',
 'Active - 24 Months > Home Decor & Crafting',
 'AdAdvisor - Private > Neustar AdAdvisor Custom',
 'Age > 35 - 39',
 'Age Narrow > 30-34',
 'Age Narrow > 45-49',
 'Age Narrow > Ages 30-34',
 'Age Narrow > Ages 45-49',
 'Age Range > 35-44',
 'Ailments - Treatments > Respiratory',
 'All Categories Recency > Ordered in Last 7-12 Months',
 'All Categories Recency > Total Dollars Last 7-9 Months: $1-$36',
 'AmeribaseDigital > Device Tracker - Mobile Location Data',
 'Annual Credit Card Spending on Travel, Entertainment and Dining > $1,500 - $2,000',
 'Apparel > Casual Male XL Buyer Propensity',
 'Arts & Entertainment > Sweepstakes',
 'Auto Buyers > BMW',
 'Automotive > Napa Auto Parts Buyer Propensity',
 'Automotive Owners > Hybrid Cars Propensity',
 'Baby & Toddler > Baby Wipes',
 'Back to School > Moms with College Students',
 'Big Spending Travel Intenders > International Travel',
 'Black Friday > Black Friday Sho

In [16]:
# Interests found for the 'ConnectedCar' persona and for no other persona.
exclusive_interests['ConnectedCar']

{'40-44 > Low Segment',
 'A/B Test Groups > Group 08',
 'ABC > $100,000 Pyramid (Game Show)',
 'Affinity Answers > Movies',
 'Animation > Peter Rabbit (Movie Franchise)',
 'Animation > SpongeBob SquarePants Movie (Franchise)',
 'Animation > Trolls (Franchise)',
 'Baseball > St. Louis Cardinals',
 'Baseball > Washington Nationals',
 'Basketball > Sacramento Kings',
 'Basketball > Toronto Raptors',
 'CBS > S.W.A.T. (Drama)',
 'Communication Channel Preference > Text',
 'Estimated Household Investable Assets > $100,000 - $249,999',
 'Genre > Comedy Movie Streamers',
 'Genre > Crime Movie Streamers',
 'Genre > Documentary Movie Streamers',
 'Health & Wellness Shopping > Front of Store Same Location',
 'Hobbies & Interests > Small Business Owners',
 'Language > Italian',
 'Movies > Animation',
 'Net Worth > $500K - $749.9K',
 'OTT Network Viewers > Adult Swim Streamers',
 'OTT Network Viewers > TBS Streamers',
 'Shows > ABC',
 'Shows > CBS',
 'Shows > VH1',
 'Sports > Basketball',
 'Studios

In [17]:
# Interests found for the 'Dating' persona and for no other persona.
exclusive_interests['Dating']

{'AcquireWeb - Claritas > Auto Ownership',
 'AcquireWeb - Claritas > Education',
 'AcquireWeb - Claritas > Employment',
 'AcquireWeb - Claritas > Healthcare',
 'AcquireWeb - Claritas > Home Improvement',
 'AcquireWeb - Claritas > Insurance',
 'AcquireWeb - Claritas > Lifestage Changes',
 'AcquireWeb - Claritas > Online Buyer',
 'AcquireWeb - Claritas > Online Shopper',
 'AcquireWeb - Claritas > Spending Model',
 'AcquireWeb - Claritas > Travel',
 'Active/Interest In > Education Programs',
 'Active/Interest In > Fitness',
 'Active/Interest In > Gardening',
 'Active/Interest In > Health',
 'Active/Interest In > Home Improvement',
 'Age of Children in Household > 0-3',
 'Age of Children in Household > 13-15',
 'Age of Children in Household > 7-9',
 'Auto Ownership > Fuel Type',
 'Auto Ownership > Vehicle Make',
 'Auto Ownership > Vehicle Style',
 'Auto Service and Repair > Changes Own Electrical DIY (IMR)',
 'Auto Service and Repair > Changes Own Fuel Caps DIY (IMR)',
 'Auto Service and R

In [18]:
# Interests found for the 'No-Skill' persona and for no other persona.
exclusive_interests['No-Skill']

{'35-39 > High Segment',
 'Adherence > Adherence Supplement',
 'Adherence > Supplement Usage',
 'Age > 60 - 64',
 'Age > 70 - 74',
 'Age > Ages 35-39',
 'Ailments - Treatments > Dental',
 'Ailments - Treatments > Weight',
 'Ailments > Allergies Most Likely',
 'Ailments > Dental Issues',
 'Allergies > Allergy Medications',
 'Allergies > Allergy Sufferers',
 'Allergies > Sinusitis',
 'Amazon Prime Originals > The Boys',
 'AmeriLINK > AmeriLINK Life Event Triggers',
 'AmeriLINK CPG Categories > Baby Products',
 'AmeriLINK CPG Categories > Baking Supplies',
 'AmeriLINK CPG Categories > Gravies and Sauces',
 'AmeriLINK CPG Categories > Jellies, Jams and Nut Butters',
 'AmeriLINK CPG Categories > Pet Products',
 'AmeriLINK CPG Categories > Pickles and Olives',
 'AmeriLINK CPG Categories > Seasonings and Spices',
 'AmeriLINK Consumer Buying Behavior > Retail & Product Brands',
 'AmeriLINK Life Event Triggers > New Movers',
 'Baby Products > Baby Foods, Beverages and Snacks Purchasers',
 'Bake

In [19]:
# Interests found for the 'Amazon-Only' persona and for no other persona.
exclusive_interests['Amazon-Only']

{'AFS Buckets > 3',
 'Account Holders > College Savings',
 'Acura > Acura RDX',
 'Acura > Acura TL',
 'Acura > Acura TLX',
 'AdAdvisor - Private > AdAdvisor Consumer Audiences',
 'AdAdvisor Consumer Audiences > Web Browsing',
 'AdAdvisor by Neustar > Life Events',
 'Adstra (formerly ALC) > Wealth',
 'Affluence Segments > Giving Back',
 'Age of Children in Household > 16-17',
 'All Categories Frequency > Number of Credit Purchases Total: 1-2',
 "America's Wealthiest > (A) - Top 10%",
 'Apparel > Ann Taylor Buyer Propensity',
 'Apparel > Asics Buyer Propensity',
 'Apparel > Dolce Vita Buyer Propensity',
 'Apparel > Foot Locker Buyer Propensity',
 'Apparel > Guess Buyer Propensity',
 'Apparel > Lilly Pulitzer Buyer Propensity',
 'Apparel > Rent the Runway Buyer Propensity',
 'Apparel > Stride Rite Buyer Propensity',
 'Apparel > Timberland Buyer Propensity',
 'Apparel > Tommy Bahama Buyer Propensity',
 'Apparel > Vera Bradley Buyer Propensity',
 'Apple > iPhone',
 'Art & Entertainment > Bo