In [17]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
import re
from io import StringIO


## Setting Functions Up

In [42]:
def data_retrieval(file_name):

    all_data = []

    with open(file_name, 'r', encoding='latin1') as f:
        raw_lines = f.read().splitlines()

    # Remove leading spaces
    raw_lines = [line.lstrip() for line in raw_lines]

    # Identify where each question starts
    question_indices = [i for i, line in enumerate(raw_lines) if "QUESTION" in line]

    # Store blocks - each block = one question (so three contingency tables)
    blocks = []
    for idx, start in enumerate(question_indices):
        end = question_indices[idx + 1] if idx + 1 < len(question_indices) else len(raw_lines)
        blocks.append(raw_lines[start:end])

    def process_block(block):
        question_text = block[0].replace("QUESTION: ", "").strip()

        # Find where each table starts (gender, age, income)
        table_starts = [i for i, line in enumerate(block) if line.startswith("answer,")]

        if len(table_starts) < 3:
            return None  # skip malformed blocks

        def clean_lines(lines):
            return [line.encode('ascii', errors='ignore').decode('ascii').strip()
                    for line in lines if re.search(r'\w', line) and ',' in line]

        # Split into 3 tables
        gender_lines = clean_lines(block[table_starts[0]: table_starts[1]])
        age_lines = clean_lines(block[table_starts[1]: table_starts[2]])
        income_lines = clean_lines(block[table_starts[2]:])


        # Helper function to turn table into a long DataFrame
        def parse_table(lines,demographic_name):
            df = pd.read_csv(StringIO("\n".join(lines)))
            df.columns = (
                df.columns
                .str.strip()
                .str.replace(r'\ufeff', '', regex=True)
                .str.replace(r'[^\x20-\x7E]', '', regex=True)
                .str.lower()
            )

            df = df.loc[:, ~df.columns.duplicated()]
            df.columns = df.columns.str.strip().str.replace(r'\s+', ' ', regex=True) # replace any double/triple/etc spaces with single space.


            # Collapsing Age and Income Categories

            if demographic_name == "Income":

              df = df.rename(columns={'up to 22 800': 'Low Income'})
              df = df.rename(columns={'22 800 up to 43 200': 'Medium Income'})
              df["High Income"] = df["43 200 up to 98 400"] + df["98 400 and more"]
              df = df.drop(columns=['prefer not to say'])
              df = df.drop(columns=['43 200 up to 98 400'])
              df = df.drop(columns=['98 400 and more'])


            elif demographic_name == "Age":

              df = df.rename(columns={'generation x (baby bust) (1965-1979)': 'Middle Aged'})
              df = df.rename(columns={'traditionals & baby boomer (1922-1964)': 'Older'})
              df["Younger"] = df['millennials / generation y (1980-1994)'] + df['igen / gen z (1995-2012)']
              df = df.drop(columns=['millennials / generation y (1980-1994)'])
              df = df.drop(columns=['igen / gen z (1995-2012)'])


            df_long = df.melt(id_vars=['answer', 'country'], var_name='Group', value_name='Count')
            df_long['Demographic'] = demographic_name
            df_long['Question'] = question_text


            return df_long



        gender_df = parse_table(gender_lines, "Gender")
        age_df = parse_table(age_lines, "Age")
        income_df = parse_table(income_lines, "Income")


        final_df = pd.concat([gender_df, age_df, income_df], ignore_index=True)
        final_df = final_df.applymap(lambda x: x.encode('ascii', errors='ignore').decode('ascii') if isinstance(x, str) else x)
        final_df = final_df.applymap(lambda x: x.replace('"', '') if isinstance(x, str) else x)

        return final_df

    fixed_blocks = []

    for i, b in enumerate(blocks):

      # introduce exception to spot if a model didn't fit and the reason.
        try:
            processed = process_block(b)
            if processed is not None:
                fixed_blocks.append(processed)
        except Exception as e:
            print(f"\n\n❌ Error in block {i} | Error: {e}")

    all_data = pd.concat(fixed_blocks, ignore_index=True)

    return all_data


In [43]:
def create_models(data):

    models = {}
    interactions_final = {}

    for (question, demographic), subset in data.groupby(['Question', 'Demographic']):

        try:
            subset = subset.dropna(subset=['Count', 'answer', 'Group', 'country'])

            # Set the first observed value as the reference for each categorical variable
            for col in ['answer', 'Group', 'country']:
                if col in subset.columns:
                    categories = list(subset[col].dropna().unique())
                    subset[col] = pd.Categorical(subset[col], categories=categories, ordered=False)


            # Fit Negative Binomial GLM model

            model = smf.glm(
                formula="Count ~ answer * Group + country",
                data=subset,
                family=sm.families.NegativeBinomial()
            ).fit()

            models[(question, demographic)] = model

            # Extract only the interaction terms and save
            interactions = model.params[[name for name in model.params.index if ':' in name]]
            interactions_final[(question, demographic)] = interactions

            # Add this line to confirm that no exception has been raised.

            print(f"\n====== MODEL FITTED: {question} | {demographic} rows ======")

        # In case there is an error, this section of code allows us to see which part of the data exactly caused it.
        # This was massively helpful in debugging.
        except Exception as e:
            print("\n\n❌ FAILED MODEL for:")
            print(f"QUESTION: {question}")
            print(f"DEMOGRAPHIC: {demographic}")
            print(f"Error: {e}")
            print("-" * 80)

    return models, interactions_final


In [44]:
def interactions_table_to_df(interactions):

  # Build a DataFrame from all interaction terms
  interaction_table = pd.DataFrame()

  for (question, demographic), interactions in interactions.items():
      if not interactions.empty:
          temp = interactions.reset_index()
          temp.columns = ['Interaction', 'Estimate']
          temp['Question'] = question
          temp['Demographic'] = demographic
          interaction_table = pd.concat([interaction_table, temp], ignore_index=True)

  # Reorder columns for clarity
  interaction_table = interaction_table[['Question', 'Demographic', 'Interaction', 'Estimate']]

  return interaction_table


In [45]:
# This function takes in data (for a group) and performs a series of LRT tests, as shown in our report.

def lr_test_table(data):

    significance_table = []

    for (question, demographic), subset in data.groupby(["Question", "Demographic"]):


        print((question, demographic))


        try:

            # Fit Main Effects Model (no interaction)
            model_main = smf.glm(
                formula="Count ~ answer + Group + country",
                data=subset,
                family=sm.families.NegativeBinomial()
            ).fit()

            # Fit Full Interaction Model (main effects + answer * Group)
            interaction_model = smf.glm(
                formula="Count ~ answer+ Group + country + answer * Group",
                data=subset,
                family=sm.families.NegativeBinomial()
            ).fit()


            # Perform Likelihood Ratio Test
            lr_stat = 2 * (interaction_model.llf - model_main.llf)
            df_diff = interaction_model.df_model - model_main.df_model
            p_value = stats.chi2.sf(lr_stat, df_diff)

            # Decide significance
            significance = "Yes" if p_value < 0.05 else "No"

            # Save to results
            significance_table.append({
                "Question": question,
                "Demographic": demographic,
                "Significant?": significance,
                "p-value": round(p_value, 4)
            })

        except Exception as e:
            print(f"❌ Error fitting models for: {question} | {demographic}")
            print(e)
            print("-" * 60)

    # Final formatted table

    significance_table = pd.DataFrame(significance_table)


    # Pivot into a table
    pivot_table = significance_table.pivot(index='Question', columns='Demographic', values='p-value')

    # Colour code significance
    def highlight_pval(val):
        color = 'green' if val <= 0.05 else 'red'
        return f'background-color: {color}'

    styled_table = pivot_table.style.applymap(highlight_pval)



    return styled_table



In [46]:
# load all data

spending_data = data_retrieval("spending_all.csv")
current_ownership_jewelry_watches_data = data_retrieval("current_ownership_jewelry_watches.csv")
decision_making_data = data_retrieval("decision_making_all.csv")
knowledge_data = data_retrieval("knowledge_all.csv")
purchase_frequency_data = data_retrieval("purchase_frequency_comsetics_fashion.csv")
total_ownership_fashion_watches_jewelry_data = data_retrieval("total_ownership_fashion_watches_jewelry.csv")


  final_df = final_df.applymap(lambda x: x.encode('ascii', errors='ignore').decode('ascii') if isinstance(x, str) else x)
  final_df = final_df.applymap(lambda x: x.replace('"', '') if isinstance(x, str) else x)
  final_df = final_df.applymap(lambda x: x.encode('ascii', errors='ignore').decode('ascii') if isinstance(x, str) else x)
  final_df = final_df.applymap(lambda x: x.replace('"', '') if isinstance(x, str) else x)
  final_df = final_df.applymap(lambda x: x.encode('ascii', errors='ignore').decode('ascii') if isinstance(x, str) else x)
  final_df = final_df.applymap(lambda x: x.replace('"', '') if isinstance(x, str) else x)
  final_df = final_df.applymap(lambda x: x.encode('ascii', errors='ignore').decode('ascii') if isinstance(x, str) else x)
  final_df = final_df.applymap(lambda x: x.replace('"', '') if isinstance(x, str) else x)
  final_df = final_df.applymap(lambda x: x.encode('ascii', errors='ignore').decode('ascii') if isinstance(x, str) else x)
  final_df = final_df.applymap

## Purchase Frequency Analysis

In [47]:
purchase_frequency_models, purchase_frequency_interactions = create_models(purchase_frequency_data)














In [48]:
# convert to dataframe and display

df = interactions_table_to_df(purchase_frequency_interactions)

df

Unnamed: 0,Question,Demographic,Interaction,Estimate
0,How often do you buy cosmetic products from pr...,Age,answer[T.At least once a month]:Group[T.Middle...,-0.164613
1,How often do you buy cosmetic products from pr...,Age,answer[T.At least once every 3 months]:Group[T...,-0.377682
2,How often do you buy cosmetic products from pr...,Age,answer[T.At least once every 6 months]:Group[T...,-0.602082
3,How often do you buy cosmetic products from pr...,Age,answer[T.At least once a year]:Group[T.Middle ...,-0.080059
4,How often do you buy cosmetic products from pr...,Age,answer[T.Less often]:Group[T.Middle Aged],-0.602292
...,...,...,...,...
60,How often have you bought fashion and accessor...,Income,answer[T.Don't know]:Group[T.Medium Income],-0.152850
61,How often have you bought fashion and accessor...,Income,answer[T.3 5 times]:Group[T.High Income],0.381364
62,How often have you bought fashion and accessor...,Income,answer[T.6 9 times]:Group[T.High Income],0.595365
63,How often have you bought fashion and accessor...,Income,answer[T.10 times or more often]:Group[T.High ...,1.144022


In [49]:
# Display styled table with LRT results

lr_test_table(purchase_frequency_data)

('How often do you buy cosmetic products from premium/luxury brands for yourself or someone else?', 'Age')
('How often do you buy cosmetic products from premium/luxury brands for yourself or someone else?', 'Gender')
('How often do you buy cosmetic products from premium/luxury brands for yourself or someone else?', 'Income')
('How often have you bought fashion and accessories from a premium/luxury brand in a physical store in the past 2 years? If you are not sure,  please estimate.', 'Age')
('How often have you bought fashion and accessories from a premium/luxury brand in a physical store in the past 2 years? If you are not sure,  please estimate.', 'Gender')
('How often have you bought fashion and accessories from a premium/luxury brand in a physical store in the past 2 years? If you are not sure,  please estimate.', 'Income')




('How often have you bought fashion and accessories from a premium/luxury brand on the internet in the past 2 years? If you are not sure,  please estimate.', 'Age')
('How often have you bought fashion and accessories from a premium/luxury brand on the internet in the past 2 years? If you are not sure,  please estimate.', 'Gender')
('How often have you bought fashion and accessories from a premium/luxury brand on the internet in the past 2 years? If you are not sure,  please estimate.', 'Income')


  styled_table = pivot_table.style.applymap(highlight_pval)


Demographic,Age,Gender,Income
Question,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
How often do you buy cosmetic products from premium/luxury brands for yourself or someone else?,0.9947,0.9984,0.9946
"How often have you bought fashion and accessories from a premium/luxury brand in a physical store in the past 2 years? If you are not sure, please estimate.",0.8638,0.9744,0.8533
"How often have you bought fashion and accessories from a premium/luxury brand on the internet in the past 2 years? If you are not sure, please estimate.",0.8427,0.9927,0.915


## Decision Making

In [50]:
decision_making_models, decision_making_interactions = create_models(decision_making_data)



















In [52]:
# convert to dataframe and display

df = interactions_table_to_df(decision_making_interactions)

df

Unnamed: 0,Question,Demographic,Interaction,Estimate
0,Before buying your last cosmetic product from ...,Age,answer[T.A couple of days]:Group[T.Middle Aged],0.305604
1,Before buying your last cosmetic product from ...,Age,answer[T.Up to a week]:Group[T.Middle Aged],-0.078560
2,Before buying your last cosmetic product from ...,Age,answer[T.Up to a month]:Group[T.Middle Aged],-0.300888
3,Before buying your last cosmetic product from ...,Age,answer[T.Up to 3 months]:Group[T.Middle Aged],0.131441
4,Before buying your last cosmetic product from ...,Age,answer[T.Longer than 3 months]:Group[T.Middle ...,0.948624
...,...,...,...,...
125,How long did you usually think about/plan befo...,Income,answer[T.Up to three months]:Group[T.High Income],-0.035909
126,How long did you usually think about/plan befo...,Income,answer[T.Up to six months]:Group[T.High Income],0.095795
127,How long did you usually think about/plan befo...,Income,answer[T.Over a year]:Group[T.High Income],-0.448508
128,How long did you usually think about/plan befo...,Income,answer[T.Differs every time]:Group[T.High Income],-0.360523


In [53]:
# Display styled table with LRT results

lr_test_table(decision_making_data)

('Before buying your last cosmetic product from a premium/luxury brand,  how long did you think about it?', 'Age')
('Before buying your last cosmetic product from a premium/luxury brand,  how long did you think about it?', 'Gender')
('Before buying your last cosmetic product from a premium/luxury brand,  how long did you think about it?', 'Income')
('Before buying your last piece of jewelry from a premium/luxury brand,  how long did you think about it?', 'Age')
('Before buying your last piece of jewelry from a premium/luxury brand,  how long did you think about it?', 'Gender')
('Before buying your last piece of jewelry from a premium/luxury brand,  how long did you think about it?', 'Income')
('Before buying your last watch from a premium/luxury brand,  how long did you think about it?', 'Age')
('Before buying your last watch from a premium/luxury brand,  how long did you think about it?', 'Gender')




('Before buying your last watch from a premium/luxury brand,  how long did you think about it?', 'Income')
('How long did you usually think about/plan before buying fashion and accessories from a premium/luxury brand?', 'Age')
('How long did you usually think about/plan before buying fashion and accessories from a premium/luxury brand?', 'Gender')
('How long did you usually think about/plan before buying fashion and accessories from a premium/luxury brand?', 'Income')


  styled_table = pivot_table.style.applymap(highlight_pval)


Demographic,Age,Gender,Income
Question,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Before buying your last cosmetic product from a premium/luxury brand, how long did you think about it?",0.9984,0.9957,0.9976
"Before buying your last piece of jewelry from a premium/luxury brand, how long did you think about it?",0.9653,0.9831,0.9926
"Before buying your last watch from a premium/luxury brand, how long did you think about it?",0.9529,0.9904,0.9817
How long did you usually think about/plan before buying fashion and accessories from a premium/luxury brand?,0.9357,0.9994,0.9982


## Spending

In [54]:
spending_models, spending_interactions = create_models(spending_data)



















In [55]:
df = interactions_table_to_df(spending_interactions)

df

Unnamed: 0,Question,Demographic,Interaction,Estimate
0,How much are you willing to pay for a cosmetic...,Age,answer[T.From $25 up to less than $50]:Group[T...,-0.267567
1,How much are you willing to pay for a cosmetic...,Age,answer[T.From $50 up to less than $100]:Group[...,-0.066184
2,How much are you willing to pay for a cosmetic...,Age,answer[T.From $100 up to less than $150]:Group...,-0.196362
3,How much are you willing to pay for a cosmetic...,Age,answer[T.From $150 up to less than $200]:Group...,-0.038431
4,How much are you willing to pay for a cosmetic...,Age,answer[T.$200 or more]:Group[T.Middle Aged],1.245169
...,...,...,...,...
105,How much are you willing to pay for fashion an...,Income,answer[T.$50 up to less than $100]:Group[T.Hig...,0.465953
106,How much are you willing to pay for fashion an...,Income,answer[T.$100 up to less than $200]:Group[T.Hi...,1.251695
107,How much are you willing to pay for fashion an...,Income,answer[T.$200 up to less than $500]:Group[T.Hi...,2.142607
108,How much are you willing to pay for fashion an...,Income,answer[T.More than $500]:Group[T.High Income],3.116830


In [56]:
# Display styled table with LRT results

lr_test_table(spending_data)

('How much are you willing to pay for a cosmetic product from a premium/luxury brand for yourself or someone else?', 'Age')
('How much are you willing to pay for a cosmetic product from a premium/luxury brand for yourself or someone else?', 'Gender')
('How much are you willing to pay for a cosmetic product from a premium/luxury brand for yourself or someone else?', 'Income')
('How much are you willing to pay for a piece of jewelry from a premium/luxury brand for yourself or someone else?', 'Age')
('How much are you willing to pay for a piece of jewelry from a premium/luxury brand for yourself or someone else?', 'Gender')
('How much are you willing to pay for a piece of jewelry from a premium/luxury brand for yourself or someone else?', 'Income')
('How much are you willing to pay for a watch from a premium/luxury brand for yourself or someone else?', 'Age')
('How much are you willing to pay for a watch from a premium/luxury brand for yourself or someone else?', 'Gender')




('How much are you willing to pay for a watch from a premium/luxury brand for yourself or someone else?', 'Income')
('How much are you willing to pay for fashion and accessories from a premium/luxury brand for yourself or someone else? / Willingness to pay: clothes', 'Age')
('How much are you willing to pay for fashion and accessories from a premium/luxury brand for yourself or someone else? / Willingness to pay: clothes', 'Gender')
('How much are you willing to pay for fashion and accessories from a premium/luxury brand for yourself or someone else? / Willingness to pay: clothes', 'Income')


  styled_table = pivot_table.style.applymap(highlight_pval)


Demographic,Age,Gender,Income
Question,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
How much are you willing to pay for a cosmetic product from a premium/luxury brand for yourself or someone else?,0.9653,0.9211,0.3374
How much are you willing to pay for a piece of jewelry from a premium/luxury brand for yourself or someone else?,0.8559,0.5837,0.1722
How much are you willing to pay for a watch from a premium/luxury brand for yourself or someone else?,0.9933,0.719,0.1265
How much are you willing to pay for fashion and accessories from a premium/luxury brand for yourself or someone else? / Willingness to pay: clothes,0.9999,0.9793,0.2323
