# Reading datasets

In [1]:
import pandas as pd
from scipy import stats
from scipy.stats import mannwhitneyu

In [2]:
# Column names used by the notebook.
BRAND_FEATURE = 'brand'
TYPE_OF_COMPANY_FEATURE = 'type_of_company'

# Company-group labels. combine_dataset() stores the label in the
# TYPE_OF_COMPANY_FEATURE column and also passes it to read_dataset() as the
# data sub-folder name.
INTERNATIONAL_TYPE = "international"
LOCAL_TYPE = "local"

# Company handles per group; each handle is the file name (without ".xlsx")
# that read_dataset() loads.
INTERNATIONAL_COMPANIES = [
    '@corendon.nl',
    '@vakantiediscounter',
    '@prijsvrijvakanties',
    '@sunweb',
    '@tuinederland',
    '@dreizenvakanties',
    '@elizawashere',
]
LOCAL_COMPANIES = [
    '@accessibletravelnetherlands',
    '@cherrytraveltours',
    '@hollandtravel',
    '@lvtravelagency',
    '@tomstraveltours',
    '@tenzing_travel',
    '@cruisetravelnl',
]

In [3]:
def read_dataset(brand_name: str, folder: str):
    """Load one company's Excel export.

    Args:
        brand_name: File name without the ``.xlsx`` extension.
        folder: Sub-folder of ``./data`` that contains the file.

    Returns:
        Whatever ``pd.read_excel`` returns for
        ``./data/<folder>/<brand_name>.xlsx``.
    """
    workbook_path = f'./data/{folder}/{brand_name}.xlsx'
    return pd.read_excel(workbook_path)

In [4]:
def combine_dataset(companies: list[str], type: str):
    """Load every company's dataset and stack them into one labelled frame.

    Args:
        companies: File names (without extension) to load via ``read_dataset``.
        type: Group label; used both as the data sub-folder passed to
            ``read_dataset`` and as the value written to the
            ``TYPE_OF_COMPANY_FEATURE`` column.

    Returns:
        The concatenation of all loaded frames with the group label column set.
        Row labels are kept as loaded (``pd.concat`` is called without
        ``ignore_index``), so they repeat across companies.
    """
    per_company_frames = [read_dataset(company, type) for company in companies]

    combined = pd.concat(per_company_frames)
    combined[TYPE_OF_COMPANY_FEATURE] = type

    return combined

In [5]:
# Build the combined frame for the international group, save it, and preview it.
international_final_df = combine_dataset(
    companies=INTERNATIONAL_COMPANIES,
    type=INTERNATIONAL_TYPE,
)
international_final_df.to_excel(excel_writer='./data/international.xlsx', index=False)

international_final_df

Unnamed: 0,Date,Message,Profile,Network,Engagement,Post interaction rate,Number of comments,"Reactions, Comments & Shares",Number of Likes,Message-ID,Profile-ID,Link,type_of_company
0,2025-03-17 19:01:17,??Ontvang nu extra vroegboekkorting tot €200 é...,Corendon,INSTAGRAM,0.001174,0.001174,3,185,182,18038388467208858,1466583220,https://www.instagram.com/reel/DHT7mD7PW0A/,international
1,2025-03-16 14:00:14,"Zon, zee en eindeloze siesta’s… ????☀️ In Span...",Corendon,INSTAGRAM,0.000584,0.000584,0,92,92,18039075278448488,1466583220,https://www.instagram.com/p/DHQ0Zedsckb/,international
2,2025-03-16 09:00:39,"☀️ Welkom bij Voyage Belek Golf & Spa, waar ee...",Corendon,INSTAGRAM,0.000838,0.000838,1,132,131,18085134403543445,1466583220,https://www.instagram.com/reel/DHQSGJMB3Nl/,international
3,2025-03-15 09:00:32,Hotel Diamond Excellence Resort & Spa in Side ...,Corendon,INSTAGRAM,0.001041,0.001041,0,164,164,17842557165447487,1466583220,https://www.instagram.com/p/DHNtTrRsRIj/,international
4,2025-03-14 13:01:48,Last-Minute Deal: €70 Korting op het Ladies of...,Corendon,INSTAGRAM,0.001612,0.001612,5,254,249,18046848881361680,1466583220,https://www.instagram.com/p/DHLkIA1sQxy/,international
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1626,2014-08-05 09:33:06,Beleef jij deze zomer een ultiem #elizamomentj...,Eliza was here,INSTAGRAM,-,-,5,20,15,17842862974034381,1258146380,https://www.instagram.com/p/rT8t-nlZRg/,international
1627,2014-08-04 16:54:05,Lig jij deze zomer hier? Je hebt nog tot morge...,Eliza was here,INSTAGRAM,-,-,4,19,15,17841991168034381,1258146380,https://www.instagram.com/p/rSKY5wFZcT/,international
1628,2014-07-30 17:08:00,Puur genieten op een ligbedje bij het zwembad....,Eliza was here,INSTAGRAM,-,-,1,16,15,17842038979034381,1258146380,https://www.instagram.com/p/rFUAgoFZV-/,international
1629,2014-07-29 16:28:38,"Ken je dat gevoel? Je haren in de wind, je voe...",Eliza was here,INSTAGRAM,-,-,9,22,13,17841828277034381,1258146380,https://www.instagram.com/p/rCqtQHlZRN/,international


In [6]:
# Build the combined frame for the local group, save it, and preview it.
local_final_df = combine_dataset(
    companies=LOCAL_COMPANIES,
    type=LOCAL_TYPE,
)
local_final_df.to_excel(excel_writer='./data/local.xlsx', index=False)

local_final_df

Unnamed: 0,Date,Message,Profile,Network,Engagement,Post interaction rate,Number of comments,"Reactions, Comments & Shares",Number of Likes,Message-ID,Profile-ID,Link,type_of_company
0,2025-03-01 09:04:43,Discover Nijmegen: The Oldest City in the Neth...,Accessible Travel Netherlands,INSTAGRAM,0.008539,0.008539,0,9,9,18052737773131838,43939331,https://www.instagram.com/p/DGpqqf8o9yE/,local
1,2025-02-05 11:14:13,Amsterdam should be at the top of your travel ...,Accessible Travel Netherlands,INSTAGRAM,-,-,1,26,25,18101839216492258,43939331,https://www.instagram.com/p/DFsGZ3voYrG/,local
2,2025-01-31 15:32:04,The Veluwe ❤️\n\nExplore one of the most stunn...,Accessible Travel Netherlands,INSTAGRAM,-,-,0,5,5,18090951523529057,43939331,https://www.instagram.com/p/DFfr8IVoPZf/,local
3,2025-01-29 16:31:20,Experience the Traditional Gouda Cheese Market...,Accessible Travel Netherlands,INSTAGRAM,-,-,0,9,9,18060269164931558,43939331,https://www.instagram.com/p/DFapIbBoaj7/,local
4,2025-01-26 16:20:51,"At Accessible Travel Netherlands, we offer 10 ...",Accessible Travel Netherlands,INSTAGRAM,-,-,0,5,5,18051201320326350,43939331,https://www.instagram.com/p/DFS5jAbIHIC/,local
...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,2019-08-23 09:39:08,Heeft u mooie foto’s van uw eigen cruise gemaa...,Cruise Travel,INSTAGRAM,-,-,0,22,22,17868232276456526,5621753329,https://www.instagram.com/p/B1gHPoyCygq/,local
180,2019-07-15 14:54:46,Onze ambassadeur Henk van der Noort heeft een ...,Cruise Travel,INSTAGRAM,-,-,2,26,24,17847898231507730,5621753329,https://www.instagram.com/p/Bz8QXReChhs/,local
181,2019-07-01 10:31:44,Onze ambassadeur A. Prevo ging aan boord van d...,Cruise Travel,INSTAGRAM,-,-,0,23,23,18079004770055863,5621753329,https://www.instagram.com/p/BzXvIioiGI2/,local
182,2019-06-20 08:12:23,Verschillende reisbureaus op avontuur naar Ala...,Cruise Travel,INSTAGRAM,-,-,3,49,46,18049625401089614,5621753329,https://www.instagram.com/p/By7KcTpiXN4/,local


In [7]:
# Stack both groups (international rows first), save the result, and preview it.
both_groups = [international_final_df, local_final_df]
final_df = pd.concat(both_groups)
final_df.to_excel(excel_writer='./final.xlsx', index=False)
final_df

Unnamed: 0,Date,Message,Profile,Network,Engagement,Post interaction rate,Number of comments,"Reactions, Comments & Shares",Number of Likes,Message-ID,Profile-ID,Link,type_of_company
0,2025-03-17 19:01:17,??Ontvang nu extra vroegboekkorting tot €200 é...,Corendon,INSTAGRAM,0.001174,0.001174,3,185,182,18038388467208858,1466583220,https://www.instagram.com/reel/DHT7mD7PW0A/,international
1,2025-03-16 14:00:14,"Zon, zee en eindeloze siesta’s… ????☀️ In Span...",Corendon,INSTAGRAM,0.000584,0.000584,0,92,92,18039075278448488,1466583220,https://www.instagram.com/p/DHQ0Zedsckb/,international
2,2025-03-16 09:00:39,"☀️ Welkom bij Voyage Belek Golf & Spa, waar ee...",Corendon,INSTAGRAM,0.000838,0.000838,1,132,131,18085134403543445,1466583220,https://www.instagram.com/reel/DHQSGJMB3Nl/,international
3,2025-03-15 09:00:32,Hotel Diamond Excellence Resort & Spa in Side ...,Corendon,INSTAGRAM,0.001041,0.001041,0,164,164,17842557165447487,1466583220,https://www.instagram.com/p/DHNtTrRsRIj/,international
4,2025-03-14 13:01:48,Last-Minute Deal: €70 Korting op het Ladies of...,Corendon,INSTAGRAM,0.001612,0.001612,5,254,249,18046848881361680,1466583220,https://www.instagram.com/p/DHLkIA1sQxy/,international
...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,2019-08-23 09:39:08,Heeft u mooie foto’s van uw eigen cruise gemaa...,Cruise Travel,INSTAGRAM,-,-,0,22,22,17868232276456526,5621753329,https://www.instagram.com/p/B1gHPoyCygq/,local
180,2019-07-15 14:54:46,Onze ambassadeur Henk van der Noort heeft een ...,Cruise Travel,INSTAGRAM,-,-,2,26,24,17847898231507730,5621753329,https://www.instagram.com/p/Bz8QXReChhs/,local
181,2019-07-01 10:31:44,Onze ambassadeur A. Prevo ging aan boord van d...,Cruise Travel,INSTAGRAM,-,-,0,23,23,18079004770055863,5621753329,https://www.instagram.com/p/BzXvIioiGI2/,local
182,2019-06-20 08:12:23,Verschillende reisbureaus op avontuur naar Ala...,Cruise Travel,INSTAGRAM,-,-,3,49,46,18049625401089614,5621753329,https://www.instagram.com/p/By7KcTpiXN4/,local


# Statistical testing

## Library

In [8]:
# Default significance level (alpha) for the statistical tests below; used as
# the default `significance_level` of TwoSampleStatisticalTests.
SIGNIFICANT_LEVEL = 0.05

In [9]:
class TwoSampleStatisticalTests:
    """Compare two independent numerical samples with an automatically chosen test.

    The samples are first checked for normality and for equal variances. If
    both checks pass, an independent two-sample t-test is used; otherwise the
    Mann-Whitney U test is used. Intermediate and final p-values are printed.

    Args:
        first_dataset: First sample of numerical observations.
        second_dataset: Second sample of numerical observations.
        significance_level: Threshold applied to every p-value produced by this
            object (assumption checks and the final test).
    """

    def __init__(self, first_dataset: pd.Series, second_dataset: pd.Series, significance_level=SIGNIFICANT_LEVEL):
        self._first_dataset = first_dataset
        self._second_dataset = second_dataset
        self._significance_level = significance_level

    def _check_normality(self) -> bool:
        """Check whether both samples are compatible with a normal distribution.

        H0: "The data is normally distributed"; Ha: "The data is not normally
        distributed". The check uses ``scipy.stats.normaltest`` (the
        D'Agostino-Pearson omnibus test), not Shapiro-Wilk.

        Returns:
            True when H0 is not rejected for either sample at the configured
            significance level.
        """
        _, pvalue_first = stats.normaltest(self._first_dataset)
        _, pvalue_second = stats.normaltest(self._second_dataset)

        print(f"The result of the p-value when checking the normality for the first dataset: {pvalue_first}")
        print(f"The result of the p-value when checking the normality for the second dataset: {pvalue_second}")

        # Use the level given to the constructor rather than the module-level
        # default, so a custom significance level applies to this check as well.
        alpha = self._significance_level
        return bool(pvalue_first >= alpha and pvalue_second >= alpha)

    def _check_variance_homogeneity(self) -> bool:
        """Check whether the two samples have equal variances (Levene's test).

        H0: "The variances of the datasets are homogeneous"; Ha: "The variances
        of the datasets are different".

        Returns:
            True when H0 is not rejected at the configured significance level.
        """
        _, pvalue = stats.levene(self._first_dataset, self._second_dataset)

        print(f"The result of the p-value when checking the variance uniform: {pvalue}")

        return bool(pvalue >= self._significance_level)

    def _t_test(self, is_one_tailed: bool = False) -> float:
        """Run an independent two-sample t-test assuming equal variances.

        Args:
            is_one_tailed: When True, return the one-sided p-value for the
                alternative "mean of the first sample is greater than the mean
                of the second sample"; otherwise return the two-sided p-value.

        Returns:
            The p-value of the requested test.
        """
        t_stat, p_value = stats.ttest_ind(self._first_dataset, self._second_dataset, equal_var=True)

        if not is_one_tailed:
            return p_value

        # Convert the two-sided p-value into the one-sided "greater" p-value:
        # half of it when the statistic points in the tested direction,
        # otherwise the complement of that half.
        half = p_value / 2
        return half if t_stat > 0 else 1 - half

    def _mann_whitney_test(self, is_one_tailed: bool = False) -> float:
        """Run the Mann-Whitney U test on the two samples.

        Args:
            is_one_tailed: When True, use the ``'greater'`` alternative (first
                sample versus second); otherwise use ``'two-sided'``.

        Returns:
            The p-value of the requested test.
        """
        alternative = 'greater' if is_one_tailed else 'two-sided'
        _, p_value = mannwhitneyu(self._first_dataset, self._second_dataset, alternative=alternative)
        return p_value

    def test_two_numerical_samples(self, is_one_tailed: bool = False):
        """Pick the appropriate test, run it, and print the decision.

        The t-test is chosen only when both samples pass the normality check
        and the variances are homogeneous; in every other case the
        Mann-Whitney U test is chosen.

        Args:
            is_one_tailed: Forwarded to the chosen test; True requests the
                one-sided "first greater than second" alternative.
        """
        is_normally_distributed = self._check_normality()
        is_variance_homogeneous = self._check_variance_homogeneity()

        if is_normally_distributed and is_variance_homogeneous:
            print("T-test was chosen")
            p_value = self._t_test(is_one_tailed)
        else:
            print("Mann-Whitney was chosen")
            p_value = self._mann_whitney_test(is_one_tailed)

        print(f"Final p-value: {p_value}")

        if p_value <= self._significance_level:
            print("H0 has been rejected, Ha has been accepted")
        else:
            print("H0 was not rejected")

## Local vs international companies in comments

In [10]:
# Comment counts of the international group's posts.
international_comments = international_final_df.loc[:, 'Number of comments']
international_comments

0       3
1       0
2       1
3       0
4       5
       ..
1626    5
1627    4
1628    1
1629    9
1630    0
Name: Number of comments, Length: 17627, dtype: int64

In [11]:
# Comment counts of the local group's posts.
local_comments = local_final_df.loc[:, 'Number of comments']
local_comments

0      0
1      1
2      0
3      0
4      0
      ..
179    0
180    2
181    0
182    3
183    1
Name: Number of comments, Length: 3525, dtype: int64

In [12]:
# One-tailed comparison of comment counts: international (first) vs local (second).
comments_test_tool = TwoSampleStatisticalTests(
    first_dataset=international_comments,
    second_dataset=local_comments,
)
comments_test_tool.test_two_numerical_samples(is_one_tailed=True)

The result of the p-value when checking the normality for the first dataset: 0.0
The result of the p-value when checking the normality for the second dataset: 0.0
The result of the p-value when checking the variance uniform: 0.0008172175038198599
Mann-Whitney was chosen
Final p-value: 0.0
H0 has been rejected, Ha has been accepted


## Local vs international companies in likes

Extracting the likes

In [13]:
# Like counts of the international group's posts (still unfiltered at this point).
international_likes = international_final_df.loc[:, 'Number of Likes']
international_likes

0       182
1        92
2       131
3       164
4       249
       ... 
1626     15
1627     15
1628     15
1629     13
1630     10
Name: Number of Likes, Length: 17627, dtype: object

In [14]:
# Like counts of the local group's posts (still unfiltered at this point).
local_likes = local_final_df.loc[:, 'Number of Likes']
local_likes

0       9
1      25
2       5
3       9
4       5
       ..
179    22
180    24
181    23
182    46
183    16
Name: Number of Likes, Length: 3525, dtype: object

Dropping the empty (non-integer) values

In [15]:
# Keep only the entries that are Python ints, then cast them with int().
international_is_int = international_likes.apply(lambda value: isinstance(value, int))
international_likes = international_likes.loc[international_is_int].apply(int)
international_likes

0       182
1        92
2       131
3       164
4       249
       ... 
1626     15
1627     15
1628     15
1629     13
1630     10
Name: Number of Likes, Length: 17591, dtype: int64

In [16]:
# Keep only the entries that are Python ints, then cast them with int().
local_is_int = local_likes.apply(lambda value: isinstance(value, int))
local_likes = local_likes.loc[local_is_int].apply(int)
local_likes

0       9
1      25
2       5
3       9
4       5
       ..
179    22
180    24
181    23
182    46
183    16
Name: Number of Likes, Length: 3466, dtype: int64

Test

In [17]:
# One-tailed comparison of like counts: international (first) vs local (second).
likes_test_tool = TwoSampleStatisticalTests(
    first_dataset=international_likes,
    second_dataset=local_likes,
)
likes_test_tool.test_two_numerical_samples(is_one_tailed=True)

The result of the p-value when checking the normality for the first dataset: 0.0
The result of the p-value when checking the normality for the second dataset: 0.0
The result of the p-value when checking the variance uniform: 3.053322439386708e-65
Mann-Whitney was chosen
Final p-value: 0.0
H0 has been rejected, Ha has been accepted
