In [221]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from importlib import reload

import process_gini as pg

In [222]:
df = pd.read_csv("../data/skylab_instagram_datathon_dataset.csv", sep=";")

df.shape

(704313, 15)

In [223]:
# information about the data before processing it -> run again after processing
df.describe(include='all')

Unnamed: 0,period,period_end_date,compset_group,compset,business_entity_doing_business_as_name,legal_entity_name,domicile_country_name,ultimate_parent_legal_entity_name,primary_exchange_name,calculation_type,followers,pictures,videos,comments,likes
count,704313,704313,704313,704313,704313,676558,458589,676558,458589,704313,579258.0,695803.0,684349.0,695343.0,695977.0
unique,1,455,20,54,706,423,26,401,30,1,,,,,
top,Weekly,2023-07-08,Luxury & Premium & Mainstream,US Softlines Analyst Interest List,All Brands,LVMH Moet Hennessy Louis Vuitton SE,United States of America,LVMH Moet Hennessy Louis Vuitton SE,New York Stock Exchange,Metric Value,,,,,
freq,704313,1635,216241,113744,27755,27576,193509,27576,132071,704313,,,,,
mean,,,,,,,,,,,13424320.0,344.27254,61.244426,30432.46,2163189.0
std,,,,,,,,,,,110697500.0,2777.396873,564.46848,276645.9,18031930.0
min,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0
25%,,,,,,,,,,,301337.2,19.0,2.0,371.0,17277.0
50%,,,,,,,,,,,1104144.0,44.0,7.0,1435.0,93184.0
75%,,,,,,,,,,,4479325.0,122.0,20.0,5872.0,457747.0


# Process data

## Data cleaing

### Replace

In [224]:
# READJUST STRINGS

# remove leading and trailing whitespaces and convert to lowercase
df = df.map(lambda x: x.strip().lower() if isinstance(x, str) else x)

# replace all double whitespaces with single whitespaces
df = df.map(lambda x: x.replace("  ", " ") if isinstance(x, str) else x)

In [225]:
# Converting the 'period_end_of_week' column to datetime
df['period_end_date'] = pd.to_datetime(df['period_end_date'])

In [226]:
# REPLACE DOMICILE_COUNTRY_NAME

# replace the china;hong kong with hong kong
df['domicile_country_name'] = df['domicile_country_name'].replace('china;hong kong', 'hong kong')

# remove the sign ";" in column domicile_country_name
df['domicile_country_name'] = df['domicile_country_name'].str.replace(';', '')

# if empty, fill with nan
df['domicile_country_name'] = df['domicile_country_name'].replace('', np.nan)

In [227]:
# REPLACE PRIMARY_EXCHANGE_NAME

# remove the sign ";" in column domicile_country_name
df['primary_exchange_name'] = df['primary_exchange_name'].str.replace(';', '')

# if empty, fill with nan
df['primary_exchange_name'] = df['primary_exchange_name'].replace('', np.nan)

In [228]:
# REPLACE ULTIMATE_PARENT_LEGAL_ENTITY_NAME
df['ultimate_parent_legal_entity_name'] = df['ultimate_parent_legal_entity_name'].replace('Anheuser-Busch;Anheuser-Busch', 'Anheuser-Busch')

### Remove

In [229]:
# Remove "All_Brands" and "Don't Use it"
df = df[df["business_entity_doing_business_as_name"] != "all brands"]
# TODO: Remove "Don't Use it"

In [230]:
# remove unnecessary columns
# Here: 'period', 'calculation_type'
df = df.drop(columns=['period', 'calculation_type'])

In [231]:
# certain entries have exact data except of "compset"
# we want to take the union of all of them

grouping_columns = [col for col in df.columns if col != 'compset']

df.fillna('Group_Null', inplace=True)
result = df.groupby(grouping_columns).agg({'compset': lambda x: set(x)}).reset_index()
df = result
df.replace('Group_Null', np.nan, inplace=True)


# df.describe(include='all')

# to test
#result[(result["business_entity_doing_business_as_name"] == "24S") & (result["period_end_date"] == "2017-05-13")]
#result[(result["business_entity_doing_business_as_name"] == "dd's Discounts") & (result["period_end_date"] == "2023-09-09")]
#result[(result["business_entity_doing_business_as_name"] == "dd's Discounts") & (result["period_end_date"] == "2023-09-16")]

  df.fillna('Group_Null', inplace=True)
  df.replace('Group_Null', np.nan, inplace=True)


In [232]:
result.shape

(298040, 13)

## Add features

In [233]:
# split the date into year, month, day
df['Year'] = df['period_end_date'].dt.year
df['Month'] = df['period_end_date'].dt.month
df['Day'] = df['period_end_date'].dt.day

In [234]:
# get the difference between the current date and the previous date
df = df.sort_values(by=['business_entity_doing_business_as_name', 'period_end_date'])
df['date_diff_prev'] = df['period_end_date'].diff().dt.days
df['date_diff_prev'] = df['date_diff_prev'].fillna(7)

In [235]:
#df[df['date_diff_prev'] == 0]

In [236]:
# total involvement
df["total_involvement"] = df["comments"] + df["likes"]
df["total_company_activity"] = df["pictures"] + df["videos"]

df["conversion_rate_total"] = df["total_involvement"] / df["followers"]

# COntent type
df["ratio_of_videos"] = df["videos"] / (df["pictures"] + df["videos"])
df["ratio_of_pictures"] = df["pictures"] / (df["pictures"] + df["videos"])

# ASSUMTION: we only like videos / photos from this week
df["likes_per_picture"] = df["likes"] / df["pictures"]
df["likes_per_video"] =   df["likes"] / df["videos"]
df["comments_per_picture"] =  df["comments"] / df["pictures"] 
df["comments_per_video"] =   df["comments"] / df["videos"]

# take care of zeros
df.loc[df["pictures"] + df["videos"] == 0, "ratio_of_videos"] = np.nan
df.loc[df["pictures"] + df["videos"] == 0, "ratio_of_pictures"] = np.nan

df.loc[df["pictures"] == 0, "likes_per_picture"] = np.nan
df.loc[df["videos"] == 0, "likes_per_video"] = np.nan
df.loc[df["pictures"] == 0, "comments_per_picture"] = np.nan
df.loc[df["videos"] == 0, "comments_per_video"] = np.nan

In [246]:
# ADD GINI
reload(pg)

df_gini = pg.process_gini(df)
df = pd.merge(df, df_gini, left_on=['domicile_country_name', 'Year'], right_on=['Country Name', 'Year'], how="left")

In [247]:
result

Unnamed: 0,period_end_date,compset_group,business_entity_doing_business_as_name,legal_entity_name,domicile_country_name,ultimate_parent_legal_entity_name,primary_exchange_name,followers,pictures,videos,...,total_company_activity,conversion_rate_total,ratio_of_videos,ratio_of_pictures,likes_per_picture,likes_per_video,comments_per_picture,comments_per_video,Country Name,Gene Index
0,2017-05-06,apparel retail,24s,lvmh moet hennessy louis vuitton se,france,lvmh moet hennessy louis vuitton se,euronext paris,,,,...,,,,,,,,,france,31.6
1,2017-05-13,apparel retail,24s,lvmh moet hennessy louis vuitton se,france,lvmh moet hennessy louis vuitton se,euronext paris,,6.0,3.0,...,9.0,,0.333333,0.666667,294.166667,588.333333,9.500000,19.000000,france,31.6
2,2017-05-20,apparel retail,24s,lvmh moet hennessy louis vuitton se,france,lvmh moet hennessy louis vuitton se,euronext paris,,6.0,3.0,...,9.0,,0.333333,0.666667,294.166667,588.333333,9.500000,19.000000,france,31.6
3,2017-05-27,apparel retail,24s,lvmh moet hennessy louis vuitton se,france,lvmh moet hennessy louis vuitton se,euronext paris,,6.0,3.0,...,9.0,,0.333333,0.666667,294.166667,588.333333,9.500000,19.000000,france,31.6
4,2017-06-03,apparel retail,24s,lvmh moet hennessy louis vuitton se,france,lvmh moet hennessy louis vuitton se,euronext paris,,24.0,3.0,...,27.0,,0.111111,0.888889,163.416667,1307.333333,4.541667,36.333333,france,31.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298035,2023-08-19,luxury & premium & mainstream,zenith,lvmh moet hennessy louis vuitton se,france,lvmh moet hennessy louis vuitton se,euronext paris,810478.0,22.0,3.0,...,25.0,0.074188,0.120000,0.880000,2703.590909,19826.333333,29.500000,216.333333,france,
298036,2023-08-26,luxury & premium & mainstream,zenith,lvmh moet hennessy louis vuitton se,france,lvmh moet hennessy louis vuitton se,euronext paris,811247.0,23.0,2.0,...,25.0,0.061547,0.080000,0.920000,2153.956522,24770.500000,16.913043,194.500000,france,
298037,2023-09-02,luxury & premium & mainstream,zenith,lvmh moet hennessy louis vuitton se,france,lvmh moet hennessy louis vuitton se,euronext paris,811677.0,21.0,3.0,...,24.0,0.064762,0.125000,0.875000,2484.523810,17391.666667,18.619048,130.333333,france,
298038,2023-09-09,luxury & premium & mainstream,zenith,lvmh moet hennessy louis vuitton se,france,lvmh moet hennessy louis vuitton se,euronext paris,812105.0,21.0,3.0,...,24.0,0.065036,0.125000,0.875000,2494.809524,17463.666667,20.238095,141.666667,france,


In [239]:
df.describe()

Unnamed: 0,period_end_date,followers,pictures,videos,comments,likes,Year,Month,Day,date_diff_prev,total_involvement,total_company_activity,conversion_rate_total,ratio_of_videos,ratio_of_pictures,likes_per_picture,likes_per_video,comments_per_picture,comments_per_video
count,298040,236313.0,294176.0,288438.0,293958.0,294288.0,298040.0,298040.0,298040.0,298040.0,293958.0,288326.0,232915.0,283921.0,283921.0,287213.0,246854.0,286959.0,246689.0
mean,2019-07-29 07:54:48.170714880,3795494.0,119.606086,24.83187,9528.576,630269.7,2019.083328,6.415666,15.666827,0.007821,640505.3,146.52193,0.189906,0.179683,0.820317,7431.102,72851.43,100.068852,915.4792
min,2015-01-03 00:00:00,0.0,0.0,0.0,0.0,0.0,2015.0,1.0,1.0,-3178.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2017-07-08 00:00:00,199747.0,16.0,2.0,260.0,9426.0,2017.0,4.0,8.0,7.0,10104.25,22.0,0.044883,0.048077,0.75,326.8333,1700.13,8.943095,50.57143
50%,2019-08-24 00:00:00,783455.0,39.0,6.0,1100.0,56891.5,2019.0,6.0,16.0,7.0,59116.0,49.0,0.106937,0.129032,0.870968,1127.333,7155.862,24.54902,147.8571
75%,2021-09-04 00:00:00,2990059.0,102.0,19.0,4503.0,282368.5,2021.0,9.0,23.0,7.0,290549.8,126.0,0.225776,0.25,0.951923,4771.083,35821.8,72.648843,477.1818
max,2023-09-16 00:00:00,430177000.0,10498.0,1910.0,3179575.0,120689700.0,2023.0,12.0,31.0,7.0,120990700.0,11837.0,17.680733,1.0,1.0,1660860.0,34880180.0,32546.791667,1218564.0
std,,13232780.0,265.156184,72.078397,53947.51,2529721.0,2.456062,3.410693,8.787733,145.318041,2552247.0,312.947129,0.281915,0.181286,0.181286,23428.08,366837.1,425.596458,6294.418


In [240]:
sorted(list(df["domicile_country_name"].value_counts().keys()))

df["domicile_country_name"].value_counts()

domicile_country_name
united states of america                                86036
france                                                  26956
switzerland                                             13625
united kingdom of great britain and northern ireland    10884
brazil                                                   9999
japan                                                    8532
hong kong                                                8342
belgium                                                  6725
germany                                                  5248
australia                                                4004
spain                                                    3185
netherlands                                              3140
italy                                                    3033
canada                                                   2656
new zealand                                              1742
sweden                                          