In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [53]:
df = pd.read_csv("../data/skylab_instagram_datathon_dataset.csv", sep=";")

df.shape

(704313, 15)

In [54]:
# information about the data before processing it -> run again after processing
df.describe(include='all')

Unnamed: 0,period,period_end_date,compset_group,compset,business_entity_doing_business_as_name,legal_entity_name,domicile_country_name,ultimate_parent_legal_entity_name,primary_exchange_name,calculation_type,followers,pictures,videos,comments,likes
count,704313,704313,704313,704313,704313,676558,458589,676558,458589,704313,579258.0,695803.0,684349.0,695343.0,695977.0
unique,1,455,20,54,706,423,26,401,30,1,,,,,
top,Weekly,2023-07-08,Luxury & Premium & Mainstream,US Softlines Analyst Interest List,All Brands,LVMH Moet Hennessy Louis Vuitton SE,United States of America,LVMH Moet Hennessy Louis Vuitton SE,New York Stock Exchange,Metric Value,,,,,
freq,704313,1635,216241,113744,27755,27576,193509,27576,132071,704313,,,,,
mean,,,,,,,,,,,13424320.0,344.27254,61.244426,30432.46,2163189.0
std,,,,,,,,,,,110697500.0,2777.396873,564.46848,276645.9,18031930.0
min,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0
25%,,,,,,,,,,,301337.2,19.0,2.0,371.0,17277.0
50%,,,,,,,,,,,1104144.0,44.0,7.0,1435.0,93184.0
75%,,,,,,,,,,,4479325.0,122.0,20.0,5872.0,457747.0


# Process data

## Data cleaing

### Replace

In [55]:
df = df.map(lambda x: x.strip().lower() if isinstance(x, str) else x)

In [56]:
# Converting the 'period_end_of_week' column to datetime
df['period_end_date'] = pd.to_datetime(df['period_end_date'])

In [57]:
# REPLACE DOMICILE_COUNTRY_NAME

# replace the china;hong kong with hong kong
df['domicile_country_name'] = df['domicile_country_name'].replace('china;hong kong', 'hong kong')

# remove the sign ";" in column domicile_country_name
df['domicile_country_name'] = df['domicile_country_name'].str.replace(';', '')

# if empty, fill with nan
df['domicile_country_name'] = df['domicile_country_name'].replace('', np.nan)

In [None]:
# REPLACE DOMICILE_COUNTRY_NAME

# replace the china;hong kong with hong kong
df['domicile_country_name'] = df['domicile_country_name'].replace('china;hong kong', 'hong kong')

# remove the sign ";" in column domicile_country_name
df['domicile_country_name'] = df['domicile_country_name'].str.replace(';', '')

# if empty, fill with nan
df['domicile_country_name'] = df['domicile_country_name'].replace('', np.nan)

In [None]:
# REPLACE PRIMARY_EXCHANGE_NAME

# remove the sign ";" in column domicile_country_name
df['primary_exchange_name'] = df['primary_exchange_name'].str.replace(';', '')

# if empty, fill with nan
df['primary_exchange_name'] = df['primary_exchange_name'].replace('', np.nan)

### Remove

In [58]:
# Remove "All_Brands" and "Don't Use it"
df = df[df["business_entity_doing_business_as_name"] != "All Brands"]
# TODO: Remove "Don't Use it"

In [59]:
# remove unnecessary columns
# Here: 'period', 'calculation_type'
df = df.drop(columns=['period', 'calculation_type'])

In [60]:
# certain entries have exact data except of "compset"
# we want to take the union of all of them

grouping_columns = [col for col in df.columns if col != 'compset']

df.fillna('Group_Null', inplace=True)
result = df.groupby(grouping_columns).agg({'compset': lambda x: set(x)}).reset_index()
df = result
df.replace('Group_Null', np.nan, inplace=True)


# df.describe(include='all')

# to test
#result[(result["business_entity_doing_business_as_name"] == "24S") & (result["period_end_date"] == "2017-05-13")]
#result[(result["business_entity_doing_business_as_name"] == "dd's Discounts") & (result["period_end_date"] == "2023-09-09")]
#result[(result["business_entity_doing_business_as_name"] == "dd's Discounts") & (result["period_end_date"] == "2023-09-16")]

  df.fillna('Group_Null', inplace=True)
  df.replace('Group_Null', np.nan, inplace=True)


In [61]:
result.shape

(325732, 13)

## Add features

In [62]:
# split the date into year, month, day
df['Year'] = df['period_end_date'].dt.year
df['Month'] = df['period_end_date'].dt.month
df['Day'] = df['period_end_date'].dt.day

In [63]:
# get the difference between the current date and the previous date
df = df.sort_values(by=['business_entity_doing_business_as_name', 'period_end_date'])
df['date_diff_prev'] = df['period_end_date'].diff().dt.days
df['date_diff_prev'] = df['date_diff_prev'].fillna(7)

In [64]:
#df[df['date_diff_prev'] == 0]

In [66]:
df.describe(include='all')

Unnamed: 0,period_end_date,compset_group,business_entity_doing_business_as_name,legal_entity_name,domicile_country_name,ultimate_parent_legal_entity_name,primary_exchange_name,followers,pictures,videos,comments,likes,compset,Year,Month,Day,date_diff_prev
count,325732,325732,325732,298040,198093,298040,198433,261595.0,321826.0,315947.0,321608.0,321938.0,325732,325732.0,325732.0,325732.0,325732.0
unique,,20,706,423,22,401,30,,,,,,116,,,,
top,,apparel retail,all brands,lvmh moet hennessy louis vuitton se,united states of america,lvmh moet hennessy louis vuitton se,new york stock exchange,,,,,,{beauty & boutique},,,,
freq,,74801,27692,9632,86036,9632,58937,,,,,,44266,,,,
mean,2019-07-22 20:01:03.306030336,,,,,,,23529770.0,606.507936,110.170576,52121.07,3717640.0,,2019.065695,6.413806,15.668399,0.007156
min,2015-01-03 00:00:00,,,,,,,0.0,0.0,0.0,0.0,0.0,,2015.0,1.0,1.0,-3178.0
25%,2017-06-24 00:00:00,,,,,,,227757.0,18.0,2.0,303.0,11254.0,,2017.0,3.0,8.0,7.0
50%,2019-08-17 00:00:00,,,,,,,943314.0,46.0,8.0,1362.0,70553.5,,2019.0,6.0,16.0,7.0
75%,2021-09-04 00:00:00,,,,,,,4225668.0,143.0,26.0,6480.0,403165.0,,2021.0,9.0,23.0,7.0
max,2023-09-16 00:00:00,,,,,,,3502565000.0,141746.0,35905.0,17320460.0,712071100.0,,2023.0,12.0,31.0,7.0
