# Data Preparation Notebook

<div class="alert alert-block alert-warning">
The objective of this notebook is to create the dataset, used subsequenially in the analysis & data visualizaion
</div>


1. Data standardisation
    1. text
        - lowercase
        - ltrim
2. Data cleaning
    1. encoding as missing values
    2. outlier detection
3. Feature creation
    1. conversion rate
    2. competition in the market
    3. miscellaneous
4. Data enhancement
    1. stock prices
    2. country value

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from importlib import reload
import os

import process_gini as pg
import plots

In [10]:
path = "../individualNotebooks/skylab_instagram_datathon_dataset.csv"  # ASY local path
# path = "../data/skylab_instagram_datathon_dataset.csv" # gen
df = pd.read_csv(path, sep=";")
# df = pd.read_csv(path.replace("/", os.sep), sep=";") # for Windows

# Process data

## Data standardisation

In [11]:
# READJUST STRINGS

# remove leading and trailing whitespaces and convert to lowercase
df = df.map(lambda x: x.strip().lower() if isinstance(x, str) else x)

# replace all double whitespaces with single whitespaces
df = df.map(lambda x: x.replace("  ", " ") if isinstance(x, str) else x)

AttributeError: 'DataFrame' object has no attribute 'map'

In [12]:
# Converting the 'period_end_of_week' column to datetime
df['period_end_date'] = pd.to_datetime(df['period_end_date'])

In [13]:
# REPLACE DOMICILE_COUNTRY_NAME

# replace the china;hong kong with hong kong
df['domicile_country_name'] = df['domicile_country_name'].replace('china;hong kong', 'hong kong')

# remove the sign ";" in column domicile_country_name
df['domicile_country_name'] = df['domicile_country_name'].str.replace(';', '')

# if empty, fill with nan
df['domicile_country_name'] = df['domicile_country_name'].replace('', np.nan)

In [14]:
# REPLACE PRIMARY_EXCHANGE_NAME

# remove the sign ";" in column domicile_country_name
df['primary_exchange_name'] = df['primary_exchange_name'].str.replace(';', '')

# if empty, fill with nan
df['primary_exchange_name'] = df['primary_exchange_name'].replace('', np.nan)

In [15]:
# REPLACE ULTIMATE_PARENT_LEGAL_ENTITY_NAME
df['ultimate_parent_legal_entity_name'] = df['ultimate_parent_legal_entity_name'].replace('Anheuser-Busch;Anheuser-Busch', 'Anheuser-Busch')

### Remove

In [9]:
# Remove "All_Brands" and "Don't Use it"
df = df[df["business_entity_doing_business_as_name"] != "all brands"]
# TODO: Remove "Don't Use it"

In [10]:
# remove unnecessary columns
# Here: 'period', 'calculation_type'
df = df.drop(columns=['period', 'calculation_type'])

In [11]:
# certain entries have exact data except of "compset"
# we want to take the union of all of them

grouping_columns = [col for col in df.columns if col != 'compset']

df.fillna('Group_Null', inplace=True)
result = df.groupby(grouping_columns).agg({'compset': lambda x: set(x)}).reset_index()
df = result
df.replace('Group_Null', np.nan, inplace=True)


# df.describe(include='all')

# to test
#result[(result["business_entity_doing_business_as_name"] == "24S") & (result["period_end_date"] == "2017-05-13")]
#result[(result["business_entity_doing_business_as_name"] == "dd's Discounts") & (result["period_end_date"] == "2023-09-09")]
#result[(result["business_entity_doing_business_as_name"] == "dd's Discounts") & (result["period_end_date"] == "2023-09-16")]

  df.fillna('Group_Null', inplace=True)
  df.replace('Group_Null', np.nan, inplace=True)


In [12]:
result.shape

(298040, 13)

## Add features

In [13]:
# split the date into year, month, day
df['Year'] = df['period_end_date'].dt.year
df['Month'] = df['period_end_date'].dt.month
df['Day'] = df['period_end_date'].dt.day
df['Weekday'] = df['date'].dt.dayofweek

In [14]:
# get the difference between the current date and the previous date
df = df.sort_values(by=['business_entity_doing_business_as_name', 'period_end_date'])
df['date_diff_prev'] = df['period_end_date'].diff().dt.days
df['date_diff_prev'] = df['date_diff_prev'].fillna(7)

In [15]:
# total involvement
df["total_involvement"] = df["comments"] + df["likes"]
df["total_company_activity"] = df["pictures"] + df["videos"]

df["conversion_rate_total"] = df["total_involvement"] / df["followers"]

# COntent type
df["ratio_of_videos"] = df["videos"] / (df["pictures"] + df["videos"])
df["ratio_of_pictures"] = df["pictures"] / (df["pictures"] + df["videos"])

# ASSUMTION: we only like videos / photos from this week
df["likes_per_picture"] = df["likes"] / df["pictures"]
df["likes_per_video"] =   df["likes"] / df["videos"]
df["comments_per_picture"] =  df["comments"] / df["pictures"] 
df["comments_per_video"] =   df["comments"] / df["videos"]

# take care of zeros
df.loc[df["pictures"] + df["videos"] == 0, "ratio_of_videos"] = np.nan
df.loc[df["pictures"] + df["videos"] == 0, "ratio_of_pictures"] = np.nan

df.loc[df["pictures"] == 0, "likes_per_picture"] = np.nan
df.loc[df["videos"] == 0, "likes_per_video"] = np.nan
df.loc[df["pictures"] == 0, "comments_per_picture"] = np.nan
df.loc[df["videos"] == 0, "comments_per_video"] = np.nan

In [16]:
# ADD GINI
reload(pg)

df_gini = pg.process_gini(df)
df = pd.merge(df, df_gini, left_on=['domicile_country_name', 'Year'], right_on=['Country Name', 'Year'], how="left")

In [17]:
df.columns

Index(['period_end_date', 'compset_group',
       'business_entity_doing_business_as_name', 'legal_entity_name',
       'domicile_country_name', 'ultimate_parent_legal_entity_name',
       'primary_exchange_name', 'followers', 'pictures', 'videos', 'comments',
       'likes', 'compset', 'Year', 'Month', 'Day', 'date_diff_prev',
       'total_involvement', 'total_company_activity', 'conversion_rate_total',
       'ratio_of_videos', 'ratio_of_pictures', 'likes_per_picture',
       'likes_per_video', 'comments_per_picture', 'comments_per_video',
       'Country Name', 'Gene Index'],
      dtype='object')