# Complex visualisations

## basic imports

In [None]:
import pandas as pd
import numpy as np
#connect to drive
from google.colab import drive

In [None]:
import plotly.express as px
import plotly.graph_objects as go

# Sankey diagram

In [None]:
## generate sankey diagram dictionary values
def genSankey(df, cat_cols=None, value_cols='', title='Sankey Diagram'):
    """Build the ``node`` and ``link`` dictionaries for a Plotly Sankey trace.

    Every column in ``cat_cols`` is one level of the diagram. Each pair of
    adjacent columns contributes links whose value is the sum of
    ``value_cols`` over the rows that share the same (source, target) pair.

    Notes:
        * A label that occurs in several levels is a single node; it takes
          the colour of the first level in which it appears.
        * Null category values produce neither a node nor a link (the
          group-by below drops rows whose source or target is null).
        * Level colours come from ``px.colors.qualitative.Pastel`` and wrap
          around when there are more levels than palette entries.

    Args:
        df: DataFrame holding the category columns and the value column.
        cat_cols: Ordered sequence of at least two category column names,
            left-most level first.
        value_cols: Name of the numeric column that sizes the links.
        title: Unused; kept so existing callers keep working.

    Returns:
        Tuple ``(node, link)`` of dicts that can be passed to
        ``go.Sankey(node=..., link=...)``.

    Raises:
        ValueError: If fewer than two category columns are given.
    """
    cat_cols = list(cat_cols) if cat_cols is not None else []
    if len(cat_cols) < 2:
        raise ValueError("genSankey needs at least two columns in cat_cols")

    colorPalette = px.colors.qualitative.Pastel

    # Map each label to its colour in one pass so the two lists can never get
    # out of step. Dict insertion order gives a deterministic node order
    # (order of first appearance, level by level).
    labelColors = {}
    for level, catCol in enumerate(cat_cols):
        levelColor = colorPalette[level % len(colorPalette)]
        for label in df[catCol].dropna().unique():
            if label not in labelColors:
                labelColors[label] = levelColor
    labelList = list(labelColors)
    colorList = list(labelColors.values())

    # Stack every adjacent (source, target) column pair, then aggregate once.
    pairFrames = []
    for sourceCol, targetCol in zip(cat_cols, cat_cols[1:]):
        pairDf = df[[sourceCol, targetCol, value_cols]].copy()
        pairDf.columns = ['source', 'target', 'count']
        pairFrames.append(pairDf)
    sourceTargetDf = (
        pd.concat(pairFrames)
        .groupby(['source', 'target'])
        .agg({'count': 'sum'})
        .reset_index()
    )

    # Translate labels to node indices with a dict instead of list.index(),
    # which would rescan the label list for every row.
    labelIndex = {label: position for position, label in enumerate(labelList)}
    sourceTargetDf['sourceID'] = sourceTargetDf['source'].map(labelIndex)
    sourceTargetDf['targetID'] = sourceTargetDf['target'].map(labelIndex)

    # Assemble the dictionaries expected by go.Sankey.
    node = dict(
        pad=15,
        thickness=20,
        line=dict(
            color="black",
            width=0.5,
        ),
        label=labelList,
        color=colorList,
    )
    link = dict(
        source=sourceTargetDf['sourceID'],
        target=sourceTargetDf['targetID'],
        value=sourceTargetDf['count'],
    )
    return node, link

In [None]:
# Dataset source: https://www.kaggle.com/datasets/therohk/ireland-historical-news?datasetId=30661
drive.mount('/content/drive')
path = "/content/drive/My Drive/ms projects and courses/ireland-news-headlines.csv"

# Read the CSV and rename the category column to 'label' in one step.
df = pd.read_csv(path).rename(columns={'headline_category': 'label'})

# Quick sanity checks: dimensions and missing values per column.
print("Shape: ", df.shape)
print(df.isna().sum())
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Shape:  (1611495, 3)
publish_date     0
label            0
headline_text    7
dtype: int64


Unnamed: 0,publish_date,label,headline_text
0,19960102,news,UUP sees possibility of voting Major out
1,19960102,news,Pubs targeted as curbs on smoking are extended
2,19960102,news,Papers reveal secret links with O'Neill cabinet
3,19960102,news,Domestic chaos as Italy takes EU presidency
4,19960102,news,Learning about the star to which we owe life


In [None]:
# Add an explicit id column so rows can be counted per label.
df['id'] = df.index
# .copy() gives an independent frame, so adding the level columns below does
# not raise pandas' SettingWithCopyWarning.
df_label = df[['id', 'label']].copy()
# Split the dotted label (e.g. "opinion.letters") into one column per level.
level_cols = ['label_level_1', 'label_level_2', 'label_level_3', 'label_level_4']
df_label[level_cols] = df_label['label'].str.split('.', expand=True)
# Count rows per label path; dropna=False keeps paths with missing levels.
label_counts = (
    df_label[level_cols + ['id']]
    .groupby(by=level_cols, dropna=False)
    .count()
    .reset_index()
    .rename(columns={'id': 'count'})
)
# Represent the missing levels as None instead of NaN.
label_counts = label_counts.replace({np.nan: None})
# Show the label paths, most frequent first.
label_counts.sort_values(by=['count'], ascending=False).head(105)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_label[['label_level_1', 'label_level_2', 'label_level_3', 'label_level_4']] = df_label['label'].str.split('.', expand=True)


Unnamed: 0,label_level_1,label_level_2,label_level_3,label_level_4,count
92,news,,,,580242
102,sport,,,,158683
24,business,,,,111435
94,opinion,letters,,,79276
95,opinion,,,,49946
...,...,...,...,...,...
45,lifestyle,food,restaurant,,203
84,news,technology,,,191
15,business,markets,bonds,,127
39,lifestyle,abroad,working-abroad,,116


In [None]:
# Build the Sankey node/link dictionaries for the whole label hierarchy.
node, link = genSankey(
    label_counts,
    cat_cols=['label_level_1', 'label_level_2', 'label_level_3', 'label_level_4'],
    value_cols='count',
    title='Sankey Diagram',
)

In [None]:
# Draw the label hierarchy as a Sankey diagram.
fig = go.Figure(data=[go.Sankey(node=node, link=link)])

fig.update_layout(
    title_text="Sankey Diagram for labels hierarchy",
    font_size=14,
    width=1000,
    height=1500,
)

# Colour legend, placed at paper coordinates (0, 0).
fig.add_annotation(
    font=dict(color='black', size=15),
    x=0,
    y=0,
    showarrow=False,
    text="blue - first label, yellow - second label, orange - third label, purple - fourth label",
    xanchor='left',
    xref="paper",
    yref="paper",
)
fig.show()

# Smoothing function

Smooths a noisy series so that trends are easier to see in a chart.

In [None]:
def exponential_smoothing(alpha, beta, df_series):
    """Return a smoothed one-step-ahead forecast for a series.

    The result ``f`` has the same length as ``df_series``:

    * ``f[0]`` is NaN (nothing has been observed yet),
    * ``f[1]`` is the first observation,
    * ``f[2] = alpha * f[1] + (1 - alpha) * ts[1]``,
    * ``f[t + 1] = beta * f[t - 1] + alpha * f[t] + (1 - alpha - beta) * ts[t]``
      for every later position.

    Note that ``alpha`` and ``beta`` weight the previous forecasts; the
    remaining weight goes to the observation.

    Args:
        alpha: Weight of the most recent forecast.
        beta: Weight of the forecast before that.
        df_series: Object with a ``to_numpy()`` method (e.g. a pandas Series)
            holding the observations in order.

    Returns:
        List of forecast values, one per observation. Empty for empty input.
    """
    import numpy as np

    ts = df_series.to_numpy()
    n = len(ts)
    if n == 0:
        return []

    # No forecast is possible for the very first point.
    f = [np.nan]
    if n >= 2:
        # The forecast for the second point is the first observation.
        f.append(ts[0])
    if n >= 3:
        # Only one earlier forecast exists here, so beta is not used yet.
        f.append(alpha * f[-1] + (1 - alpha) * ts[1])
    # From here on blend the two latest forecasts with the current observation.
    for t in range(2, n - 1):
        f.append(beta * f[-2] + alpha * f[-1] + (1 - alpha - beta) * ts[t])

    return f

# Confidence interval bar chart function for A/B testing

In [None]:
def confidence_interval_chart(t_st, df, clusters_cols, value_col, title, bar_color, confidence_interval_color):
    """Show a bar chart of per-cluster means with confidence-interval error bars.

    Rows of ``df`` are grouped by ``clusters_cols``. For each group the bar
    height is the absolute value of the mean of ``value_col`` and the error
    bar is ``t_st * std / sqrt(count)``. The figure is displayed and a summary
    table is printed; nothing is returned.

    Args:
        t_st: Critical value that multiplies the standard error
            (e.g. 1.96 for z at 0.95, 1.65 for z at 0.9).
        df: DataFrame with the cluster columns and the value column.
        clusters_cols: Column name, or list of column names, to group by.
            Values may be of any type; they are converted to text for the
            bar labels.
        value_col: Name of the numeric column to summarise.
        title: Chart title.
        bar_color: Colour of the bars.
        confidence_interval_color: Colour of the error bars.
    """
    import plotly.graph_objects as go
    import numpy as np

    # Accept a single column name as well as a list of names.
    if isinstance(clusters_cols, str):
        group_cols = [clusters_cols]
    else:
        group_cols = list(clusters_cols)

    # Per-cluster mean, standard deviation and sample size.
    df2 = df.groupby(group_cols).agg({value_col: ['mean', 'std', 'count']})
    df2.columns = ['value_mean', 'value_std', 'value_count']
    df2 = df2.reset_index()
    df2['value_count'] = df2['value_count'].astype(float)
    df2['value_count_count_sqrt'] = np.sqrt(df2['value_count'])
    df2['mean_abs'] = df2['value_mean'].abs()
    df2 = df2.sort_values(by=group_cols, ascending=True)
    # Half-width of the interval: critical value times the standard error.
    # 1.96 = Z 0.95, 1.65 = z 0.9, t 0.95 with 38 degrees of freedom = 1.687
    df2['confidence_interval'] = t_st * (df2['value_std'] / df2['value_count_count_sqrt'])
    # Cast to str first so numeric or boolean cluster values can be joined.
    df2['clusters'] = df2[group_cols].astype(str).apply(lambda row: '_'.join(row), axis=1)

    # Bars show |mean|; error bars show the confidence interval.
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=df2['clusters'],
        y=df2['mean_abs'],
        marker_color=bar_color,
        error_y=dict(type='data', array=df2['confidence_interval'], color=confidence_interval_color),
    ))
    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')

    # Transparent backgrounds.
    fig.update_layout({
        'plot_bgcolor': 'rgba(0, 0, 0, 0)',
        'paper_bgcolor': 'rgba(0, 0, 0, 0)',
    })

    fig.update_layout(title_text=title)
    fig.show()

    print(df2[['clusters', 'confidence_interval', 'value_mean', 'mean_abs']])