In [65]:
import sys
sys.path.append('..')
import pandas as pd
import random
from utils.Constants import Datasets
import numpy as np
datasets = [dataset.name for dataset in Datasets]
metrics = [
#stage1
'total_words_per_doc',
'avg_word_length', 
'total_num_sent',
'avg_sent_length',
'token_type_ratio',
'symbol_word_ratio',
'num_non_alphabet_words',
#stage2
'num_stopwords_per_doc',
'num_abbreviations_per_doc',
'num_exact_duplicates',
'num_near_duplicates',
]
num_samples = 2000

In [117]:
fixed_ids = [str(uuid.uuid4()) for _ in range(num_samples * len(datasets))]

In [120]:
#have same ids throughout
# import pandas as pd
# for n in ['PCA', 't-SNE', 'UMAP', 'MDS']:
#     df = pd.read_csv(f"../datasets/{n}_dataset.csv")
#     df['id'] = fixed_ids
#     df.to_csv(f'../datasets/{n}_dataset.csv', index=False)

# df = pd.read_csv(f"../datasets/metrics.csv")
# df['id'] = fixed_ids
# df.to_csv(f'../datasets/metrics.csv', index=False)

In [109]:
#embeddings dim reduced
points = pd.DataFrame()
for i, d1 in enumerate(datasets):
    ch1 = random.choice([1,2,3])
    if ch1==1:
        d = pd.DataFrame(np.random.randn(num_samples, 2) - i, columns=['pc1', 'pc2'])
    elif ch1==2:
        d = pd.DataFrame( np.random.default_rng().gamma(shape=np.random.randint(len(datasets)),\
             size=(num_samples, 2))+ i, columns=['pc1', 'pc2'])
    else:
        d =  pd.DataFrame(np.random.default_rng().laplace(size=(num_samples, 2)) - i, columns=['pc1', 'pc2'])
    d['name'] = d1
    points = pd.concat((points, d))
import uuid
points['id'] = fixed_ids
points['lang'] = 'de'

In [110]:
#generate dummy metrics for datasets
all_metrics = pd.DataFrame()
for j, dataset in enumerate(datasets):
    res = []
    for i, metric in enumerate(metrics):
            res.append(np.random.random_sample((num_samples, 1)) * np.random.randint(1, 10))
    metrics_for_dataset = np.concatenate(res, axis=1)
    all_metrics = pd.concat((all_metrics, pd.DataFrame(metrics_for_dataset, columns=metrics)))
all_metrics['id'] = points['id']
all_metrics['name'] = points[points.id==points.id]['name']
all_metrics['lang'] = points[points.id==points.id]['lang']

In [111]:
points.to_csv('../datasets/MDS_dataset.csv', index=False)
all_metrics.to_csv('../datasets/metrics.csv', index=False)

In [None]:
#generate composition details 
import pandas as pd
import plotly.graph_objs as go
df = pd.read_csv("../datasets/dataset.csv")
comp = df.groupby(by=['name', 'lang']).count().reset_index()[['name', 'lang', 'id']]
comp.columns=['name', 'lang', 'count']
comp.to_csv('../datasets/global_size.csv', index=False)

In [None]:
#generate dummy bias data
categories = ['Gender','Ethnicity','Religion',
              'Race', 'Color']
bias_data = []
for name in datasets:
  bias_data.append([name] + [np.random.rand() for i in range(len(categories))])
bias = pd.DataFrame(bias_data, columns=['name'] + categories)
bias.to_csv('../datasets/bias.csv')

In [100]:
#trying distribution plot
import pandas as pd

import plotly.graph_objs as go
import plotly.express as px
df1 = pd.read_csv("../datasets/dataset.csv")
df2 = pd.read_csv("../datasets/metrics.csv")

In [70]:
import plotly.express as px
import pandas as pd
df = pd.read_csv('../datasets/dataset.csv')

In [71]:
#testing fixed colors
fig = px.scatter(df, x='pc1', y='pc2', color='name', hover_data=['id'],\
         color_discrete_map={
            'DATASET_1':'rgb(229, 134, 6)',
            'DATASET_2':'rgb(93, 105, 177)',
            'DATASET_3':'rgb(82, 188, 163)',
             'DATASET_4':'rgb(153, 201, 69)',
         })
fig.update_xaxes(range=[-7,7])
fig.update_yaxes(range=[-7,7])

In [72]:
#visualizing decision boundaries
merged_df = df1.merge(df2, on='id')

In [None]:
#without decision boundaries
fig = px.scatter(
        merged_df, x="pc1", y="pc2", 
        color="classifier", color_continuous_scale='rdgy')
fig.update_traces(marker_size=8, marker_line_width=.5)


fig.show()

In [77]:
#example boundary
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import svm
import numpy as np
h = .2
data = load_iris()
X = data.data[:, :2]
y = data.target
X = StandardScaler().fit_transform(X)
trees = RandomForestClassifier(max_depth=4, 
                               n_estimators=20, 
                               random_state=0)
trees.fit(X, y)


x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

xx, yy = np.meshgrid(np.arange(x_min, x_max, h)
                     , np.arange(y_min, y_max, h))
y_ = np.arange(y_min, y_max, h)



svm = svm.SVC()
svm.fit(X,y)

Z = svm.predict(np.c_[xx.ravel(), yy.ravel()])
print(Z.shape)
Z = Z.reshape(xx.shape)
trace1 = go.Heatmap(x=xx[0], y=y_, z=Z,
                  colorscale='geyser',
                  showscale=False)

trace2 = go.Scatter(x=X[:, 0], y=X[:, 1], 
                    mode='markers',
                    showlegend=False,
                    marker=dict(size=10,
                                color=y, 
                                colorscale='geyser',
                                line=dict(color='black', width=1))
                    )
fig = go.Figure()
fig.add_trace(trace1)
fig.add_trace(trace2)

(1216,)


In [75]:
#visualize classifier for our dummy data
import numpy as np
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import svm


h = .5
num_points = 1000
X, lab = pd.concat([merged_df.pc1[:num_points],merged_df.pc2[:num_points]], axis=1), np.array([np.random.randint(2) for _ in range(num_points)])

X = StandardScaler().fit_transform(X)
trees = RandomForestClassifier(max_depth=40, n_estimators=5)
trees = AdaBoostClassifier()
trees.fit(X, y)

# svm = svm.SVC()
# svm.fit(X,y)

x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

xx, yy = np.meshgrid(np.arange(x_min, x_max, h)
                    , np.arange(y_min, y_max, h))
y_ = np.arange(y_min, y_max, h)


Z = trees.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)



import plotly.graph_objs as go
color_scale = 'speed'
trace1 = go.Heatmap(x=xx[0], y=y_, z=Z,
                colorscale=color_scale,
                showscale=False)

trace2 = go.Scatter(x=merged_df.pc1[:num_points], y=merged_df.pc2[:num_points], 
                    mode='markers',
                    showlegend=False,
                    marker=dict(size=10,
                                color=lab, 
                                colorscale=color_scale,
                                line=dict(color='black', width=.4))
                    )
            
fig = go.Figure()
fig.add_trace(trace1)
fig.add_trace(trace2)
fig.show()

ValueError: Found input variables with inconsistent numbers of samples: [1000, 150]

In [None]:
#persist boundary visualization, just display points each time
fig.write_json('boundary.json')
import plotly
f1 = plotly.io.read_json('../boundary.json')

trace2 = go.Scatter(x=pc1, y=pc2, 
                    mode='markers',
                    showlegend=False,
                    marker=dict(size=10,
                                color=lab, 
                                colorscale=color_scale,
                                line=dict(color='black', width=.4))
                    )
f1.add_trace(trace2)
f1.update_layout(width=800)
                

In [None]:
#topics data
from plotly.subplots import make_subplots
import plotly.graph_objects as go
color_discrete_map={
            'DATASET_1':'rgb(229, 134, 6)',
            'DATASET_2':'rgb(93, 105, 177)',
            'DATASET_3':'rgb(82, 188, 163)',
             'DATASET_4':'rgb(153, 201, 69)'}
def topic_plot(num_datasets):
    num_topics = 10
    num_rows = (num_datasets//2 + num_datasets%2)
    fig = make_subplots(rows=num_rows, cols=2)
    for i in range(1, num_rows+1):
        for j in range(1, 3):
            if not num_datasets:
                break
            fig.add_trace(
                    go.Bar(
                        x=[np.random.rand() for _ in range(num_topics)],
                        y=[f'topic_{i}' for i in range(num_topics)],
                        orientation='h',
                        name='',
                        marker_color='rgb(229, 134, 6)'
                        ),
                row=i, col=j
            )
            num_datasets-=1
    fig.update_layout(bargap=0.17, width=1200, height=900)
    return fig
topic_plot(1)

In [None]:
#visualize bias 
import plotly.graph_objects as go
df = pd.read_csv('../datasets/bias.csv')
def get_radar_plot(datasets):
  fig = go.Figure()
  for name in datasets:
    fig.add_trace(go.Scatterpolar(
        r=df[df.name==name].values[0,2:],
        theta=df.columns[2:],
        fill='toself',
        name=name,
  ))
  fig.update_layout(
    polar=dict(
      radialaxis=dict(
        visible=True,
        range=[0, 1]
      )),
    showlegend=True
  )

  fig.show()
get_radar_plot(['DATASET_1' ])

In [None]:
#dummy toxic words
f = open('../datasets/toxic_words.txt', 'r').read()
raw_words = f.split()

In [None]:
import pandas as pd
import numpy as np
toxic_words = pd.DataFrame({}, columns=['word', 'score'])
toxic_words['word'] = raw_words
toxic_words['score'] = np.random.rand(len(raw_words))
toxic_words = toxic_words.round(2)
toxic_words.to_csv('toxic_words.csv')

In [None]:
toxic_words.sort_values(by='score', ascending=False, inplace=True)

In [None]:
#visualizing table
import plotly.graph_objects as go
fig = go.Figure(data=[go.Table(
  columnorder = [1,2],
  columnwidth = [40,40],
  header = dict(
    values = [['<b>Word</b>'],
              ['<b>Confidence Score</b>']],
    line_color='darkslategray',
    fill_color='grey',
    align=['center','center'],
    font=dict(color='white', size=20),
    height=40
  ),
  cells=dict(
    values=[toxic_words.word, toxic_words.score],
    line_color='darkslategray',
    fill=dict(color=['white', 'white']),
    align=['center', 'center'],
    font_size=20,
    height=40)
    )
])
fig.show()