In [2]:
import sys
sys.path.append('..')
import pandas as pd
from Constants import Datasets
import numpy as np
# One entry per member obtained by iterating the project's `Datasets` object.
datasets = [member.name for member in Datasets]

# Column names of the per-document metrics generated below, grouped by stage.
metrics = [
    # stage 1
    'total_words_per_doc',
    'avg_word_length',
    'total_num_sent',
    'avg_sent_length',
    'token_type_ratio',
    'symbol_word_ratio',
    'num_non_alphabet_words',
    # stage 2
    'num_stopwords_per_doc',
    'num_abbreviations_per_doc',
    'num_exact_duplicates',
    'num_near_duplicates',
]

# Number of dummy rows generated for each dataset.
num_samples = 10_000

In [3]:
#embeddings dim reduced
import uuid

# Dummy 2-D points standing in for dimensionality-reduced embeddings: one standard
# normal cloud of `num_samples` rows per dataset, shifted by the dataset's position
# in `datasets` so the clouds are offset from each other.
frames = []
for offset, dataset_name in enumerate(datasets):
    cloud = pd.DataFrame(np.random.randn(num_samples, 2) - offset, columns=['pc1', 'pc2'])
    cloud['name'] = dataset_name
    frames.append(cloud)

# Concatenate once instead of growing the frame inside the loop (which re-copies all
# previous rows on every iteration). The index is left as produced by concat, i.e.
# each dataset contributes the labels 0..num_samples-1.
points = pd.concat(frames) if frames else pd.DataFrame()

# Random unique identifier per row and a constant language tag.
points['id'] = [str(uuid.uuid4()) for _ in range(len(points))]
points['lang'] = 'de'

In [4]:
#generate dummy metrics for datasets
# For every dataset, build a (num_samples x len(metrics)) block in which each metric
# column is uniform noise in [0, 1) scaled by a random integer drawn from 1..9.
per_dataset_frames = []
for dataset in datasets:
    metric_columns = [
        np.random.random_sample((num_samples, 1)) * np.random.randint(1, 10)
        for _ in metrics
    ]
    per_dataset_frames.append(
        pd.DataFrame(np.concatenate(metric_columns, axis=1), columns=metrics)
    )

# Single concat instead of re-concatenating the growing frame on every iteration.
all_metrics = (
    pd.concat(per_dataset_frames) if per_dataset_frames else pd.DataFrame(columns=metrics)
)

# Rows of `all_metrics` and `points` correspond by position; copy the identifying
# columns positionally rather than relying on two duplicate-label indexes lining up.
assert len(all_metrics) == len(points), "points and metrics must have the same number of rows"
for column in ('id', 'name', 'lang'):
    all_metrics[column] = points[column].to_numpy()

# Dummy classifier score: first half of the rows in [0, 0.5), the rest in [0, 1).
# Sized from the frame itself so it works for any number of datasets / samples.
n_rows = len(all_metrics)
n_low = n_rows // 2
all_metrics['classifier'] = np.hstack(
    (np.random.rand(n_low) * .5, np.random.rand(n_rows - n_low))
)

# Binary label: 1 when the score is strictly above 0.5, else 0.
all_metrics['quality_lab'] = (all_metrics['classifier'] > .5).astype(int)

In [5]:
# Write both generated tables to disk, omitting the index column.
for frame, csv_path in (
    (points, '../datasets/dataset.csv'),
    (all_metrics, '../datasets/metrics.csv'),
):
    frame.to_csv(csv_path, index=False)

In [6]:
# generate composition details
import pandas as pd
import plotly.graph_objs as go

df = pd.read_csv("../datasets/dataset.csv")

# Count the non-null ids of every (name, lang) pair, then expose that count
# as a regular column called 'count' next to the two key columns.
group_keys = ['name', 'lang']
id_counts = df.groupby(group_keys)['id'].count()
comp = id_counts.reset_index().rename(columns={'id': 'count'})

comp.to_csv('../datasets/global_size.csv', index=False)

In [1]:
# trying distribution plot
import pandas as pd

import plotly.graph_objs as go
import plotly.express as px

# df1 <- the points table, df2 <- the metrics table, both as written to disk earlier.
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in ("../datasets/dataset.csv", "../datasets/metrics.csv")
)

In [2]:
# Inner-join the 2-D points with their metrics on the shared `id` column.
# Both inputs also carry `name` and `lang`, so those come out suffixed (_x / _y).
merged_df = pd.merge(df1, df2, on='id')

In [13]:
# without decision boundaries
# Scatter of the two projected components, coloured by the classifier score.
# update_traces returns the figure itself, so the marker styling is chained.
fig = px.scatter(
    merged_df,
    x="pc1",
    y="pc2",
    color="classifier",
    color_continuous_scale='rdgy',
).update_traces(marker_size=8, marker_line_width=.5)

fig.show()

In [58]:
#example boundary
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import svm
import numpy as np
import plotly.graph_objs as go

# Step of the prediction grid, in standardized feature units.
h = .2

# Iris restricted to its first two feature columns, standardized.
data = load_iris()
X = data.data[:, :2]
y = data.target
X = StandardScaler().fit_transform(X)

# Random forest fitted on the same data; note the heatmap below is drawn from
# the SVC predictions, not from this model.
trees = RandomForestClassifier(max_depth=4,
                               n_estimators=20,
                               random_state=0)
trees.fit(X, y)

# Keep the estimator under its own name: binding it to `svm` would replace the
# imported sklearn module with an SVC instance for the rest of the session.
svc = svm.SVC()
svc.fit(X, y)

# Grid covering the data with a margin of 1 on every side.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
y_ = np.arange(y_min, y_max, h)

# Predict a class for every grid node, then fold the flat result back to the grid shape.
Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])
print(Z.shape)
Z = Z.reshape(xx.shape)

# Background: predicted class per grid cell.
trace1 = go.Heatmap(x=xx[0], y=y_, z=Z,
                    colorscale='Viridis',
                    showscale=False)

# Foreground: the standardized samples, coloured by their true class.
trace2 = go.Scatter(x=X[:, 0], y=X[:, 1],
                    mode='markers',
                    showlegend=False,
                    marker=dict(size=10,
                                color=y,
                                colorscale='Viridis',
                                line=dict(color='black', width=1))
                    )
fig = go.Figure()
fig.add_trace(trace1)
fig.add_trace(trace2)

(1216,)


In [56]:
#example with our dummy data
import numpy as np
import plotly.graph_objs as go
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import svm

# Step of the prediction grid (standardized units) and number of rows to use.
h = .5
num_points = 100

# First `num_points` projected points and one random binary label per selected row.
sample = merged_df[['pc1', 'pc2']].iloc[:num_points]
lab = np.random.randint(2, size=len(sample))

X = StandardScaler().fit_transform(sample)

# Train on this cell's own labels (`lab`); an SVC (svm.SVC()) can be swapped in here.
trees = AdaBoostClassifier()
trees.fit(X, lab)

# Grid covering the standardized points with a margin of 1 on every side.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
y_ = np.arange(y_min, y_max, h)

# Predicted label for every grid node, folded back to the grid shape.
Z = trees.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

color_scale = 'speed'

# Background: predicted label per grid cell.
trace1 = go.Heatmap(x=xx[0], y=y_, z=Z,
                    colorscale=color_scale,
                    showscale=False)

# Foreground: the points in the same standardized coordinates the grid was built in,
# so that markers and decision regions line up.
trace2 = go.Scatter(x=X[:, 0], y=X[:, 1],
                    mode='markers',
                    showlegend=False,
                    marker=dict(size=10,
                                color=lab,
                                colorscale=color_scale,
                                line=dict(color='black', width=.4))
                    )

fig = go.Figure()
fig.add_trace(trace1)
fig.add_trace(trace2)
fig.show()

In [198]:
#persist boundary visualization, just display points each time
import plotly

# Write and read back the same file; previously the figure was written to
# 'boundary.json' but loaded from '../boundary.json'.
boundary_path = 'boundary.json'
fig.write_json(boundary_path)
f1 = plotly.io.read_json(boundary_path)

# NOTE(review): `pc1` / `pc2` were not defined anywhere in the notebook. They are
# taken here from the standardized matrix `X` of the boundary cell, i.e. the
# coordinate system the persisted heatmap was computed in - confirm this is intended.
pc1, pc2 = X[:, 0], X[:, 1]

trace2 = go.Scatter(x=pc1, y=pc2,
                    mode='markers',
                    showlegend=False,
                    marker=dict(size=10,
                                color=lab,
                                colorscale=color_scale,
                                line=dict(color='black', width=.4))
                    )
f1.add_trace(trace2)
f1.update_layout(width=800)
                

In [84]:
#topics data
import numpy as np
from plotly.subplots import make_subplots
import plotly.graph_objects as go


def topic_plot(num_datasets, num_topics=10):
    """Build a two-column grid of horizontal bar charts with random topic weights.

    One subplot is created per dataset, filled row by row (left cell first).
    Every subplot shows `num_topics` bars labelled 'topic_0', 'topic_1', ... whose
    lengths are drawn uniformly from [0, 1).

    Args:
        num_datasets: Number of subplots to draw; must be >= 0.
        num_topics: Number of bars per subplot (default 10).

    Returns:
        The plotly figure holding all subplots.

    Raises:
        ValueError: If `num_datasets` is negative.
    """
    if num_datasets < 0:
        raise ValueError(f"num_datasets must be >= 0, got {num_datasets}")

    # Two subplots per row, rounded up. make_subplots needs at least one row,
    # so an empty request still yields a (blank) 1x2 grid instead of an error.
    num_rows = max(1, (num_datasets + 1) // 2)
    fig = make_subplots(rows=num_rows, cols=2)

    topic_labels = [f'topic_{k}' for k in range(num_topics)]
    for position in range(num_datasets):
        # plotly's grid coordinates are 1-based.
        row, col = divmod(position, 2)
        fig.add_trace(
            go.Bar(
                x=np.random.rand(num_topics).tolist(),
                y=topic_labels,
                orientation='h',
            ),
            row=row + 1, col=col + 1,
        )

    fig.update_layout(bargap=0.17, width=1200, height=900)
    return fig


topic_plot(6)