In [None]:
import pandas as pd
import numpy as np
from pprint import pprint
from nanoHUB.application import Application
from nanoHUB.configuration import ClusteringConfiguration
from nanoHUB.pipeline.geddes.data import get_default_s3_client
from nanoHUB.dataaccess.lake import S3FileMapper
from nanoHUB.clustering.infra import create_clusters_repository, get_user_tools, add_cluster_info, get_clustered_one_day_users, get_unclassified_one_day_users
from ast import literal_eval
from collections import defaultdict


# Show every column when a DataFrame is rendered.
pd.set_option('display.max_columns', None)

# logger.debug('Testing')
application = Application.get_instance()
nanohub_db = application.new_db_engine('nanohub')

#salesforce = application.new_salesforce_engine()

s3_client = get_default_s3_client(application)

# Build the clustering configuration once and reuse it, instead of
# constructing a new ClusteringConfiguration for every bucket-name lookup.
clustering_config = ClusteringConfiguration()
raw_mapper = S3FileMapper(s3_client, clustering_config.bucket_name_raw)
processed_mapper = S3FileMapper(s3_client, clustering_config.bucket_name_processed)

cluster_repo = create_clusters_repository(application, clustering_config.bucket_name_processed)

# NOTE(review): not referenced by any of the cells visible in this notebook.
derived_data_file_path = 'derived_data_for_users_with_one_day_users.csv'

In [None]:
# Fetch the user/tool table through the processed-bucket mapper and render it.
tool_users_df = get_user_tools(mapper := processed_mapper) if False else get_user_tools(processed_mapper)
# user_tools_df = get_tool_users(processed_mapper)
display(tool_users_df)

In [None]:
# Frequency of each distinct `names_tools` value across the table.
tool_users_df.loc[:, 'names_tools'].value_counts()

In [None]:
# Load the one-day users that were assigned to a cluster.
classified_one_day_users_df = get_clustered_one_day_users(processed_mapper)

# Row count, then the first and last rows, each rendered as its own output.
display(
    len(classified_one_day_users_df),
    classified_one_day_users_df.head(),
    classified_one_day_users_df.tail(),
)

In [None]:
# Distribution of the `has_researcher_profile` flag among clustered users.
classified_one_day_users_df.loc[:, 'has_researcher_profile'].value_counts()

In [None]:
unclassified_one_day_users_df['has_researcher_profile'].value_counts()

In [None]:
unclassified_one_day_users_df = get_unclassified_one_day_users(processed_mapper)

display(len(unclassified_one_day_users_df))
display(unclassified_one_day_users_df.head())
display(unclassified_one_day_users_df.tail())

In [None]:
# Keep only the tool rows whose `user` appears among the unclassified usernames.
is_unclassified_user = tool_users_df['user'].isin(unclassified_one_day_users_df['username'])
unclassified_user_tools_df = tool_users_df.loc[is_unclassified_user]

# Row count, then the first and last rows, each rendered as its own output.
display(
    len(unclassified_user_tools_df),
    unclassified_user_tools_df.head(),
    unclassified_user_tools_df.tail(),
)

In [None]:
a = pd.Series([item for sublist in unclassified_user_tools_df['names_tools'] for item in sublist])
df = a.groupby(a).size().rename_axis('names_tools').reset_index(name='num_unclassified_users')


display(df.head(10))

In [None]:
sorted_tool_count_for_unclassfied_users = df.sort_values('num_unclassified_users', ascending=False)
sorted_tool_count_for_unclassfied_users = sorted_tool_count_for_unclassfied_users.set_index('names_tools').reset_index()
display(sorted_tool_count_for_unclassfied_users)

sorted_tool_count_for_unclassfied_users.to_csv('sorted_tool_count_for_unclassified_one_day_users.csv', index=False)

In [None]:
# Keep only the tool rows whose `user` appears among the clustered usernames.
is_clustered_user = tool_users_df['user'].isin(classified_one_day_users_df['username'])
classified_user_tools_df = tool_users_df.loc[is_clustered_user]

# Row count, then the first and last rows, each rendered as its own output.
display(
    len(classified_user_tools_df),
    classified_user_tools_df.head(),
    classified_user_tools_df.tail(),
)

In [None]:
a = pd.Series([item for sublist in classified_user_tools_df['names_tools'] for item in sublist])
df = a.groupby(a).size().rename_axis('names_tools').reset_index(name='num_clustered_users')


display(df.head(10))

In [None]:
sorted_tool_count_for_clustered_users = df.sort_values('num_clustered_users', ascending=False)
sorted_tool_count_for_clustered_users = sorted_tool_count_for_clustered_users.set_index('names_tools').reset_index()
display(sorted_tool_count_for_clustered_users)

sorted_tool_count_for_clustered_users.to_csv('sorted_tool_count_for_clustered_one_day_users.csv', index=False)

In [None]:
# Plain Python lists of tool names, in the order of the sorted count tables.
sorted_tools_unclassified_users = sorted_tool_count_for_unclassfied_users.loc[:, 'names_tools'].tolist()
sorted_tools_clustered_users = sorted_tool_count_for_clustered_users.loc[:, 'names_tools'].tolist()

In [None]:
display(sorted_tool_count_for_unclassfied_users)
print("********************")
display(sorted_tool_count_for_clustered_users)
print("********************")

# Outer-join the two count tables on the tool name so tools seen in only one
# group are kept. Missing counts are filled with 0 via a non-mutating
# `fillna` on the frame: the previous `combined_df[col].fillna(0, inplace=True)`
# is chained in-place assignment, which warns on recent pandas and has no
# effect on `combined_df` once Copy-on-Write is enabled.
combined_df = (
    sorted_tool_count_for_unclassfied_users
    .merge(sorted_tool_count_for_clustered_users, on=['names_tools'], how='outer')
    .fillna({'num_unclassified_users': 0, 'num_clustered_users': 0})
    .assign(
        num_total_users=lambda counts: counts['num_unclassified_users'] + counts['num_clustered_users']
    )
    .sort_values('num_total_users', ascending=False)
)


display(combined_df)

In [None]:
# NOTE(review): these imports sit mid-notebook; consider moving them to the
# import cell at the top.
import seaborn as sns
import pandas as pd
sns.set_theme()

# Restrict the plot to tools whose clustered count lies between 50 and 450.
clustered_count_in_range = combined_df['num_clustered_users'].between(50, 450)

# One point per tool; colour and marker size both encode the unclassified count.
sns.relplot(
    data=combined_df[clustered_count_in_range],
    x="num_clustered_users",
    y="names_tools",
    hue="num_unclassified_users",
    size="num_unclassified_users",
    sizes=(1, 300),
)

In [None]:
# Scatter of unclassified counts against clustered counts, one point per tool.
sns.relplot(
    data=combined_df,
    x="num_clustered_users",
    y='num_unclassified_users',
)

In [None]:
# Persist the combined per-tool counts without the DataFrame index.
combined_df.to_csv(
    path_or_buf='sorted_tool_count_for_combined_one_day_users.csv',
    index=False,
)

# Disabled spot checks for individual tools, kept for reference.
# print(sorted_tool_count_for_unclassfied_users['sbcnfet'])
# print(sorted_tool_count_for_clustered_users['sbcnfet'])
# display(combined_df.loc[combined_df['tool_name'] == 'sbcnfet'])

# print(sorted_tool_count_for_unclassfied_users['cenems'])
# print(sorted_tool_count_for_clustered_users['cenems'])
# display(combined_df.loc[combined_df['tool_name'] == 'cenems'])

# # print(unclassied_counts['chemkinetics'])
# print(sorted_tool_count_for_clustered_users['chemkinetics'])
# display(combined_df.loc[combined_df['tool_name'] == 'chemkinetics'])

In [None]:
# import difflib
# sm = difflib.SequenceMatcher(None, sorted_tools_unclassified_users, sorted_tools_clustered_users)
# similarity_ratio = sm.ratio()

# display(similarity_ratio)

In [None]:
# total_tools_count = len(sorted_tools_clustered_users + sorted_tools_unclassified_users)
# common_tools_count = len(common_tools)

# percentage_common_tools = (common_tools_count*100)/total_tools_count

# display(percentage_common_tools)

In [None]:
# Tool names that occur in both the unclassified and the clustered lists.
unclassified_tool_set = set(sorted_tools_unclassified_users)
clustered_tool_set = set(sorted_tools_clustered_users)
common_tools = list(unclassified_tool_set.intersection(clustered_tool_set))

display(len(common_tools))
# display(common_tools)