In [None]:
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
import random
import urllib.request
import os
from dotenv import load_dotenv
from pathlib import Path

# Load environment variables from the `.czi.env` file, resolved relative to
# the current working directory.
env_path = Path('.czi.env')
load_dotenv(dotenv_path=env_path)

# Remote archives of the dataset.
RAW_TAR_URL = "https://datadryad.org/stash/downloads/file_stream/1822384"
LINKED_TAR_URL = "https://datadryad.org/stash/downloads/file_stream/1822388"
# DISAMBIGUATED_TAR_URL = "https://datadryad.org/stash/downloads/file_stream/1822387"

# File names of the derived CSV files inside the data directory.
FILTERED_CZI_SOFTWARE_CSV = 'czi_software.csv'
SAMPLED_1000_CZI_CSV = 'sample_1000.csv'

# The data directory is configured through DATA_DIR_PATH in `.czi.env`.
# If the variable is not set this is None, and the paths below start with "None/".
DATA_DIR_PATH = os.getenv("DATA_DIR_PATH")
print(f'Using data directory {DATA_DIR_PATH}')

# Local locations of the downloaded archives.
RAW_FILE_PATH = f'{DATA_DIR_PATH}/raw.tar.gz'
LINKED_FILE_PATH = f'{DATA_DIR_PATH}/linked.tar.gz'

In [None]:
# The population is non-normal (see czi_random_10000_histogram.png), so Levene's test is used
# to compare the equality of variance between the full dataset and the sample.
# https://www.itl.nist.gov/div898/handbook/eda/section3/eda35a.htm
import dask.dataframe as dd

# Read the filtered full dataset and the 1000-row sample as dask dataframes.
# Both files are CSV; column dtypes are whatever dask infers (no dtype override).
df_filtered_czi = dd.read_csv(f'{DATA_DIR_PATH}/{FILTERED_CZI_SOFTWARE_CSV}')
df_sample = dd.read_csv(f"{DATA_DIR_PATH}/{SAMPLED_1000_CZI_CSV}")

# No set_index() call here: dask's set_index() requires the column to index by
# (calling it without arguments raises a TypeError), and value_counts() below
# does not depend on the index anyway.

# Number of rows per distinct value of the `software` column, for the full
# dataset and for the sample. Both results are lazy dask Series.
# NOTE(review): this cell only prepares the two count series; the variance
# test itself is not performed here.
full_counts = df_filtered_czi.software.value_counts()
sample_counts = df_sample.software.value_counts()


In [None]:
# Create a stratified sample; the y values are the bins (strata) used to split the citation counts. Visually, it is not very different from the fully random sample.

# First, re-read the software column from the complete dataset into a Series
# czi_software_df = pd.read_csv(f'{DATA_DIR_PATH}/czi_software.csv', usecols=['software'])
# czi_software_df.reset_index()
# full_software_series = czi_software_df.software

# Compare the original software series and this series and save the comparison result
# pd_series_len = full_software_series.size
# dd_series_len = software_series.size.compute()
# with open(f'{DATA_DIR_PATH}/series_comparison', 'w') as cf:
#     cf.write(f'Reread series size: {pd_series_len} | Dask series size: {dd_series_len}')

from sklearn.model_selection import train_test_split

# Fixed seed so the stratified draw (and therefore the saved histogram) is the
# same on every Restart & Run All.
STRATIFIED_SAMPLE_SEED = 42

# `full_counts` may be a lazy dask Series; scikit-learn and matplotlib need
# concrete data, so materialise it as a pandas Series when necessary.
full_counts_pd = full_counts.compute() if hasattr(full_counts, 'compute') else full_counts

# Assign every entry to one of 1000 roughly equally sized strata (labels 0..999)
# according to its position in the series. The upper bound is kept just below 1
# so the last entry is labelled 999 instead of 1000.
y = (np.linspace(0, 0.999999999999, len(full_counts_pd)) * 1000).astype(int)

# Draw 1000 entries, stratified by the position bins above.
# NOTE: train_test_split raises a ValueError if any stratum has fewer than two
# members, so this needs at least ~2000 entries in `full_counts`.
X_train, X_test, y_train, y_test = train_test_split(
    full_counts_pd,
    y,
    stratify=y,
    test_size=1000,
    random_state=STRATIFIED_SAMPLE_SEED,
)

# Use an explicit figure so re-running the cell (or running it after another
# plotting cell) never draws on top of an earlier plot.
fig, ax = plt.subplots()
ax.hist(X_test, bins=1000)
ax.set_yscale('log', nonpositive='clip')
ax.set_xlabel('Count per software')
ax.set_ylabel('Frequency (log scale)')
fig.savefig(f'{DATA_DIR_PATH}/czi_stratified_sample_histogram.png')
# Compare with czi_random_stratified_sample_1000_citation_histogram.png

In [None]:
# Does the sampled 1000 from the original sample of 10000 look like the original (with actual tests as well as visually) if we exclude the top 100 cited?
# -> Sometimes



# df_val_counts = pd.DataFrame(software_series_sample.value_counts())
# df_val_counts = df_val_counts.reset_index()
# The sample is small, so work on an in-memory pandas frame. This also makes
# to_csv() below write one CSV file per call (dask's to_csv() with a plain file
# name writes a directory of part files instead).
sample_pd = df_sample.compute() if hasattr(df_sample, 'compute') else df_sample
# Give every row a unique label so the top-100 rows can be removed by label.
sample_pd = sample_pd.reset_index(drop=True)

# The 100 rows with the highest `mention_counts`.
df_top100 = sample_pd.nlargest(100, 'mention_counts')

# The remaining rows (at most 900), in ascending order of `mention_counts`.
# They are taken from the complement of the top 100 rather than via an
# independent nsmallest() on the whole sample: with tied values at the
# boundary, two independent selections can put the same row into both files
# and leave another row out of both.
df_900_without_top100 = (
    sample_pd
    .drop(index=df_top100.index)
    .nsmallest(900, 'mention_counts')
)

df_top100.to_csv(f'{DATA_DIR_PATH}/CZI_sample_top_100.csv', index=True)
df_900_without_top100.to_csv(f'{DATA_DIR_PATH}/CZI_sample_900_without_top_100.csv', index=True)




In [None]:
# Prepare linking by reading in the mention IDs from the random sample
#import csv

#ids = []

#with open(f'{DATA_DIR_PATH}/czi_output_random_100.csv', 'r') as csvin:
#    csvr = csv.DictReader(csvin, delimiter=',')
#    for row in csvr:
#        ids.append(row['ID'])
#        # print(row)
#ids

In [None]:
#linked_files = [
#    'bioconductor_df.csv',
#    'cran_df.csv',
#    'github_df.csv',
#    'pypi_df.csv',
#    'scicrunch_df.csv'
#]
#
#lines = []
#
#import sys
#maxInt = sys.maxsize
#
#while True:
    # decrease the maxInt value by factor 10
    # as long as the OverflowError occurs.

#    try:
#        csv.field_size_limit(maxInt)
#        break
#    except OverflowError:
#        maxInt = int(maxInt/10)

#def extract_links(lf, _id):
#    with open(f'{DATA_DIR_PATH}/linked/linked/normalized/{lf}', 'r') as csvfile:
#        datareader = csv.DictReader(csvfile)
#        for row in datareader:
#            if row['ID'] == _id:
#                yield row

#for lf in linked_files:
#    for _id in ids:
#        for huh in extract_links(lf, _id):
#            lines.append(huh)

#lines

# df = dd.read_csv("/home/stephan/src/habeas-corpus/data/czi-software-mentions/linked/linked/normalized/bioconductor_df.csv")
# print(f'EMPTY? {len(df.index)}')
# df

In [None]:
#linked_lines = []

#with open(f'{DATA_DIR_PATH}/czi_output_random_100.csv', 'r', encoding='utf-8-sig') as csvin:
#    csvr = csv.DictReader(csvin, delimiter=',')
#    for row in csvr:
#        _id = row['ID']
#        for line in lines:
#            if line['ID'] == _id:
#                linked_line = row | line
#                linked_lines.append(linked_line)
#                continue
#        linked_lines.append(row)


In [None]:
#import json
#
#interim_df = pd.read_json(json.dumps(linked_lines))
#interim_df.to_csv(f'{DATA_DIR_PATH}/CZI_sample_1000_without_top_100_linked.csv', encoding='utf-8', index=True)

