In [1]:
import csv
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
from datetime import datetime, date
from dateutil import parser
import urllib3
import statistics
import warnings
import matplotlib.pyplot as plt
import seaborn
import numpy as np
from collections import Counter

In [2]:
datashades_raw_list = "shades.csv"

outside_list = ['http://data.ctdata.org/',
                'https://data.ci.newark.nj.us/',
                'https://open.jacksonms.gov/',
                'https://data.ca.gov/',
                'https://datagate.snap4city.org/'
                ]

def write_output_file(all_urls_final, filename):
    fieldnames = []
    for item in all_urls_final:
        if len(item.keys()) > len(fieldnames):
            fieldnames = item.keys()

    with open(filename, "w", encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)

        writer.writeheader()
        for item in all_urls_final:
            writer.writerow(item)

In [3]:
def datashades_clean_up(datashades_list):
    urls = []
    with open(datashades_list, newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        for row in reader:
            row_strip = row[0].strip()
            if row_strip[:5] == "href=":
                to_add = row_strip[37:-19]
                urls.append(to_add.split('%2F">')[0])
            else:
                pass

    clean_urls = []
    for item in urls:
        item = re.sub("%3A", ":", item)
        item = re.sub("%2F", "/", item)
        item = re.sub("%26", "%", item)
        item = re.sub("%3D", "=", item)
        item = re.sub("%3F", "?", item)
        item = re.sub("%23", "#", item)
        clean_urls.append(item)

    return clean_urls

def dataportals_clean_up():
    portals_df = pd.read_csv("https://raw.githubusercontent.com/okfn/dataportals.org/master/data/portals.csv")
    portals_list = list(portals_df.url)
    return portals_list

def url_setup(source, clean_urls):
    root_url_set = set()
    list_of_url_dicts_2 = []
    for item in clean_urls:
        root_url = item.split("/")[2]
        if root_url in root_url_set:
            pass
        else:
            root_url_set.add(root_url)
            url_dict = {}
            url_dict["source"] = source
            url_dict["source_url"] = item
            url_dict["root_url"] = root_url
            url_dict["base_url"] = item.split(root_url)[0]+root_url
            list_of_url_dicts_2.append(url_dict)
    return list_of_url_dicts_2

clean_shades_urls = datashades_clean_up(datashades_raw_list)
clean_portals_urls = dataportals_clean_up()
shades = url_setup("datashades.info", clean_shades_urls)
portals = url_setup("dataportals.org", clean_portals_urls)
wprdc = url_setup("WPRDC", outside_list)

In [4]:
def duplicate_removal_processing(list_of_lists_to_deduplicate):
    unique_urls = set()
    output_list = []
    for item in list_of_lists_to_deduplicate:
        for items in item:
            if items['root_url'] in unique_urls:
                pass
            else:
                output_list.append(items)
                unique_urls.add(items['root_url'])
    return output_list

list_of_lists = [shades, portals, wprdc]
combined_and_deduplicated = duplicate_removal_processing(list_of_lists)

In [None]:
def checking_for_response(passed_list):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
    full_error_list = []
    count = 0
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    for item in passed_list:
        count+=1
        try:
            print(f'Now checking url #{count}: {item["source_url"]}')
            response = requests.get(item["source_url"], verify=False, headers=headers, timeout=120)
            status = response.status_code
            soup = BeautifulSoup(response.text, features="html.parser")
            name = str(soup.title.string)
            meta_tags = soup.find_all("meta")
            if len(meta_tags)>0:
                for element in meta_tags:
                    if element.get("name") == "generator":
                        generator = element.get("content")
                    else:
                        generator = ""
            else:
                generator = ""
        except AttributeError:
            name = "AttributeError"
        except requests.exceptions.SSLError as ssl_error:
            name = "SSL Error"
            generator = ""
        except requests.exceptions.ConnectionError as connect_error:
            name = "Connection Error"
            generator = ""
        except requests.exceptions.TooManyRedirects:
            name = "Too Many Redirects Error"
            generator = ""
        except requests.exceptions.Timeout:
            name = "Timeout"
            generator = ""
        except Exception as e:
                try:
                    name = e.response.text
                except:
                    name = "Error"
        item["name"] = name
        item["generator"] = generator
        item["status_code"] = status
    return passed_list

status_response = checking_for_response(combined_and_deduplicated)

In [7]:
stats_dict = {"datashades.info": {"original_url_list_length": len(shades), "count":0, "200":0}, 
              "dataportals.org": {"original_url_list_length": len(portals), "count":0, "200":0}, 
              "WPRDC": {"original_url_list_length": len(wprdc), "count":0, "200":0}}

for item in status_response:
    for items in stats_dict:
        if item['source'] == items:
            stats_dict[items]['count']+=1
            if item['status_code'] == 200:
                stats_dict[items]['200']+=1

for item in stats_dict:
    print(f'{item}: {stats_dict[item]}')

datashades.info: {'original_url_list_length': 516, 'count': 516, '200': 478}
dataportals.org: {'original_url_list_length': 592, 'count': 510, '200': 419}
WPRDC: {'original_url_list_length': 5, 'count': 5, '200': 5}


In [None]:
def ckan_status_show(passed_list):
    full_error_list = []
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
    x=0
    for item in passed_list:
        x+=1
        print(f'Now performing a status_show api call on site #{x}: {item["root_url"]}')
        try:
            response = requests.get(f'{item["source_url"]}/api/3/action/status_show', verify=False, headers=headers, timeout=120)
            content = json.loads(response.content)
            item["api_base_url"] = content["result"]["site_url"]
            item["site_title"] = content["result"]["site_title"]
            item["version"] = content["result"]["ckan_version"]
            item["locale"] = content["result"]["locale_default"]
            item["extensions"] = content["result"]["extensions"]
            item["source_or_base"] = "source"
        except Exception as e:
            try:
                response = requests.get(f'{item["base_url"]}/api/3/action/status_show', verify=False, headers=headers, timeout=120)
                content = json.loads(response.content)
                item["api_base_url"] = content["result"]["site_url"]
                item["site_title"] = content["result"]["site_title"]
                item["version"] = content["result"]["ckan_version"]
                item["locale"] = content["result"]["locale_default"]
                item["extensions"] = content["result"]["extensions"]
                item["source_or_base"] = "base"
            except Exception as e:
                error_list = [item["source_url"], (e.args)]
                full_error_list.append(error_list)
                pass
    return passed_list

status_show = ckan_status_show(status_response)

In [None]:
def ckan_all_other_functions(passed_list):
    full_error_list_packages = []
    full_error_list_orgs = []
    full_error_list_tags = []
    full_error_list_dates = []
    full_error_list_new_dates = []
    x=0
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
    api_calls = ['package_list', 'tag_list', 'organization_list']
    def api_check(dict_to_check, url_category, api_call):
        url = f'{dict_to_check[url_category]}/api/3/action/{api_call}'
        response = requests.get(url, verify=False, headers=headers, timeout=120)
        content = json.loads(response.content)
        dict_to_check[f"{api_call}_count"] = len(content["result"])
        dict_to_check[f"{api_call}_source_base_or_apibase"] = url_category
        return dict_to_check

    def date_check(dict_to_check, url_category, current_best_metadata_date):
        url = f'{dict_to_check[url_category]}/api/3/action/current_package_list_with_resources?limit=2000000'
        response = requests.get(url, verify=False, headers=headers, timeout=120)
        content = json.loads(response.content)
        for items in content["result"]:
            metadata_creation_date = items["metadata_created"]
            m_c_d = parser.parse(metadata_creation_date)
            if m_c_d < current_best_metadata_date:
                current_best_metadata_date = m_c_d
            else:
                pass
        item["oldest_metadata_created_date"] = current_best_metadata_date
        most_recent_update_date = content["result"][0]["metadata_modified"]
        item["most_recent_update_date"] = parser.parse(most_recent_update_date)
        item["dates_source_base_or_apibase"] = url_category
        return dict_to_check

    for item in passed_list:
        x+=1
        print(f'Now performing additional API calls on site #{x}: {item["root_url"]}')
        current_best_metadata_date = datetime.now()
        for call in api_calls:
            try:
                api_check(item, "source_url", call)
            except Exception as e:
                try:
                    api_check(item, "base_url", call)
                except Exception as e:
                    try:
                        api_check(item, "api_base_url", call)
                    except Exception as e:
                        error_list = [item["source_url"], (e.args)]
                        full_error_list_packages.append(error_list)
                        pass
        try:
            date_check(item, "source_url", current_best_metadata_date)
        except Exception as e:
            try:
                date_check(item, "base_url", current_best_metadata_date)
            except Exception as e:
                try:
                    date_check(item, "api_base_url", current_best_metadata_date)
                except Exception as e:
                    error_list = [item["source_url"], (e.args)]
                    full_error_list_dates.append(error_list)
    return passed_list

all_requests_made = ckan_all_other_functions(status_show)

In [10]:
write_output_file(all_requests_made, "ckan_requests_output.csv")

In [11]:
def analysis_prep(all_urls_final):
    keys_to_pop_1 = ['api_base_url',
               'site_title',
               'version',
               'locale',
               'extensions',
               'package_list_count',
               'organization_list_count',
               'tag_list_count',
               'oldest_metadata_created_date',
               'most_recent_update_date',
               ]
    keys_to_pop_2 = ['package_list_source_base_or_apibase',
                     'tag_list_source_base_or_apibase',
                     'organization_list_source_base_or_apibase',
                     'dates_source_base_or_apibase']

    for item in all_urls_final:
        for key in keys_to_pop_1:
            if key in item.keys():
#                 if pd.isna(item[key]):
#                     del item[key]
                if item[key] == '':
                    del item[key]
    for item in all_urls_final:
        for keys in keys_to_pop_2:
            if keys in item.keys():
                del item[keys]

#     for item in all_urls_final:
#         if 'oldest_metadata_created_date' in item.keys():
#             item['oldest_metadata_created_date'] = datetime.strptime(item['oldest_metadata_created_date'][:10], '%Y-%m-%d')
#         if 'most_recent_update_date' in item.keys():
#             item['most_recent_update_date'] = datetime.strptime(item['most_recent_update_date'][:10], '%Y-%m-%d')
    all_urls_final_df = pd.DataFrame(all_urls_final)
    all_urls_final_df.to_csv("all_urls_final-jul6.csv")
    return all_urls_final

prepped = analysis_prep(all_requests_made)

In [19]:
def package_counts(all_urls_final):
    count_of_instances_with_packages = 0
    packages_counts = []
    count_of_datasets_below_1000 = 0
    count_of_datasets_above_50k = 0
    for item in all_urls_final:
        try:
            if item['package_list_count']:
                count_of_instances_with_packages += 1
                packages_counts.append(int(item['package_list_count']))
                if int(item['package_list_count']) < 1001:
                    count_of_datasets_below_1000 += 1
                if int(item['package_list_count']) > 50000:
                    count_of_datasets_above_50k += 1
        except KeyError as e:
            pass
        package_median = statistics.median(packages_counts)
    export_dict = {"count_of_instances_with_packages": count_of_instances_with_packages,
                   "package_median": package_median,
                   "count_of_datasets_below_1000": count_of_datasets_below_1000,
                   "count_of_datasets_above_50k": count_of_datasets_above_50k
    }
    for item in export_dict:
        print(f'{item}: {export_dict[item]}')
        
package_stats = package_counts(prepped)

count_of_instances_with_packages: 365
package_median: 250
count_of_datasets_below_1000: 256
count_of_datasets_above_50k: 13


In [None]:
def ckan_packages_chart(all_urls_final):
    packages = []
    for row in all_urls_final:
        try:
            package_count = row['package_list_count']
            if package_count != '':
                packages.append(int(package_count))
        except KeyError:
            pass

    sorted_sizes_adjusted = sorted([1 if s == 0 else s for s in packages], reverse=True)
    sizes_plus_ranks = {'rank': range(len(sorted_sizes_adjusted)), 'package_count': sorted_sizes_adjusted}

    warnings.filterwarnings("ignore", category=UserWarning)
    fig, ax = plt.subplots(figsize=(10,10))
    seaborn.histplot(data=sizes_plus_ranks, x='package_count', log_scale=(True, False))
    ax.set_xlabel("Count of Datasets per Instance")
    ax.set_ylabel("Number of Instances")
    plt.xticks(rotation=45)
    ax.set_xticklabels(['0', '.1', '1', '10', '100', '1000', '10,000', '100,000', '1,000,000'])
    plt.title("Fig. 1: Bar Chart of CKAN Instance Size (Measured in Number of Datasets)\n", size=8)
    plt.savefig("packages_chart.png")
    plt.show()

ckan_packages_chart(prepped)

In [None]:
def time_calcs(all_urls_final):
    year_list = []
    time_dict = {}
    today = datetime.today()
    url_list = []
    overall_count = 0
    for y in range(2007, 2024):
        annual_count = 0
        age = 0
        timedeltas = []
        age_pct = []
        for item in all_urls_final:
            try:
                if 'oldest_metadata_created_date' in item.keys():
                    if 'most_recent_update_date' in item.keys():
                        if type(item['oldest_metadata_created_date']) == datetime:
                            if type(item['most_recent_update_date']) == datetime:
                                year_list.append((item['oldest_metadata_created_date'], item['most_recent_update_date']))
                                if int(item['oldest_metadata_created_date'].strftime('%Y')) == y:
                                    annual_count+=1
                                    overall_count+=1
                                    oldest = item['oldest_metadata_created_date']
                                    most_recent = item['most_recent_update_date']
                                    timedeltas.append(int((most_recent-oldest).days))
                                    age_pct.append(int((most_recent-oldest).days) / int((today - oldest).days))
                                    url_list.append(item['source_url'])

            except Exception as e:
                print(item['oldest_metadata_created_date'], item['source_url'], e)
        if len(timedeltas)>0:
            avg_age = round((sum(timedeltas)/len(timedeltas))/365, 2)
            median_age = round(statistics.median(timedeltas)/365, 2)
        else:
            avg_age = 0
            median_age = 0
        if len(age_pct)>0:
            age_pct_amt = round(((sum(age_pct))/(len(age_pct))), 2)
        else:
            age_pct_amt = 0
        time_dict[y] = {"year":y, "count": annual_count, "avg_age":avg_age, "median":median_age, "average_lifespan":age_pct_amt}
    return(time_dict, overall_count)

time_dict, overall_count = time_calcs(prepped)

In [None]:
def graphing(time_dict):
    time_dict = sorted(time_dict.items())
    list_of_time_dicts = [item[1] for item in time_dict]
    count_list = []
    age_list = []
    lifespan_list = []
    for item in list_of_time_dicts:
        if item["count"] > 0:
            count_list.append((item["year"], item["count"]))
            age_list.append((item["year"], item["avg_age"]))
            lifespan_list.append((item["year"], item["average_lifespan"]))

    x_axis_count, y_axis_count = zip(*count_list)
    x_axis_age, y_axis_age = zip(*age_list)
    x_axis_life, y_axis_life = zip(*lifespan_list)

    fig, ax = plt.subplots(figsize=(8, 8))
    bars = ax.bar(x_axis_count, y_axis_count)
    ax.set_xlabel("Year Launched", fontsize=12)
    ax.set_ylabel("Count of Instances", fontsize=12)
    x_list = [2007, 2011, 2013, 2015, 2017, 2019, 2021, 2023]
    x = np.array(x_list)
    plt.xticks(x, fontsize=12)
    ax.bar_label(bars)
    ax.set_xlim(2006, 2024)
    plt.title("Fig. 3: Count of CKAN Instances Launched Per Year, 2007-2023\n", size=8)
    plt.savefig("ckan_per_year.png")

    fig, ax = plt.subplots(figsize=(8, 8))
    bars = ax.bar(x_axis_life, y_axis_life, color="#1f77b4")
    ax.set_xlabel("\nYear Launched", fontsize=12)
    ax.set_ylabel("Average Percent of Lifetime Spent Active\n", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    x_list = [2007, 2011, 2013, 2015, 2017, 2019, 2021, 2023]
    x = np.array(x_list)
    labels = [str(item) for item in x_axis_life]
    plt.xticks(x, fontsize=12)
    rects = ax.patches
    for rect, label in zip(rects, y_axis_count):
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width() / 2, height + 0.01, label, ha="center", va="bottom")
    fig.savefig("test.png", bbox_inches="tight")
    plt.title("Fig. 4: Average Percent of CKAN Instance 'Lifetime' Spent Active, by Year of Release\n", size=8)
    plt.savefig("ckan_lifetime.png")
    
graphing(time_dict)

In [23]:
def ckan_version(all_urls_final):
    version_list = []
    for item in all_urls_final:
        try:
            version_list.append(item['version'].split('.')[:2])
        except KeyError:
            pass

    for item in version_list:
        if "b" in item[1]:
            item[1] = item[1].replace("b", "")

    version_3_list = [item for item in version_list if item[0] == '3']
    for item in version_3_list:
        version_list.remove(item)

    version_3_list_final = [item for item in version_3_list if item[1] != "0#datapress"]
    a, b = zip(*version_list)
    version_count = Counter(b)
    version_dict = {f'2.{item}': version_count[item] for item in version_count}

    if len(version_3_list_final) > 0:
        c, d = zip(*version_3_list_final)
        version_3_count = Counter(d)
        version_3_dict = {f'3.{item}': version_3_count[item] for item in version_3_count}
        for item in version_3_dict:
            version_dict[item] = version_3_dict[item]
    x_data = list(sorted(version_dict))
    if '2.10' in x_data:
        if x_data[-1] == '2.10':
            pass
        elif x_data[2] == '2.10':
            x_data.pop(2)
            if x_data[-1] == '2.9':
                x_data.append('2.10')
            else:
                for x in range(len(x_data)):
                    if x_data[x][0] == '3':
                        insert_point = x
                        break
                x_data.insert(insert_point, '2.10')
    y_data = [version_dict[item] for item in x_data]
    return(x_data, y_data)

x_data, y_data = ckan_version(prepped)

In [None]:
def ckan_version_chart(x_data, y_data):
    fig, ax = plt.subplots(figsize=(8,8))
    bars = ax.bar(x_data, y_data, color="#1f77b4")
    ax.set_xlabel("\nCKAN Version", fontsize=12)
    ax.set_ylabel("Count of CKAN Instances\n", fontsize=12)
    ax.tick_params(axis='y', labelsize=10)
    labels = [str(item) for item in x_data]
    plt.xticks(labels, fontsize=12)
    rects = ax.patches
    for rect, label in zip(rects, y_data):
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width()/2, height+ 0.01, label, ha="center", va="bottom")
    fig.savefig("test.png", bbox_inches="tight")
    plt.title("Fig. 5: Count of Analyed CKAN Instances Using CKAN Versions 2.0-2.10\n", size=8)
    plt.savefig("version_count.png")
    plt.show()
    
ckan_version_chart(x_data, y_data)