In [8]:
# Step 2: Import required libraries
import json
import bz2
from elasticsearch import Elasticsearch, helpers
import pandas as pd

# Step 3: Connect to Elasticsearch
# Module-level client shared by create_index(), index_wikidata() and the
# verification query at the end of this cell.
# NOTE(review): the host name "elastic" presumably resolves to a container /
# compose service — confirm for the environment this notebook runs in.
es = Elasticsearch(['http://elastic:9200'])

# Step 4: Create the Index with Zero Replicas
def create_index():
    """Create the ``wikidata`` index with one shard and zero replicas.

    The request is sent through a client configured to ignore HTTP 400
    responses, so a 400 from Elasticsearch does not raise here.
    NOTE(review): presumably this is meant to tolerate "index already exists";
    be aware that it hides every other bad-request error as well.
    """
    index_settings = {
        'number_of_shards': 1,
        'number_of_replicas': 0,
    }
    tolerant_client = es.options(ignore_status=[400])
    tolerant_client.indices.create(
        index='wikidata',
        body={'settings': index_settings},
    )

# Step 5: Function to parse the Wikidata JSON dump and extract labels
def parse_wikidata_dump(file_path, limit):
    """Yield Elasticsearch bulk actions for dump entities that have an English label.

    The file is read as bz2-compressed UTF-8 text, one JSON document per line.
    A single trailing comma on a line is removed before decoding, and the
    ``[`` / ``]`` lines that wrap the array are skipped, as are blank lines.

    Args:
        file_path: Path of the bz2-compressed JSON dump.
        limit: Maximum number of actions to yield. ``None`` means no limit;
            zero or a negative value yields nothing.

    Yields:
        dict: ``{'_index': 'wikidata', '_source': {'id': <entity id>,
        'label': <English label value>}}``.

    Lines that cannot be decoded as JSON are reported on stdout and skipped.
    """
    # The limit used to be tested only after the first yield, so limit=0 still
    # produced one action. Handle the "nothing requested" case up front.
    if limit is not None and limit <= 0:
        return

    emitted = 0
    with bz2.open(file_path, 'rt', encoding='utf-8') as dump:
        for raw_line in dump:
            line = raw_line.strip()
            if line.endswith(','):
                line = line[:-1]
            # Blank lines are not entities; do not report them as JSON errors.
            if not line or line in ('[', ']'):
                continue

            try:
                item = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")
                continue

            # Guard against lines that decode to something other than an
            # entity object, and against a non-mapping "labels" value.
            if not isinstance(item, dict):
                continue
            labels = item.get('labels')
            if not isinstance(labels, dict) or 'en' not in labels:
                continue

            yield {
                '_index': 'wikidata',
                '_source': {
                    'id': item['id'],
                    'label': labels['en']['value']
                }
            }
            emitted += 1
            if limit is not None and emitted >= limit:
                return

# Step 6: Index data into Elasticsearch
def index_wikidata(file_path, limit):
    """Bulk-index labelled entities from the dump at ``file_path``.

    The actions produced by ``parse_wikidata_dump(file_path, limit)`` are
    passed to ``helpers.bulk`` together with the module-level ``es`` client.
    The value returned by ``helpers.bulk`` is discarded.
    """
    helpers.bulk(es, parse_wikidata_dump(file_path, limit))

# Step 7: Create the index, load, and index the data
create_index()
wikidata_dump_file = 'my-data/latest-all.json.bz2'
sample_size = 1000  # Number of items to index
index_wikidata(wikidata_dump_file, sample_size)

# Step 8: Verify the indexed data
# Freshly indexed documents only become visible to search after a refresh.
# Force one so the verification query cannot race the bulk request and
# report fewer hits than were actually indexed.
es.indices.refresh(index="wikidata")
res = es.search(index="wikidata", body={"query": {"match_all": {}}, "size": sample_size})
for hit in res['hits']['hits']:
    print(hit['_source'])

{'id': 'Q31', 'label': 'Belgium'}
{'id': 'Q8', 'label': 'happiness'}
{'id': 'Q23', 'label': 'George Washington'}
{'id': 'Q24', 'label': 'Jack Bauer'}
{'id': 'Q42', 'label': 'Douglas Adams'}
{'id': 'Q1868', 'label': 'Paul Otlet'}
{'id': 'Q2013', 'label': 'Wikidata'}
{'id': 'Q45', 'label': 'Portugal'}
{'id': 'Q51', 'label': 'Antarctica'}
{'id': 'Q58', 'label': 'penis'}
{'id': 'Q68', 'label': 'computer'}
{'id': 'Q75', 'label': 'Internet'}
{'id': 'Q102', 'label': 'pneumonoultramicroscopicsilicovolcanoconiosis'}
{'id': 'Q103', 'label': 'Supercalifragilisticexpialidocious'}
{'id': 'Q125', 'label': 'November'}
{'id': 'Q140', 'label': 'lion'}
{'id': 'Q144', 'label': 'dog'}
{'id': 'Q147', 'label': 'kitten'}
{'id': 'Q148', 'label': "People's Republic of China"}
{'id': 'Q155', 'label': 'Brazil'}
{'id': 'Q163', 'label': 'Yorkshire'}
{'id': 'Q177', 'label': 'pizza'}
{'id': 'Q178', 'label': 'pasta'}
{'id': 'Q183', 'label': 'Germany'}
{'id': 'Q207', 'label': 'George W. Bush'}
{'id': 'Q210', 'label': 

In [3]:
import pandas as pd
# Preview the first 100 rows only. nrows reads and closes the file in one call,
# whereas next(pd.read_csv(..., chunksize=100)) left the chunked reader (and its
# file handle) open and unreferenced.
df = pd.read_csv("./my-data/organization_descriptions.csv", nrows=100)

In [4]:
df

Unnamed: 0,uuid,name,type,permalink,cb_url,rank,created_at,updated_at,description
0,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,organization,wetpaint,https://www.crunchbase.com/organization/wetpaint,178728,2007-05-25 13:51:27,2024-03-21 11:01:38,Wetpaint is a technology platform company that...
1,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,Zoho,organization,zoho,https://www.crunchbase.com/organization/zoho,181583,2007-05-26 02:30:28,2024-04-16 03:23:50,"Zoho offers a suite of business, collaboration..."
2,5f2b40b8-d1b3-d323-d81a-b7a8e89553d0,Digg,organization,digg,https://www.crunchbase.com/organization/digg,31367,2007-05-26 03:03:23,2024-01-26 08:06:40,Digg Inc. operates a website that enables its ...
3,f4d5ab44-058b-298b-ea81-380e6e9a8eec,Omidyar Network,organization,omidyar-network,https://www.crunchbase.com/organization/omidya...,79998,2007-05-26 03:21:34,2024-06-06 16:26:51,"Pierre Omidyar, the founder of eBay, and his w..."
4,df662812-7f97-0b43-9d3e-12f64f504fbb,Meta,organization,facebook,https://www.crunchbase.com/organization/facebook,12068,2007-05-26 04:22:15,2023-11-27 12:05:27,Meta is a social technology company that enabl...
...,...,...,...,...,...,...,...,...,...
95,b7c000b8-0b00-9936-7e3f-7954f1af2949,Flybridge,organization,flybridge-capital,https://www.crunchbase.com/organization/flybri...,248279,2007-06-28 15:32:52,2024-06-06 09:37:48,Flybridge is a seed-stage VC investing with en...
96,93ef5a7c-8369-f255-8533-0861c3ad43ad,CriticalMetrics,organization,criticalmetrics,https://www.crunchbase.com/organization/critic...,2840402,2007-06-28 15:48:04,2019-06-24 21:22:32,"A startup by Suck.com alum, Joey Anuff, Critic..."
97,424cedf8-20ac-2ffe-8a09-689a76f69e3c,ZenZui,organization,zenzui,https://www.crunchbase.com/organization/zenzui,1764950,2007-06-28 15:51:40,2020-07-08 03:12:53,ZenZui a new mobile browser that aims to make ...
98,d68fcedd-3a0d-28f9-4805-9f22211c364f,Spock,organization,spock,https://www.crunchbase.com/organization/spock,333083,2007-06-28 16:02:56,2023-12-05 10:11:05,Spock is a people search engine which collects...


In [5]:
import pandas as pd
# Preview the first 100 rows only. nrows reads and closes the file in one call,
# whereas next(pd.read_csv(..., chunksize=100)) left the chunked reader (and its
# file handle) open and unreferenced.
df = pd.read_csv("./my-data/organizations.csv", nrows=100)
df

Unnamed: 0,uuid,name,type,permalink,cb_url,rank,created_at,updated_at,legal_name,roles,...,phone,facebook_url,linkedin_url,twitter_url,logo_url,alias1,alias2,alias3,primary_role,num_exits
0,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,organization,wetpaint,https://www.crunchbase.com/organization/wetpaint,178728,2007-05-25 13:51:27,2024-03-21 11:01:38,,company,...,206-859-6300,https://www.facebook.com/Wetpaint,https://www.linkedin.com/company/recruitment-c...,https://twitter.com/wetpainttv,https://images.crunchbase.com/image/upload/t_c...,,,,company,
1,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,Zoho,organization,zoho,https://www.crunchbase.com/organization/zoho,181583,2007-05-26 02:30:28,2024-04-16 03:23:50,Zoho Corporation Pvt. Ltd.,"investor,company",...,1800 103 1123 /1800 572 3535,http://www.facebook.com/zoho,https://www.linkedin.com/company/zoho,http://twitter.com/zoho,https://images.crunchbase.com/image/upload/t_c...,,,,company,2.0
2,5f2b40b8-d1b3-d323-d81a-b7a8e89553d0,Digg,organization,digg,https://www.crunchbase.com/organization/digg,31367,2007-05-26 03:03:23,2024-01-26 08:06:40,"Digg Holdings, LLC",company,...,,http://www.facebook.com/digg,http://www.linkedin.com/company/digg,http://twitter.com/digg,https://images.crunchbase.com/image/upload/t_c...,,,,company,
3,f4d5ab44-058b-298b-ea81-380e6e9a8eec,Omidyar Network,organization,omidyar-network,https://www.crunchbase.com/organization/omidya...,79998,2007-05-26 03:21:34,2024-06-06 16:26:51,Omidyar Network Services LLC,"investor,company",...,650.482.2500,http://www.facebook.com/OmidyarNetwork,http://www.linkedin.com/company/22806,http://twitter.com/OmidyarNetwork,https://images.crunchbase.com/image/upload/t_c...,,,,investor,63.0
4,df662812-7f97-0b43-9d3e-12f64f504fbb,Meta,organization,facebook,https://www.crunchbase.com/organization/facebook,12068,2007-05-26 04:22:15,2023-11-27 12:05:27,"Meta Platforms, Inc.","investor,company",...,,https://www.facebook.com/Meta,https://www.linkedin.com/company/meta,https://www.twitter.com/Meta,https://images.crunchbase.com/image/upload/t_c...,Facebook,"Facebook, Inc.",FB,company,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,3d02b92f-668f-1a97-5c12-5d7646ae3ffe,Aggregate Knowledge,organization,aggregateknowledge,https://www.crunchbase.com/organization/aggreg...,198144,2007-06-28 15:04:25,2021-09-20 12:24:55,"Aggregate Knowledge, Inc.",company,...,(877) 245-5277,,https://www.linkedin.com/company/aggregate-kno...,http://twitter.com/akIntelligence,https://images.crunchbase.com/image/upload/t_c...,,,,company,
96,74cb3f14-aac0-d2a7-df71-0fd6236cda6b,Zing Systems,organization,zing,https://www.crunchbase.com/organization/zing,377344,2007-06-28 15:27:24,2024-01-09 18:29:37,,company,...,(650)267-2400,,,,https://images.crunchbase.com/image/upload/t_c...,,,,company,
97,b7c000b8-0b00-9936-7e3f-7954f1af2949,Flybridge,organization,flybridge-capital,https://www.crunchbase.com/organization/flybri...,248279,2007-06-28 15:32:52,2024-06-06 09:37:48,Flybridge Capital Partners,"investor,company",...,617.307.9292,,https://www.linkedin.com/company/flybridge-cap...,http://twitter.com/flybridge,https://images.crunchbase.com/image/upload/t_c...,IDG Ventures Boston,,,investor,74.0
98,93ef5a7c-8369-f255-8533-0861c3ad43ad,CriticalMetrics,organization,criticalmetrics,https://www.crunchbase.com/organization/critic...,2840402,2007-06-28 15:48:04,2019-06-24 21:22:32,,company,...,,,,,https://images.crunchbase.com/image/upload/t_c...,,,,company,


In [40]:
import bz2
import json
import os
import sys
import traceback
from pymongo import MongoClient
from tqdm import tqdm
from datetime import datetime


def create_indexes(db):
    """Create the indexes for the ``cache``, ``items``, ``literals`` and ``types`` collections.

    For each collection, in the order listed below, an optional compound
    unique index is created first, followed by one ascending single-field
    index per listed field.

    Args:
        db: Object that returns a collection for ``db[name]``; each collection
            must provide ``create_index``.
    """
    ascending = 1

    # Fields that each get their own ascending single-field index.
    single_field = {
        'cache': ['cell', 'lastAccessed', "limit"],
        'items': ['id_entity', 'entity', 'category', 'popularity'],
        'literals': ['id_entity', 'entity'],
        'types': ['id_entity', 'entity'],
    }

    # Compound indexes: (field names in key order, extra create_index options).
    compound = {
        'cache': (
            ["name", "limit", "kg", "fuzzy", "types", "kind", "NERtype", "language"],
            {'unique': True, 'background': True},
        ),
        'items': (
            ['entity', 'kind'],
            {'unique': True},
        ),
    }

    for name, fields in single_field.items():
        target = db[name]
        if name in compound:
            key_names, options = compound[name]
            target.create_index([(key, ascending) for key in key_names], **options)
        for field in fields:
            target.create_index([(field, ascending)])


# MongoDB connection setup
# The MONGO_ENDPOINT environment variable is split on ":" into exactly two
# parts (host and port); any other shape fails the tuple unpacking.
MONGO_ENDPOINT, MONGO_ENDPOINT_PORT = os.environ["MONGO_ENDPOINT"].split(":")
MONGO_ENDPOINT_PORT = int(MONGO_ENDPOINT_PORT)
current_date = datetime.now()
formatted_date = current_date.strftime("%d%m%Y")
DB_NAME = "crunchbase"

client = MongoClient(MONGO_ENDPOINT, MONGO_ENDPOINT_PORT)
database = client[DB_NAME]
log_c = database.log
items_c = database.items
literals_c = database.literals
types_c = database.types

# Target collection for each buffer key; flush_buffer() looks collections up here.
c_ref = {"items": items_c, "literals": literals_c, "types": types_c}

create_indexes(database)

# One list of pending documents per target collection.
buffer = {name: [] for name in c_ref}


def flush_buffer(buffer):
    """Write every non-empty pending list to its collection and reset it.

    Args:
        buffer: Mapping of collection key to a list of documents. For each
            key with at least one document, the list is passed to
            ``insert_many`` on ``c_ref[key]`` and the entry is then replaced
            by a new empty list.
    """
    for name, pending in list(buffer.items()):
        if not pending:
            continue
        c_ref[name].insert_many(pending)
        buffer[name] = []
            

def classify_value(value):
    """Classify a raw value as ``'DATETIME'``, ``'NUMBER'`` or ``'STRING'``.

    The checks run in order: a value that parses as an ISO-format datetime is
    ``'DATETIME'``; otherwise a value accepted by ``float()`` is ``'NUMBER'``;
    everything else (including ``None``) is ``'STRING'``.

    Args:
        value: The value to classify; typically a string, but any object is
            accepted.

    Returns:
        str: One of ``'DATETIME'``, ``'NUMBER'``, ``'STRING'``.
    """
    # The previous implementation called dateutil.parser.isoparse, but
    # dateutil is not imported anywhere in the visible notebook, so every call
    # raised NameError (which the except clauses below do not catch). The
    # standard library parser is used instead; it is imported locally so this
    # function does not depend on a module-level name.
    # NOTE: which ISO 8601 variants fromisoformat accepts depends on the
    # Python version (the accepted set was widened in 3.11).
    from datetime import datetime

    try:
        # Non-string input raises TypeError and falls through to the number check.
        datetime.fromisoformat(value)
        return 'DATETIME'
    except (ValueError, TypeError):
        pass
    try:
        float(value)
        return 'NUMBER'
    except (ValueError, TypeError):
        pass
    # Neither a datetime nor a number.
    return 'STRING'
    
def parse_data(index, columns, data, addional_data):
    """Build the per-entity documents and queue them in the module-level ``buffer``.

    NOTE(review): this function looks unfinished.
      * None of the parameters (``index``, ``columns``, ``data``,
        ``addional_data``) is read in the body.
      * The names ``i``, ``entity``, ``description``, ``all_labels``,
        ``all_aliases``, ``popularity`` and ``BATCH_SIZE`` are not assigned in
        this function, nor anywhere in the visible part of the notebook; unless
        they exist as globals in the kernel, calling this raises ``NameError``.

    Side effects: appends one document to each list in ``buffer`` and calls
    ``flush_buffer(buffer)`` when ``buffer["items"]`` holds exactly
    ``BATCH_SIZE`` documents. Returns ``None``.
    """
    # Never populated in the code visible here.
    objects = {}
    # One (empty) mapping per literal datatype.
    literals = {datatype: {} for datatype in ["STRING", "DATETIME", "NUMBER"]}
    # Every entity gets the same type entry.
    # NOTE(review): "P31" is presumably the Wikidata "instance of" property — confirm.
    types = {"P31": ["Organization"]}
    # Candidate document for each destination, keyed like ``buffer``.
    join = {
        "items": {
            "id_entity": i,
            "entity": entity,
            "description": description,
            "labels": all_labels,
            "aliases": all_aliases,
            "types": types,
            "popularity": popularity,
            "kind": "entity",
            "NERtype": "ORG"
        },
        "objects": { 
            "id_entity": i,
            "entity": entity,
            "objects":objects
        },
        "literals": { 
            "id_entity": i,
            "entity": entity,
            "literals": literals
        },
        "types": { 
            "id_entity": i,
            "entity": entity,
            "types": types
        },
    }

    

    # Only keys present in ``buffer`` are queued; as initialised in this
    # notebook, ``buffer`` has no "objects" key, so join["objects"] is unused.
    for key in buffer:
        buffer[key].append(join[key])            

    # Equality (not >=): the flush triggers only when the count lands exactly
    # on BATCH_SIZE.
    if len(buffer["items"]) == BATCH_SIZE:
        flush_buffer(buffer)


           
# Read large CSV file in chunks
chunk_size = 1000  # Adjust chunk size as needed
file_path = './my-data/organizations.csv'  # Update with your file path

# Determine the number of chunks for progress bar
# Count physical lines inside a context manager so the handle is closed
# (the bare open() used before was never closed); binary mode avoids decoding
# the whole file just to count newlines.
with open(file_path, 'rb') as csv_file:
    total_lines = sum(1 for _ in csv_file)
# The first line is the CSV header, not a data row. Counting it made the bar
# expect one chunk too many whenever the row count was a multiple of
# chunk_size. This is still an estimate: a quoted field containing newlines
# spans several physical lines.
total_rows = max(total_lines - 1, 0)
total_chunks = -(-total_rows // chunk_size)  # ceiling division
index = 0
# Process the file in chunks
with tqdm(total=total_chunks, desc="Processing") as pbar:
    for chunk in pd.read_csv(file_path, chunksize=chunk_size):
        #process_and_insert(chunk, items)
        columns = chunk.columns
        # NOTE(review): debugging placeholder — prints every field of every
        # row and inserts nothing, despite the final message below.
        for _, data in chunk.iterrows():
            for column in columns:
                print(data[column])
        pbar.update(1)

print("Finished processing and inserting documents.")

Processing: 100%|█████████▉| 3493/3494 [00:24<00:00, 141.00it/s]

Finished processing and inserting documents.





Index(['uuid', 'name', 'type', 'permalink', 'cb_url', 'rank', 'created_at',
       'updated_at', 'legal_name', 'roles', 'domain', 'homepage_url',
       'country_code', 'state_code', 'region', 'city', 'address',
       'postal_code', 'status', 'short_description', 'category_list',
       'category_groups_list', 'num_funding_rounds', 'total_funding_usd',
       'total_funding', 'total_funding_currency_code', 'founded_on',
       'last_funding_on', 'closed_on', 'employee_count', 'email', 'phone',
       'facebook_url', 'linkedin_url', 'twitter_url', 'logo_url', 'alias1',
       'alias2', 'alias3', 'primary_role', 'num_exits'],
      dtype='object')

In [None]:
# Print each field of the first row of `chunk`, one value per line.
# NOTE(review): `chunk` is not defined in this cell; it is whatever the last
# chunked-read loop left behind in the kernel.
columns = chunk.columns
for _, data in chunk.iloc[:1].iterrows():
    for column in columns:
        print(data[column])

In [3]:
import pandas as pd
# Preview the first 100 rows only. nrows reads and closes the file in one call,
# whereas next(pd.read_csv(..., chunksize=100)) left the chunked reader (and its
# file handle) open and unreferenced.
df = pd.read_csv("../data/organizations.csv", nrows=100)
df

Unnamed: 0,uuid,name,type,permalink,cb_url,rank,created_at,updated_at,legal_name,roles,...,phone,facebook_url,linkedin_url,twitter_url,logo_url,alias1,alias2,alias3,primary_role,num_exits
0,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,organization,wetpaint,https://www.crunchbase.com/organization/wetpaint,178728,2007-05-25 13:51:27,2024-03-21 11:01:38,,company,...,206-859-6300,https://www.facebook.com/Wetpaint,https://www.linkedin.com/company/recruitment-c...,https://twitter.com/wetpainttv,https://images.crunchbase.com/image/upload/t_c...,,,,company,
1,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,Zoho,organization,zoho,https://www.crunchbase.com/organization/zoho,181583,2007-05-26 02:30:28,2024-04-16 03:23:50,Zoho Corporation Pvt. Ltd.,"investor,company",...,1800 103 1123 /1800 572 3535,http://www.facebook.com/zoho,https://www.linkedin.com/company/zoho,http://twitter.com/zoho,https://images.crunchbase.com/image/upload/t_c...,,,,company,2.0
2,5f2b40b8-d1b3-d323-d81a-b7a8e89553d0,Digg,organization,digg,https://www.crunchbase.com/organization/digg,31367,2007-05-26 03:03:23,2024-01-26 08:06:40,"Digg Holdings, LLC",company,...,,http://www.facebook.com/digg,http://www.linkedin.com/company/digg,http://twitter.com/digg,https://images.crunchbase.com/image/upload/t_c...,,,,company,
3,f4d5ab44-058b-298b-ea81-380e6e9a8eec,Omidyar Network,organization,omidyar-network,https://www.crunchbase.com/organization/omidya...,79998,2007-05-26 03:21:34,2024-06-06 16:26:51,Omidyar Network Services LLC,"investor,company",...,650.482.2500,http://www.facebook.com/OmidyarNetwork,http://www.linkedin.com/company/22806,http://twitter.com/OmidyarNetwork,https://images.crunchbase.com/image/upload/t_c...,,,,investor,63.0
4,df662812-7f97-0b43-9d3e-12f64f504fbb,Meta,organization,facebook,https://www.crunchbase.com/organization/facebook,12068,2007-05-26 04:22:15,2023-11-27 12:05:27,"Meta Platforms, Inc.","investor,company",...,,https://www.facebook.com/Meta,https://www.linkedin.com/company/meta,https://www.twitter.com/Meta,https://images.crunchbase.com/image/upload/t_c...,Facebook,"Facebook, Inc.",FB,company,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,3d02b92f-668f-1a97-5c12-5d7646ae3ffe,Aggregate Knowledge,organization,aggregateknowledge,https://www.crunchbase.com/organization/aggreg...,198144,2007-06-28 15:04:25,2021-09-20 12:24:55,"Aggregate Knowledge, Inc.",company,...,(877) 245-5277,,https://www.linkedin.com/company/aggregate-kno...,http://twitter.com/akIntelligence,https://images.crunchbase.com/image/upload/t_c...,,,,company,
96,74cb3f14-aac0-d2a7-df71-0fd6236cda6b,Zing Systems,organization,zing,https://www.crunchbase.com/organization/zing,377344,2007-06-28 15:27:24,2024-01-09 18:29:37,,company,...,(650)267-2400,,,,https://images.crunchbase.com/image/upload/t_c...,,,,company,
97,b7c000b8-0b00-9936-7e3f-7954f1af2949,Flybridge,organization,flybridge-capital,https://www.crunchbase.com/organization/flybri...,248279,2007-06-28 15:32:52,2024-06-06 09:37:48,Flybridge Capital Partners,"investor,company",...,617.307.9292,,https://www.linkedin.com/company/flybridge-cap...,http://twitter.com/flybridge,https://images.crunchbase.com/image/upload/t_c...,IDG Ventures Boston,,,investor,74.0
98,93ef5a7c-8369-f255-8533-0861c3ad43ad,CriticalMetrics,organization,criticalmetrics,https://www.crunchbase.com/organization/critic...,2840402,2007-06-28 15:48:04,2019-06-24 21:22:32,,company,...,,,,,https://images.crunchbase.com/image/upload/t_c...,,,,company,


In [1]:
import pandas as pd
# Preview the first 100 rows only. nrows reads and closes the file in one call,
# whereas next(pd.read_csv(..., chunksize=100)) left the chunked reader (and its
# file handle) open and unreferenced.
df = pd.read_csv("./my-data/organization_descriptions.csv", nrows=100)
df

Unnamed: 0,uuid,name,type,permalink,cb_url,rank,created_at,updated_at,description
0,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,organization,wetpaint,https://www.crunchbase.com/organization/wetpaint,178728,2007-05-25 13:51:27,2024-03-21 11:01:38,Wetpaint is a technology platform company that...
1,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,Zoho,organization,zoho,https://www.crunchbase.com/organization/zoho,181583,2007-05-26 02:30:28,2024-04-16 03:23:50,"Zoho offers a suite of business, collaboration..."
2,5f2b40b8-d1b3-d323-d81a-b7a8e89553d0,Digg,organization,digg,https://www.crunchbase.com/organization/digg,31367,2007-05-26 03:03:23,2024-01-26 08:06:40,Digg Inc. operates a website that enables its ...
3,f4d5ab44-058b-298b-ea81-380e6e9a8eec,Omidyar Network,organization,omidyar-network,https://www.crunchbase.com/organization/omidya...,79998,2007-05-26 03:21:34,2024-06-06 16:26:51,"Pierre Omidyar, the founder of eBay, and his w..."
4,df662812-7f97-0b43-9d3e-12f64f504fbb,Meta,organization,facebook,https://www.crunchbase.com/organization/facebook,12068,2007-05-26 04:22:15,2023-11-27 12:05:27,Meta is a social technology company that enabl...
...,...,...,...,...,...,...,...,...,...
95,b7c000b8-0b00-9936-7e3f-7954f1af2949,Flybridge,organization,flybridge-capital,https://www.crunchbase.com/organization/flybri...,248279,2007-06-28 15:32:52,2024-06-06 09:37:48,Flybridge is a seed-stage VC investing with en...
96,93ef5a7c-8369-f255-8533-0861c3ad43ad,CriticalMetrics,organization,criticalmetrics,https://www.crunchbase.com/organization/critic...,2840402,2007-06-28 15:48:04,2019-06-24 21:22:32,"A startup by Suck.com alum, Joey Anuff, Critic..."
97,424cedf8-20ac-2ffe-8a09-689a76f69e3c,ZenZui,organization,zenzui,https://www.crunchbase.com/organization/zenzui,1764950,2007-06-28 15:51:40,2020-07-08 03:12:53,ZenZui a new mobile browser that aims to make ...
98,d68fcedd-3a0d-28f9-4805-9f22211c364f,Spock,organization,spock,https://www.crunchbase.com/organization/spock,333083,2007-06-28 16:02:56,2023-12-05 10:11:05,Spock is a people search engine which collects...


In [None]:
# Read large CSV file in chunks
chunk_size = 1000  # Adjust chunk size as needed
file_path = './my-data/organization_descriptions.csv'  # Update with your file path

# Determine the number of chunks for progress bar
# Count physical lines inside a context manager so the handle is closed;
# binary mode avoids decoding the whole file just to count newlines.
with open(file_path, 'rb') as csv_file:
    total_lines = sum(1 for _ in csv_file)
# Exclude the header line from the row estimate (quoted fields containing
# newlines can still make this an over-estimate).
total_rows = max(total_lines - 1, 0)
total_chunks = -(-total_rows // chunk_size)  # ceiling division
index = 0
# Result mapping: organization uuid -> {"url", "description", "popularity"}.
data = {}
# Process the file in chunks
with tqdm(total=total_chunks, desc="Processing") as pbar:
    for chunk in pd.read_csv(file_path, chunksize=chunk_size):
        columns = chunk.columns
        # The loop variable used to be called `data` as well, which rebound the
        # result dict to the current row: every entry was written into that
        # row's Series and the mapping was lost. It also shadowed the builtin
        # `id`. Both are renamed here.
        for _, row in chunk.iterrows():
            org_uuid = row["uuid"]
            url = row["cb_url"]
            popularity = row["rank"]
            description = row["description"]
            data[org_uuid] = {
                "url": url,
                "description": description,
                "popularity": popularity
            }
        pbar.update(1)

In [4]:
import pandas as pd
# Loads the whole CSV into memory in one call (no nrows / chunksize).
df = pd.read_csv("../data/organizations.csv")

In [8]:
df["rank"]

0          178728.0
1          181583.0
2           31367.0
3           79998.0
4           12068.0
             ...   
3492519         NaN
3492520         NaN
3492521         NaN
3492522         NaN
3492523         NaN
Name: rank, Length: 3492524, dtype: float64

In [11]:
# Series.mean() skips NaN by default, so this averages only the rows that have
# a rank; int() then truncates the fractional part.
int(df["rank"].mean())

1670122

In [1]:
! pip install column-classifier==0.1.0

Collecting column-classifier==0.1.0
  Downloading column_classifier-0.1.0-py3-none-any.whl.metadata (729 bytes)
Collecting spacy (from column-classifier==0.1.0)
  Downloading spacy-3.7.6.tar.gz (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting spacy-legacy<3.1.0,>=3.0.11 (from spacy->column-classifier==0.1.0)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy->column-classifier==0.1.0)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy->column-classifier==0.1.0)
  Using cached murmurhash-1.0.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (2.0 kB

In [5]:
# Download the spaCy pipeline package en_core_web_trf.
# NOTE(review): `python` here is whichever interpreter is first on PATH, which
# may not be the kernel's — confirm the model lands in the kernel environment.
! python -m spacy download en_core_web_trf

Collecting en-core-web-trf==3.7.3
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.3/en_core_web_trf-3.7.3-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')


In [2]:
from column_classifier.column_classifier import ColumnClassifier

# Instantiate the classifier with its default arguments.
# NOTE(review): presumably this relies on the spaCy model downloaded in the
# earlier cell — confirm against the column-classifier documentation.
classifier = ColumnClassifier()