In [20]:
from dotenv import load_dotenv
import os
import duckdb
import pandas as pd

load_dotenv()

csv_path = os.getenv("DATA_CSV")
parquet_path = os.getenv("DATA_PARQUET")
sample_path = os.getenv("DATA_SAMPLE")

Converts large .csv file to .parquet and checks conversion. Outputs basic info on dataset.

In [6]:
if not os.path.exists(parquet_path):
    duckdb.query(f"""
    COPY (
        SELECT *
        FROM read_csv_auto(
            '{csv_path}',
            ignore_errors = true,
            sample_size = -1,
            all_varchar = true
        )
    ) TO '{parquet_path}' 
    (FORMAT PARQUET, OVERWRITE TRUE);
    """)
else:
    print(f"Parquet file already exists at {parquet_path}, skipping conversion.")

Parquet file already exists at D:\ruggbk\datasets\farmersdirect\b0cd514b-b9cc-4972-a0c2-c91726e6d825.parquet, skipping conversion.


In [7]:
# Count rows in original CSV
original_rows = duckdb.query(f"""
    SELECT COUNT(*) FROM read_csv_auto(
        '{csv_path}',
        ignore_errors = true,
        sample_size = -1
    )
""").fetchall()[0][0]


In [8]:
# Count rows in Parquet
parquet_rows = duckdb.query(f"""
    SELECT COUNT(*) FROM '{parquet_path}'
""").fetchall()[0][0]

print(f"Original rows (with ignored errors): {original_rows}")
print(f"Parquet rows: {parquet_rows}")

Original rows (with ignored errors): 20304843
Parquet rows: 20304843


In [9]:
unique_question_count = duckdb.query(f"""
    SELECT COUNT(DISTINCT question_content)
    FROM '{parquet_path}'
""").fetchall()[0][0]
print(f"# of Unique Questions: {unique_question_count}")

# of Unique Questions: 5452290


In [10]:
unique_rows_maincolumns = duckdb.query(f"""
    SELECT COUNT(DISTINCT ROW(question_id, question_content, question_topic,response_id, response_content))
    FROM '{parquet_path}'
""").fetchall()[0][0]
print(f"# of Unique Rows: {unique_rows_maincolumns}")

# of Unique Rows: 18854432


In [11]:
unique_response_count = duckdb.query(f"""
    SELECT COUNT(DISTINCT response_id)
    FROM '{parquet_path}'
""").fetchall()[0][0]
print('# of Unique Responses: ' + str(unique_response_count))

# of Unique Responses: 16283143


In [12]:
df = pd.read_parquet(parquet_path)
df.shape  # (rows, columns)
df.info()  # column types and non-null counts


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20304843 entries, 0 to 20304842
Data columns (total 24 columns):
 #   Column                      Dtype 
---  ------                      ----- 
 0   question_id                 object
 1   question_user_id            object
 2   question_language           object
 3   question_content            object
 4   question_topic              object
 5   question_sent               object
 6   response_id                 object
 7   response_user_id            object
 8   response_language           object
 9   response_content            object
 10  response_topic              object
 11  response_sent               object
 12  question_user_type          object
 13  question_user_status        object
 14  question_user_country_code  object
 15  question_user_gender        object
 16  question_user_dob           object
 17  question_user_created_at    object
 18  response_user_type          object
 19  response_user_status        object
 20  

In [13]:
df.head()

Unnamed: 0,question_id,question_user_id,question_language,question_content,question_topic,question_sent,response_id,response_user_id,response_language,response_content,...,question_user_country_code,question_user_gender,question_user_dob,question_user_created_at,response_user_type,response_user_status,response_user_country_code,response_user_gender,response_user_dob,response_user_created_at
0,4107104,574194,eng,Q How can i start with one hen which is a laye...,chicken,2017-12-08 19:04:04+00,4215434,574940,eng,Q1 they should answer as hurry as possible.,...,ug,,,2017-12-08 10:50:34+00,farmer,zombie,ug,,,2017-12-08 18:33:04+00
1,4107104,574194,eng,Q How can i start with one hen which is a laye...,chicken,2017-12-08 19:04:04+00,4207349,574940,eng,Q1 they should answer as hurry as possible.,...,ug,,,2017-12-08 10:50:34+00,farmer,zombie,ug,,,2017-12-08 18:33:04+00
2,4107104,574194,eng,Q How can i start with one hen which is a laye...,chicken,2017-12-08 19:04:04+00,4215309,574940,eng,Q1 like which types of traps?,...,ug,,,2017-12-08 10:50:34+00,farmer,zombie,ug,,,2017-12-08 18:33:04+00
3,4107104,574194,eng,Q How can i start with one hen which is a laye...,chicken,2017-12-08 19:04:04+00,4200211,574940,eng,"Q1 if it is well fed , it takes 1 mnth 2 lay e...",...,ug,,,2017-12-08 10:50:34+00,farmer,zombie,ug,,,2017-12-08 18:33:04+00
4,4107104,574194,eng,Q How can i start with one hen which is a laye...,chicken,2017-12-08 19:04:04+00,4205121,574940,eng,Q1 by digging.,...,ug,,,2017-12-08 10:50:34+00,farmer,zombie,ug,,,2017-12-08 18:33:04+00


In [23]:
# Saves random sample of 1 million English questions / responses in DuckDB
if not os.path.exists(sample_path):
    duckdb.query(f"""
        COPY (
            SELECT *
            FROM read_parquet('{parquet_path}')
            WHERE question_language = 'eng'
              AND response_language = 'eng'
              AND response_content IS NOT NULL
            ORDER BY RANDOM()
            LIMIT 1000000
        ) TO '{sample_path}' (FORMAT PARQUET, OVERWRITE TRUE);
    """)
    print(f"Sample saved to {sample_path}")
else:
    print(f"Sample file already exists at {sample_path}, skipping sampling.")


Sample file already exists at D:\ruggbk\datasets\farmersdirect\farmers_1mil_eng_sample.parquet, skipping sampling.


In [None]:
# check sampling
pd.read_parquet(sample_path)

Unnamed: 0,question_id,question_user_id,question_language,question_content,question_topic,question_sent,response_id,response_user_id,response_language,response_content,...,question_user_country_code,question_user_gender,question_user_dob,question_user_created_at,response_user_type,response_user_status,response_user_country_code,response_user_gender,response_user_dob,response_user_created_at
0,50871840,2942156,eng,Q Why Wefarm Doesn't Prepare Our Promotion?,,2020-12-02 17:00:20.667286+00,50872828,269315,eng,Q451 it is the work of the org committee,...,ug,,,2020-03-06 13:36:15.449684+00,farmer,live,ug,,,2017-07-20 11:24:28+00
1,24215668,1004601,eng,what is mixed farming,,2019-04-25 15:07:44.78925+00,24222344,225067,eng,Q 418 Refers to growing of various crops on sa...,...,ug,,,2018-07-20 14:24:34.186844+00,farmer,live,ug,male,1984-10-28,2017-06-12 17:02:07+00
2,34970005,2697773,eng,Q to 6333 wc diseases that attack maize plant?,plant,2019-12-03 07:19:06.001165+00,34970061,1844517,eng,I don't no,...,ug,,,2019-11-25 15:12:45.486055+00,farmer,live,ug,,,2019-01-23 12:54:10.444483+00
3,47631987,963431,eng,What type of maize is good plant,maize,2020-09-30 12:14:43.623567+00,47632870,3249306,eng,"if for selling,popckons but if for food consu...",...,ug,,,2018-07-04 06:58:43.074057+00,farmer,live,ug,,,2020-07-26 16:23:11.415517+00
4,56862473,3474362,eng,Q what are the seedling,,2021-05-26 19:02:20.503138+00,56888364,3327551,eng,Q22 seedling are grown plants in seedbed for t...,...,ke,,,2020-11-27 08:44:57.419151+00,farmer,live,ke,,,2020-09-17 17:53:24.140724+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,5338768,359662,eng,How will you know that your cow is on heat,cattle,2018-03-24 16:00:30.163638+00,5345552,389259,eng,Q16 the cow mounts other cows,...,ke,,,2017-09-14 17:17:13+00,farmer,zombie,ke,,,2017-09-26 04:29:58+00
999996,29654231,2361244,eng,Which pest that attack the root of tea and how...,tea,2019-08-08 18:18:55.308216+00,29671088,1359668,eng,"Q99;Red ants, control used THUNDER chemical.",...,ke,,,2019-08-08 17:30:38.90231+00,farmer,live,ke,,,2018-10-18 17:42:49.166605+00
999997,7105245,938846,eng,Which Chemical Is Best In Weeding Maize?,maize,2018-06-26 15:38:23.882541+00,7109077,539166,eng,Q 105 dual gold,...,ke,,,2018-06-26 04:23:44.463892+00,farmer,live,ke,,,2017-11-27 06:38:06+00
999998,22633493,1968918,eng,Q HOW DO YOU DO AWAY WITH ACCUMULATED WASTES L...,cattle,2019-03-19 16:13:04.663386+00,22633848,1846683,eng,q11 deworm,...,ke,,,2019-03-19 15:58:37.464951+00,farmer,zombie,ke,,,2019-01-23 17:55:02.988204+00
