In [19]:
import pandas as pd
import json

In [20]:
# First, read the raw lines of a JSON Lines file (the inputs are named '*.jsonl.gz')

def extract_jsonl(file_name):
    """Read a JSON Lines file and return its raw lines.

    Both plain-text files and gzip-compressed files are supported.
    Compression is detected from the file's leading magic bytes rather than
    from its name, so a plain-text file that merely carries a ``.gz`` suffix
    is still read as text.

    Parameters
    ----------
    file_name : str
        Path to the JSON Lines file (plain or gzip-compressed).

    Returns
    -------
    list of str
        One entry per line, with the trailing newline characters kept.
    """
    import gzip  # local import keeps this cell self-contained

    # Every gzip stream starts with the two bytes 0x1f 0x8b.
    with open(file_name, 'rb') as raw:
        is_gzip = raw.read(2) == b'\x1f\x8b'

    opener = gzip.open if is_gzip else open
    # JSON Lines is UTF-8 by specification; do not rely on the platform default.
    with opener(file_name, 'rt', encoding='utf-8') as f:
        return f.readlines()

In [21]:
# Now convert the jsonl file to a csv file

def jsonl_to_csv(data, csv_file):
    """Parse JSON Lines text into a DataFrame and save it as CSV.

    Parameters
    ----------
    data : iterable of str
        Lines of text, each holding one JSON document. Empty or
        whitespace-only lines (e.g. a trailing blank line) are skipped
        instead of raising ``json.JSONDecodeError``.
    csv_file : str
        Destination path of the CSV file; written without the index column.

    Returns
    -------
    pandas.DataFrame
        The frame built from the parsed records (the same data that was
        written to ``csv_file``).
    """
    records = [json.loads(line) for line in data if line.strip()]
    df = pd.DataFrame(records)
    df.to_csv(csv_file, index=False)
    return df

In [22]:
# The inputs are named 'dataset1.jsonl.gz' ... 'dataset9.jsonl.gz'; each one is
# converted to the matching 'dataset1.csv' ... 'dataset9.csv'.

for i in range(1, 10):
    source_path = f'dataset{i}.jsonl.gz'
    target_path = f'dataset{i}.csv'
    data = extract_jsonl(source_path)
    jsonl_to_csv(data, target_path)
    print(f'dataset{i}.csv created')

dataset1.csv created
dataset2.csv created
dataset3.csv created
dataset4.csv created
dataset5.csv created
dataset6.csv created
dataset7.csv created
dataset8.csv created
dataset9.csv created


In [24]:
# General helper: clean the file 'file.csv' and save the result as 'file_clean.csv'

def clean_csv(file):
    """Drop rows with an empty description and save the result as ``*_clean.csv``.

    Rows whose ``description`` value is the literal string ``'[]'`` are
    removed. The filtered frame is written, with all of its columns, to a
    sibling file whose name has ``_clean`` inserted before the trailing
    ``.csv`` suffix.

    Only the trailing suffix is rewritten: a ``.csv`` fragment elsewhere in
    the path (such as in a directory name) is left alone. A path that does
    not end in ``.csv`` gets ``_clean.csv`` appended, so the input file is
    never overwritten.

    Parameters
    ----------
    file : str
        Path of the CSV file to clean.

    Returns
    -------
    pandas.DataFrame
        The filtered rows restricted to the columns ``main_category``,
        ``title``, ``features``, ``description``, ``images`` and ``details``.

    Raises
    ------
    KeyError
        If any of the columns named above is missing from the input.
    """
    suffix = '.csv'
    if file.endswith(suffix):
        clean_path = file[:-len(suffix)] + '_clean' + suffix
    else:
        clean_path = file + '_clean' + suffix

    df = pd.read_csv(file)
    df = df[df['description'] != '[]']
    # The file on disk intentionally keeps every column; only the returned
    # frame is narrowed to the six columns of interest.
    df.to_csv(clean_path, index=False)
    return df[['main_category', 'title', 'features', 'description', 'images', 'details']]

In [25]:
# Run the cleaning helper over each of the nine converted CSV files

for i in range(1, 10):
    raw_csv = f'dataset{i}.csv'
    df = clean_csv(raw_csv)
    print(f'dataset{i}_clean.csv created')

dataset1_clean.csv created


  df = pd.read_csv(file)


dataset2_clean.csv created
dataset3_clean.csv created
dataset4_clean.csv created
dataset5_clean.csv created
dataset6_clean.csv created
dataset7_clean.csv created
dataset8_clean.csv created
dataset9_clean.csv created


In [26]:
# Merge all the cleaned files into one CSV file with the rows in shuffled order.
# A fixed seed makes the shuffle reproducible, so re-running the notebook
# regenerates exactly the same 'merged_dataset.csv'.

SHUFFLE_SEED = 42

df = pd.concat([pd.read_csv(f'dataset{i}_clean.csv') for i in range(1, 10)])
# frac=1 samples every row, i.e. a full permutation; the old index is discarded.
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
df.to_csv('merged_dataset.csv', index=False)
print('merged_dataset.csv created')

merged_dataset.csv created


In [27]:
# Keep just the six columns of interest and save the narrowed frame

columns_to_keep = ['main_category', 'title', 'features', 'description', 'images', 'details']
df = df[columns_to_keep]
df.to_csv('dataset.csv', index=False)
print('dataset.csv created')

dataset.csv created


In [28]:
# Report, column by column, how many entries are null / missing

for col in df.columns:
    missing_count = df[col].isna().sum()
    print(f'{col}: {missing_count} missing values')

main_category: 4677 missing values
title: 23 missing values
features: 0 missing values
description: 0 missing values
images: 0 missing values
details: 0 missing values


In [31]:
# Only main_category and title contain missing values (see the counts above),
# so every row that has a missing value in any column is dropped.

df = df.dropna(axis=0, how='any')
df.to_csv('dataset_final.csv', index=False)
print('dataset_final.csv created')

dataset_final.csv created


In [32]:
# Some rows hold the literal string '[]' in the features column; drop those too

has_features = df['features'] != '[]'
df = df[has_features]
df.to_csv('dataset_final_cleaned.csv', index=False)
print('dataset_final_cleaned.csv created')

dataset_final_cleaned.csv created
