# Downloading datasets and converting them to CSV

In [None]:
import requests
import csv
import json

def download_json(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return its decoded JSON body.

    Args:
        url: Address of the JSON document to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the call forever.

    Returns:
        The parsed JSON payload, or ``None`` when the request fails, the
        server answers with a status other than 200, or the body is not
        valid JSON. A message is printed in each failure case.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts follow the same "print and return
        # None" path as a bad status code instead of raising.
        print(f"Failed to download JSON. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to download JSON. Status Code: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError as exc:
        # JSON decoding errors are ValueError subclasses.
        print(f"Failed to decode JSON. Error: {exc}")
        return None

def convert_json_to_csv(json_data, csv_file):
    """Write a list of JSON objects to ``csv_file``, one row per object.

    The header is the union of the keys of all records, in first-seen
    order, so a record carrying a key the first record lacks no longer
    makes ``DictWriter`` raise. Keys missing from a record are written as
    empty cells (the ``DictWriter`` default).

    Args:
        json_data: Sequence of dicts to serialise. An empty sequence
            produces an empty file instead of raising ``IndexError``.
        csv_file: Path of the CSV file to create or overwrite.
    """
    # dict.fromkeys de-duplicates while preserving insertion order.
    fieldnames = list(
        dict.fromkeys(key for record in json_data for key in record)
    )

    # Explicit UTF-8 keeps the output independent of the platform default.
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        if not fieldnames:
            # Nothing to describe: leave the file empty rather than
            # emitting a blank header line.
            return

        csv_writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        csv_writer.writeheader()
        csv_writer.writerows(json_data)

if __name__ == "__main__":
    # (source URL, destination CSV) pairs for the LaMP_2 question files,
    # handled in order: the train file first, then the dev file.
    question_files = [
        ('https://ciir.cs.umass.edu/downloads/LaMP/LaMP_2/train/train_questions.json',
         'usertraininput.csv'),
        ('https://ciir.cs.umass.edu/downloads/LaMP/LaMP_2/dev/dev_questions.json',
         'uservalinput.csv'),
    ]

    for json_url, csv_file_name in question_files:
        json_data = download_json(json_url)

        # A failed (or empty) download is skipped; the next file is still tried.
        if not json_data:
            continue

        convert_json_to_csv(json_data, csv_file_name)
        print(f"Conversion successful. CSV file saved as '{csv_file_name}'.")


In [None]:
import requests
import csv
import json

def download_json(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return its decoded JSON body.

    Args:
        url: Address of the JSON document to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the call forever.

    Returns:
        The parsed JSON payload, or ``None`` when the request fails, the
        server answers with a status other than 200, or the body is not
        valid JSON. A message is printed in each failure case.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts follow the same "print and return
        # None" path as a bad status code instead of raising.
        print(f"Failed to download JSON. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to download JSON. Status Code: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError as exc:
        # JSON decoding errors are ValueError subclasses.
        print(f"Failed to decode JSON. Error: {exc}")
        return None

def convert_json_to_csv(json_data, csv_file):
    """Write the records under ``json_data["golds"]`` to ``csv_file``.

    The file gets an ``id,output`` header followed by one row per record.

    Args:
        json_data: Mapping whose ``"golds"`` entry is a list of dicts, each
            with ``"id"`` and ``"output"`` keys. A missing ``"golds"`` entry
            is treated as an empty list, yielding a header-only file.
        csv_file: Path of the CSV file to create or overwrite.

    Raises:
        KeyError: If a record lacks ``"id"`` or ``"output"``.
    """
    golds_data = json_data.get("golds", [])

    # Explicit UTF-8 keeps the output independent of the platform default.
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(["id", "output"])
        csv_writer.writerows(
            [record["id"], record["output"]] for record in golds_data
        )

if __name__ == "__main__":
    # (source URL, destination CSV) pairs for the LaMP_2 output files,
    # handled in order: the train file first, then the dev file.
    output_files = [
        ('https://ciir.cs.umass.edu/downloads/LaMP/LaMP_2/train/train_outputs.json',
         'usertrainoutput.csv'),
        ('https://ciir.cs.umass.edu/downloads/LaMP/LaMP_2/dev/dev_outputs.json',
         'uservaloutput.csv'),
    ]

    for json_url, csv_file_name in output_files:
        json_data = download_json(json_url)

        # A failed (or empty) download is skipped; the next file is still tried.
        if not json_data:
            continue

        convert_json_to_csv(json_data, csv_file_name)
        print(f"Conversion successful. CSV file saved as '{csv_file_name}'.")


# Preprocessing the News Category Dataset

In [None]:
import pandas as pd
from io import StringIO

# Path to the News Category dataset (read below as one JSON record per line)
json_file_path = 'News_Category_Dataset_v3.json'

# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default text encoding.
with open(json_file_path, 'r', encoding='utf-8') as file:
    # lines=True: every line of the file is parsed as a separate JSON object
    df = pd.read_json(file, lines=True)

# Display the raw DataFrame
print(df)

# Lower-case the column names.
# NOTE: the earlier df.rename(columns={'SCIENCE': ..., 'ARTS & CULTURE': ...})
# was removed: those strings are category *values*, not column names, so the
# call never changed anything. The value remapping happens via .replace below.
df.columns = df.columns.str.lower()

# Display the DataFrame after renaming columns
print("\nDataFrame after renaming columns:")
print(df)

# Lower-case the category labels, then merge labels that should count as
# the same category.
df['category'] = df['category'].str.lower()
df['category'] = df['category'].replace({
    'science': 'science & technology',
    'tech': 'science & technology',
    'arts & culture': 'culture & arts',
})

# Categories to keep
desired_categories = ['business', 'style & beauty', 'entertainment', 'politics', 'sports', 'religion',
                       'food & drink', 'travel', 'women', 'healthy living', 'education',
                       'culture & arts', 'science & technology', 'parents', 'crime']

# Keep only the rows whose category is in the desired list
filtered_df = df[df['category'].isin(desired_categories)]

# Display the filtered DataFrame
print(filtered_df)

# Rename the text columns: headline -> title, short_description -> text
new_column_names = {'headline': 'title', 'short_description': 'text'}
filtered_df = filtered_df.rename(columns=new_column_names)

# The DataFrame index is written as the first CSV column (to_csv default)
filtered_df.to_csv('modified_news.csv')

# Preprocessing the Books Dataset

In [None]:
# Load the books table and keep only the two columns used below.
books = pd.read_csv('BooksDataset/dataset.csv')

# .copy() makes new_books an independent frame, so adding a column to it is
# not a write to a slice of `books` (avoids SettingWithCopyWarning).
new_books = books[['description', 'categories']].copy()

# Take the first comma-separated entry of `categories` (None when the cell
# is not a string), then strip leading/trailing '[' and ']' characters.
new_books['category'] = (
    new_books['categories']
    .apply(lambda x: x.split(',')[0] if isinstance(x, str) else None)
    .str.strip('[')
    .str.strip(']')
)

category = pd.read_csv('BooksDataset/categories.csv')

# Cast the ids to str so they compare equal to the string keys extracted above.
category['category_id'] = category['category_id'].astype(str)

# Inner join: books whose first category id has no match in `category` are dropped.
merged_df = pd.merge(new_books, category, left_on='category', right_on='category_id', how='inner')


# Source category names grouped by the target label they are collapsed into
# (three source categories per target label).
_category_groups = {
    "Women": [
        "Pregnancy",
        "Women's Studies",
        "Health & Wholefood Cookery",
    ],
    "Religion": [
        "Bible Studies: For Individual Or Small Group Study",
        "Buddhist Books",
        "Religious Buildings",
    ],
    "Politics": [
        "Political Science & Theory",
        "Democracy",
        "Political Cartoons",
    ],
    "Style & Beauty": [
        "Fashion & Beauty Industries",
        "Hairdressing & Salon Skills",
        "Fashion Photography",
    ],
    "Entertainment": [
        "Film Studies",
        "Music History",
        "Theater",
    ],
    "Culture & Arts": [
        "Art History",
        "Music Theory",
        "Museums",
    ],
    "Sports": [
        "Sports Training Manuals",
        "Sports History",
        "Sports Photography",
    ],
    "Science & Technology": [
        "Physics Books",
        "Popular Science",
        "Science Museums",
    ],
    "Travel": [
        "Travel Guides",
        "Travel Blogs",
        "Travel Photography",
    ],
    "Business": [
        "Business Management Books",
        "Investment Guides",
        "Marketing Textbooks",
    ],
    "Crime": [
        "True Crime Books",
        "Criminal Psychology",
        "Crime Documentaries",
    ],
    "Education": [
        "Education Textbooks",
        "Teaching Guides",
        "Schools",
    ],
    "Healthy Living": [
        "Nutrition Books",
        "Exercise Guides",
        "Fitness Centers",
    ],
    "Parents": [
        "Parenting Books",
        "Child Development Guides",
        "Children's Books",
    ],
    "Food & Drink": [
        "Cookbooks",
        "Food History",
        "Restaurants",
    ],
}

# Flatten the groups into a {source category name: target label} lookup,
# keeping the order in which the groups and their members are listed.
category_mapping = {
    source_name: target_label
    for target_label, source_names in _category_groups.items()
    for source_name in source_names
}


# Collapse the source category names into the target labels; names that
# are not keys of category_mapping are left unchanged.
merged_df['category_name'] = merged_df['category_name'].replace(category_mapping)

# Categories to keep (rows with any other category_name are dropped)
categories_to_filter = [
    'Women', 'Religion', 'Politics', 'Style & Beauty', 'Entertainment',
    'Culture & Arts', 'Sports', 'Science & Technology', 'Travel', 'Business',
    'Crime', 'Education', 'Healthy Living', 'Parents', 'Food & Drink'
]

# Keep only the rows whose category is in the list above
filtered_df = merged_df[merged_df['category_name'].isin(categories_to_filter)]

# Drop rows without a description. .copy() makes the result an independent
# frame so the column assignment below is not a write to a slice of
# merged_df (avoids SettingWithCopyWarning).
filtered_df = filtered_df.dropna(subset=['description']).copy()

# Lower-case the category labels
filtered_df['category_name'] = filtered_df['category_name'].str.lower()

# The DataFrame index is written as the first CSV column (to_csv default)
filtered_df.to_csv('booksdatasetmodified.csv')
