# Task 1: Import an XML file using python

### xml file

In [None]:
<?xml version="1.0" encoding="UTF-8"?>
<catalog>
   <book id="b00k-101">
      <author>Gambardella, Matthew</author>
      <title>XML Developer's Guide</title>
      <genre>Computer</genre>
      <price>49.49</price>
      <publish_date>2000-10-01</publish_date>
      <description>An in-depth look at creating applications 
      with XML.</description>
   </book>
   <book id="b00k-102">
      <author>Ralls, Kim</author>
      <title>Midnight Rain</title>
      <genre>Fantasy</genre>
      <price>55.55</price>
      <publish_date>2000-12-16</publish_date>
      <description>A former architect battles corporate zombies, 
      an evil sorceress, and her own childhood to become queen 
      of the world.</description>
   </book>
</catalog>

### code

In [10]:
import xml.etree.ElementTree as ET

# parse XML file
tree = ET.parse('book.xml')
root = tree.getroot()

# access all elements and attributes
for child in root:
    print(child.tag, child.attrib)
    for subchild in child:
        print(subchild.tag, subchild.text)

book {'id': 'b00k-101'}
author Gambardella, Matthew
title XML Developer's Guide
genre Computer
price 49.49
publish_date 2000-10-01
description An in-depth look at creating applications 
      with XML.
book {'id': 'b00k-102'}
author Ralls, Kim
title Midnight Rain
genre Fantasy
price 55.55
publish_date 2000-12-16
description A former architect battles corporate zombies, 
      an evil sorceress, and her own childhood to become queen 
      of the world.


# Task 2: Import a JSON file and analyze how different parts of JSON file can be parsed according to the business use case

### JSON file

In [None]:
{
    "header": {
      "date": "2023-05-01",
      "source": "Top Inc",
      "destination": "Sister Corp"
    },
    "orders": [
      {
        "id": 1,
        "product": "sample A",
        "quantity": 10,
        "price": 100
      },
      {
        "id": 2,
        "product": "sample B",
        "quantity": 5,
        "price": 50
      },
      {
        "id": 3,
        "product": "sample C",
        "quantity": 3,
        "price": 75
      }
    ],
    "summary": {
      "total_quantity": 18,
      "total_price": 1750
    }
  }

### code

In [15]:
import json

# loading JSON file
with open('data.json') as f:
    data = json.load(f)

# Geting all header data
print("Header:")
for key, value in data['header'].items():
    print(f"{key}:", value)
print()

# Printing all orders in loop
print("Orders:")
for order in data['orders']:
    print("ID:", order['id'])
    print("Product:", order['product'])
    print("Quantity:", order['quantity'])
    print("Price:", order['price'])
    print()

# Accessing summary data
print("Summary:")
print("Total Quantity:", data['summary']['total_quantity'])
print("Total Price:", data['summary']['total_price'])


Header:
date: 2023-05-01
source: Top Inc
destination: Sister Corp

Orders:
ID: 1
Product: sample A
Quantity: 10
Price: 100

ID: 2
Product: sample B
Quantity: 5
Price: 50

ID: 3
Product: sample C
Quantity: 3
Price: 75

Summary:
Total Quantity: 18
Total Price: 1750


# Task 3: Import the breast cancer dataset from sklearn library and attach the target variable data to the features data and store it in a JSON file

In [46]:
from sklearn.datasets import load_breast_cancer
import json
import pandas as pd

# Loading the breast cancer dataset from sklearn datasets
data = load_breast_cancer()

# Extracting the features and target
features = data.data.tolist()
target = data.target.tolist()

# Attaching the target variable to the feature data
for i in range(len(features)):
    features[i].append(target[i])

# Convert the data to a list of dictionaries
data_list = []
for row in features:
    d = {}
    for i in range(len(data['feature_names'])):
        d[data['feature_names'][i]] = row[i]
    d['target'] = row[-1]
    data_list.append(d)

# Soring the data in a JSON file
with open('breast_cancer_data.json', 'w') as f:
    json.dump(data_list, f)

### checking json file

In [51]:
df = pd.read_json('breast_cancer_data.json')
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


# Task 4: Make a regression dataset (700) with 7 features while having 4 informative features and store them on disk in a csv file

In [48]:
import csv
from faker import Faker
import random

fake = Faker()

# Define the headers for the CSV file
headers = ['Price', 'Cost', 'Quantity', 'Category', 'Feature 5', 'Feature 6', 'Feature 7', 'Target']

# Generate the dataset
data = []
for i in range(700):
    row = []
    for j in range(7):
        if j < 4:  # First 4 features are informative
            if j == 0:
                row.append(random.gauss(50, 5))
            elif j == 1:
                row.append(random.uniform(0, 10) + 3 * i/700)  # Generate data using linear relationship
            elif j == 2:
                row.append(random.randint(1, 10))
            else:
                row.append(random.choice(['Shoe', 'Belt', 'Purse']))
        else:  # Last 3 features are non-informative
            row.append(fake.pyfloat(left_digits=3, right_digits=2, positive=True))
    target = row[0] + row[1] * 7 + 0.55 * row[2]  # Calculate target variable using a linear combination of features
    row.append(target)
    data.append(row)

# Write the dataset to a CSV file
with open('regression_dataset.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(headers)
    writer.writerows(data)

### checking CSV file

In [50]:
pd.read_csv('regression_dataset.csv')

Unnamed: 0,Price,Cost,Quantity,Category,Feature 5,Feature 6,Feature 7,Target
0,41.955012,8.078251,1,Belt,114.99,734.45,910.13,99.052767
1,53.236705,8.323859,10,Shoe,546.84,737.88,654.10,117.003720
2,57.061849,6.667904,10,Shoe,257.23,756.37,512.00,109.237179
3,53.825041,2.160640,1,Belt,711.20,472.92,3.57,69.499525
4,47.095736,3.369250,4,Shoe,19.45,496.61,19.23,72.880487
...,...,...,...,...,...,...,...,...
695,58.062520,9.852828,2,Belt,252.29,239.85,919.97,128.132313
696,50.799760,11.791180,2,Belt,661.23,630.48,792.89,134.438018
697,45.636042,6.586275,8,Shoe,718.81,996.61,263.47,96.139967
698,60.911651,10.634996,2,Shoe,787.95,325.61,751.14,136.456620
