**Title**: Data Wrangling 9.2 Exercises  
**Author**: Ryan Weeks  
**Date**: 2/8/2025  
**Description**:  These exercises focus on working with various data formats in Python, including NumPy arrays, CSV, JSON, Excel, and web scraping with BeautifulSoup. They involve reading, writing, transforming, and extracting meaningful information from structured data sources.

In [3]:
import numpy as np
import pandas as pd

# Set the seed for reproducibility
np.random.seed(30)

# Generate a 3x4 NumPy array with random values
array = np.random.rand(3, 4)

# Save the array as a CSV file
np.savetxt("np.csv", array, delimiter=",")

# Read the CSV file into a DataFrame
df = pd.read_csv("np.csv", header=None)

# Print the DataFrame
print(df)

# Write the DataFrame to a new CSV file
df.to_csv("df_output.csv", index=False, header=False)

          0         1         2         3
0  0.644144  0.380748  0.663048  0.163651
1  0.962608  0.346662  0.991751  0.235058
2  0.585694  0.406690  0.136234  0.544136


In [5]:
import os

# Step 1: Generate a 365x4 NumPy array with random values
np.random.seed(30)
array = np.random.rand(365, 4)

# Step 2: Store the array in a CSV file
csv_filename = "random_data.csv"
np.savetxt(csv_filename, array, delimiter=",")

# Step 3: Check the size of the CSV file
csv_size = os.path.getsize(csv_filename)
print(f"Size of CSV file: {csv_size} bytes")

# Step 4: Save the array in NumPy format
npy_filename = "random_data.npy"
np.save(npy_filename, array)

# Step 5: Load the array back from the NumPy file
loaded_array = np.load(npy_filename)

# Step 6: Check the shape of the loaded array
print(f"Shape of loaded array: {loaded_array.shape}")

# Step 7: Check the size of the NumPy file
npy_size = os.path.getsize(npy_filename)
print(f"Size of NumPy file: {npy_size} bytes")

# Step 8: Create a DataFrame from the array
df = pd.DataFrame(loaded_array)

# Step 9: Save the DataFrame to a pickle file
pickle_filename = "random_data.pkl"
df.to_pickle(pickle_filename)

# Step 10: Retrieve the DataFrame from the pickle
loaded_df = pd.read_pickle(pickle_filename)

# Step 11: Print the size of the pickle file
pickle_size = os.path.getsize(pickle_filename)
print(f"Size of Pickle file: {pickle_size} bytes")

Size of CSV file: 36865 bytes
Shape of loaded array: (365, 4)
Size of NumPy file: 11808 bytes
Size of Pickle file: 12239 bytes


In [11]:
# Step 1: Save the array to an Excel file
excel_filename = "random_data.xlsx"
df = pd.DataFrame(array)  # Convert array to DataFrame
df.to_excel(excel_filename, index=False, header=False)

# Step 2: Load the Excel file back into a DataFrame
df_loaded = pd.read_excel(excel_filename, header=None)

# Step 3: Print the DataFrame
print(df_loaded)

            0         1         2         3
0    0.644144  0.380748  0.663048  0.163651
1    0.962608  0.346662  0.991751  0.235058
2    0.585694  0.406690  0.136234  0.544136
3    0.518176  0.766855  0.933850  0.089703
4    0.195771  0.994194  0.235180  0.238986
..        ...       ...       ...       ...
360  0.002356  0.603345  0.960803  0.227310
361  0.959882  0.363940  0.208686  0.289246
362  0.373129  0.805238  0.812730  0.625285
363  0.756582  0.562158  0.589113  0.060197
364  0.441393  0.424427  0.816909  0.172645

[365 rows x 4 columns]


In [17]:
import json

# JSON string
json_string = '{"country":"Netherlands","dma_code":"0","timezone":"Europe/Amsterdam","area_code":"0","ip":"46.19.37.108","asn":"AS196752","continent_code":"EU","isp":"Tilaa V.O.F.","longitude":5.75,"latitude":52.5,"country_code":"NL","country_code3":"NLD"}'

# Step 1: Parse the JSON string using json.loads()
data = json.loads(json_string)

# Step 2: Print the value for the "country" column
print(f"Original Country: {data['country']}")

# Step 3: Overwrite the value of "country"
data["country"] = "Wakanda"

# Step 4: Print the updated results
print(f"Updated Country: {data['country']}")

Original Country: Netherlands
Updated Country: Wakanda


In [19]:
from io import StringIO

# Step 1: Use StringIO to read the JSON string as a file-like object
json_io = StringIO(f'[{json_string}]')  # Wrap the string in an array for Series

# Step 2: Create a Pandas Series from the JSON string
series = pd.read_json(json_io).squeeze()  # Convert JSON string to a Series

# Step 3: Change the country value to your choice
series['country'] = 'Jabooty' 

# Step 4: Convert the updated Series back to a JSON string
updated_json_string = series.to_json()

# Step 5: Print the updated JSON string
print(updated_json_string)

{"country":"Jabooty","dma_code":0,"timezone":"Europe\/Amsterdam","area_code":0,"ip":"46.19.37.108","asn":"AS196752","continent_code":"EU","isp":"Tilaa V.O.F.","longitude":5.75,"latitude":52.5,"country_code":"NL","country_code3":"NLD"}


In [45]:
from bs4 import BeautifulSoup
import re

# Parse the HTML
soup = BeautifulSoup(open('loremIpsum.html'))

# Printing the first <div> tag
print("First div\n", soup.div)

First div
 <div class="tile">
<h4>Development</h4>
     0.10.1 - July 2014<br/>
</div>


In [49]:
# Printing the class attribute of the first <div> tag
print("First div class", soup.div['class'])

First div class ['tile']


In [51]:
# Printing the text of the first <dfn> tag
print("First dfn text", soup.dl.dt.dfn.text)

First dfn text Quare attende, quaeso.


In [53]:
for link in soup.find_all('a'):
    print("Link text", link.string, "URL", link.get('href'))

Link text loripsum.net URL http://loripsum.net/
Link text Poterat autem inpune; URL http://loripsum.net/
Link text Is es profecto tu. URL http://loripsum.net/


In [55]:
for i, div in enumerate(soup('div')):
    print(i, div.contents)

0 ['\n', <h4>Development</h4>, '\n     0.10.1 - July 2014', <br/>, '\n']
1 ['\n', <h4>Official Release</h4>, '\n     0.10.0 June 2014', <br/>, '\n']
2 ['\n', <h4>Previous Release</h4>, '\n     0.09.1 June 2013', <br/>, '\n']


In [57]:
official_div = soup.find_all("div", id="official")
print("Official Version", official_div[0].contents[2].strip())

Official Version 0.10.0 June 2014


In [59]:
# Printing number of elements that have any class attribute
print("# elements with class",len(soup.find_all(class_=True)))

# elements with class 3


In [65]:
tile_class = soup.find_all("div", class_="tile")
print("# Tile classes", len(tile_class))

# Tile classes 2


In [67]:
print("# Divs with class containing tile", len(soup.find_all("div", class_=re.compile("tile"))))

# Divs with class containing tile 3


In [69]:
print("Using CSS selector\n", soup.select('div.notile'))

Using CSS selector
 [<div class="notile">
<h4>Previous Release</h4>
     0.09.1 June 2013<br/>
</div>]


In [73]:
print("Selecting ordered list list items\n", soup.select("ol > li")[:2])

Selecting ordered list list items
 [<li>Cur id non ita fit?</li>, <li>In qua si nihil est praeter rationem, sit in una virtute finis bonorum;</li>]


In [75]:
print("Second list item in ordered list", soup.select("ol>li:nth-of-type(2)"))

Second list item in ordered list [<li>In qua si nihil est praeter rationem, sit in una virtute finis bonorum;</li>]


In [79]:
print("Searching for text string", soup.find_all(string=re.compile("2014")))

Searching for text string ['\n     0.10.1 - July 2014', '\n     0.10.0 June 2014']
