In [1]:
# Module 4: Data Collection - Case Study Notebook

import pandas as pd
import requests
from bs4 import BeautifulSoup
import sqlite3
import json
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

print("Welcome to the Data Collection Case Study!")

# 1. Introduction to Data Collection

print("\n1. Introduction to Data Collection")
print("In this notebook, we'll explore various data collection methods:")
print("- File-based sources (CSV, JSON)")
print("- Databases (SQLite)")
print("- APIs and Web Scraping")
print("- Data Manipulation with Pandas")

# 2. File-based Data Sources

print("\n2. File-based Data Sources")

# CSV: Using Iris dataset from sklearn
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)

# Save to CSV
iris_df.to_csv('iris_data.csv', index=False)

# Read from CSV
csv_data = pd.read_csv('iris_data.csv')
print("CSV data (first 5 rows):")
print(csv_data.head())

# JSON: Convert iris data to JSON format
iris_json = iris_df.to_json(orient='records')

# Save to JSON file
with open('iris_data.json', 'w') as f:
    json.dump(json.loads(iris_json), f)

# Read from JSON
with open('iris_data.json', 'r') as f:
    json_data = json.load(f)
print("\nJSON data (first 2 records):")
print(json.dumps(json_data[:2], indent=2))

# 3. Database Systems

print("\n3. Database Systems")

# SQLite
conn = sqlite3.connect('iris_database.db')
iris_df.to_sql('iris', conn, if_exists='replace', index=False)

# Query the database
query = "SELECT * FROM iris LIMIT 5"
sql_data = pd.read_sql_query(query, conn)
print("SQL data (first 5 rows):")
print(sql_data)

conn.close()

# 4. APIs and Web Scraping

print("\n4. APIs and Web Scraping")

# RESTful API: Using JSONPlaceholder (a free online REST API)
api_url = "https://jsonplaceholder.typicode.com/posts/1"
response = requests.get(api_url)
api_data = response.json()
print("API data:")
print(json.dumps(api_data, indent=2))

# Web Scraping: Scraping a quote from Quotes to Scrape
url = "http://quotes.toscrape.com"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
quote = soup.find('span', class_='text').get_text()
author = soup.find('small', class_='author').get_text()
print("\nWeb Scraping data:")
print(f"Quote: {quote}")
print(f"Author: {author}")

# 5. Data Manipulation with Pandas

print("\n5. Data Manipulation with Pandas")

# Load the Iris dataset
df = pd.read_csv('iris_data.csv')

# Display basic information about the dataset
print("\nDataset Info:")
print(df.info())

# Display summary statistics
print("\nSummary Statistics:")
print(df.describe())

# Group by species and calculate mean
species_mean = df.groupby('species').mean()
print("\nMean values for each species:")
print(species_mean)

# Create a new feature
df['sepal_area'] = df['sepal length (cm)'] * df['sepal width (cm)']

# Display correlation matrix
print("\nCorrelation Matrix:")
print(df.corr())

# Visualize the data
plt.figure(figsize=(10, 6))
df.boxplot(column=['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'], by='species')
plt.title('Iris Features by Species')
plt.suptitle('')
plt.tight_layout()
plt.show()

print("\nCase Study Completed!")
print("We've demonstrated various data collection and manipulation techniques using the Iris dataset.")
print("These skills form the foundation of the data science pipeline and can be applied to more complex datasets and problems.")

Welcome to the Data Collection Case Study!

1. Introduction to Data Collection
In this notebook, we'll explore various data collection methods:
- File-based sources (CSV, JSON)
- Databases (SQLite)
- APIs and Web Scraping
- Data Manipulation with Pandas

2. File-based Data Sources
CSV data (first 5 rows):
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

  species  
0  setosa  
1  setosa  
2  setosa  
3  setosa  
4  setosa  

JSON data (first 2 records):
[
  {
    "sepal length (cm)": 5.1,
    "sepal width (cm)": 3.5,
    "petal length (cm)": 1.4,
    "petal width (cm)

ValueError: could not convert string to float: 'setosa'