# Python DS (Data Science)
# Siva Jasthi, Ph.D.
# Chief Instructor
# www.learnandhelp.com

- Writing to different file formats
- Reading from different file formats
- Exploring the file formats
- Data Sources
-- Asking the user for input
-- Generating the data within a Python program
-- Reading the data from a file
-- Fetching the data from an API
-- Scraping the web
-- Fetching the data from a database



# Student and StudentCollection

In [None]:
#@title Student class
class Student:
    """A student record holding a name, an email address and marks.

    Attributes:
        name: The student's name.
        email: The student's email address.
        marks: The marks awarded to the student.
    """

    def __init__(self, name, email, marks):
        """Store the three fields unchanged on the instance."""
        self.name = name
        self.email = email
        self.marks = marks

    def __str__(self):
        """Return a one-line, human-readable summary of the student."""
        return f"Name: {self.name}, Email: {self.email}, Marks: {self.marks}"

    def __repr__(self):
        """Return an unambiguous, constructor-style representation."""
        # !r quotes string fields, so the text reads as a valid constructor
        # call and marks="85" is distinguishable from marks=85.
        return (
            f"{type(self).__name__}(name={self.name!r}, "
            f"email={self.email!r}, marks={self.marks!r})"
        )


In [None]:
#@title Students Collection
# Reference: https://homepage.net/name_generator/

# Creating 10 student objects from (name, email, marks) rows
students = [
    Student(name, email, marks)
    for name, email, marks in (
        ("John Doe", "john.doe@example.com", 85),
        ("Jane Smith", "jane.smith@example.com", 90),
        ("Sam Wilson", "sam.wilson@example.com", 88),
        ("Alice Johnson", "alice.johnson@example.com", 92),
        ("Bob Brown", "bob.brown@example.com", 75),
        ("Chris Evans", "chris.evans@example.com", 89),
        ("Diana Prince", "diana.prince@example.com", 95),
        ("Peter Parker", "peter.parker@example.com", 87),
        ("Bruce Wayne", "bruce.wayne@example.com", 93),
        ("Clark Kent", "clark.kent@example.com", 91),
    )
]

# Printing the student objects, one per line (uses Student.__str__)
for student in students:
    print(student)


Name: John Doe, Email: john.doe@example.com, Marks: 85
Name: Jane Smith, Email: jane.smith@example.com, Marks: 90
Name: Sam Wilson, Email: sam.wilson@example.com, Marks: 88
Name: Alice Johnson, Email: alice.johnson@example.com, Marks: 92
Name: Bob Brown, Email: bob.brown@example.com, Marks: 75
Name: Chris Evans, Email: chris.evans@example.com, Marks: 89
Name: Diana Prince, Email: diana.prince@example.com, Marks: 95
Name: Peter Parker, Email: peter.parker@example.com, Marks: 87
Name: Bruce Wayne, Email: bruce.wayne@example.com, Marks: 93
Name: Clark Kent, Email: clark.kent@example.com, Marks: 91


# Writing Data to different formats

In [None]:
#@title 1.CSV format
import csv

def students_to_csv(students, filename='students.csv'):
    """Export students to a CSV file with a Name/Email/Marks header row.

    Args:
        students: Iterable of objects exposing ``name``, ``email`` and
            ``marks`` attributes.
        filename: Destination path; the file is opened in write mode, so
            existing content is replaced.
    """
    header = ['Name', 'Email', 'Marks']
    # newline='' leaves line-ending handling to the csv module
    with open(filename, mode='w', newline='') as csv_file:
        out = csv.writer(csv_file)
        out.writerow(header)
        out.writerows([s.name, s.email, s.marks] for s in students)
    print(f"Data saved to {filename}")

# Calling the function to save students to CSV
students_to_csv(students)

Data saved to students.csv


In [None]:
#@title 2.Excel format
from openpyxl import Workbook


# Helper function to export students to an Excel file
def students_to_excel(students, filename='students.xlsx'):
    """Export students to an Excel workbook with one sheet named "Students".

    Args:
        students: Iterable of objects exposing ``name``, ``email`` and
            ``marks`` attributes.
        filename: Path the workbook is saved to.
    """
    book = Workbook()
    ws = book.active
    ws.title = "Students"

    # Header row first, then one worksheet row per student
    ws.append(["Name", "Email", "Marks"])
    for s in students:
        ws.append([s.name, s.email, s.marks])

    book.save(filename)
    print(f"Data saved to {filename}")


# Calling the function to save students to Excel
students_to_excel(students)


Data saved to students.xlsx


In [None]:
#@title 3.JSON format
import json

class Student:
    """A student record that can also be converted to a plain dictionary.

    Attributes:
        name: The student's name.
        email: The student's email address.
        marks: The marks awarded to the student.
    """

    def __init__(self, name, email, marks):
        """Store the three fields unchanged on the instance."""
        self.name = name
        self.email = email
        self.marks = marks

    def __str__(self):
        """Return a one-line, human-readable summary of the student."""
        return f"Name: {self.name}, Email: {self.email}, Marks: {self.marks}"

    def __repr__(self):
        """Return an unambiguous, constructor-style representation."""
        # !r quotes string fields, so the text reads as a valid constructor
        # call and marks="85" is distinguishable from marks=85.
        return (
            f"{type(self).__name__}(name={self.name!r}, "
            f"email={self.email!r}, marks={self.marks!r})"
        )

    # Method to convert student object to dictionary (for JSON serialization)
    def to_dict(self):
        """Return the fields as a dict with keys 'name', 'email', 'marks'."""
        return {
            'name': self.name,
            'email': self.email,
            'marks': self.marks
        }

# Helper function to export students to JSON
def students_to_json(students, filename='students.json'):
    """Export students to a JSON file as a list of objects.

    Args:
        students: Iterable of objects providing a ``to_dict()`` method.
        filename: Destination path; existing content is replaced.
    """
    payload = []
    for s in students:
        payload.append(s.to_dict())

    # indent=4 pretty-prints the array so the file is easy to read
    with open(filename, 'w') as out:
        json.dump(payload, out, indent=4)
    print(f"Data saved to {filename}")

# Creating 10 student objects from (name, email, marks) rows
students = [
    Student(name, email, marks)
    for name, email, marks in (
        ("John Doe", "john.doe@example.com", 85),
        ("Jane Smith", "jane.smith@example.com", 90),
        ("Sam Wilson", "sam.wilson@example.com", 88),
        ("Alice Johnson", "alice.johnson@example.com", 92),
        ("Bob Brown", "bob.brown@example.com", 75),
        ("Chris Evans", "chris.evans@example.com", 89),
        ("Diana Prince", "diana.prince@example.com", 95),
        ("Peter Parker", "peter.parker@example.com", 87),
        ("Bruce Wayne", "bruce.wayne@example.com", 93),
        ("Clark Kent", "clark.kent@example.com", 91),
    )
]

# Calling the function to save students to JSON
students_to_json(students)


Data saved to students.json


In [None]:
#@title 4.XML format
import xml.etree.ElementTree as ET

# Helper function to export students to XML
def students_to_xml(students, filename='students.xml'):
    """Export students to an XML file.

    The document has a ``<students>`` root with one ``<student>`` child per
    record, each holding ``<name>``, ``<email>`` and ``<marks>`` elements.

    Args:
        students: Iterable of objects exposing ``name``, ``email`` and
            ``marks`` attributes.
        filename: Destination path for the UTF-8 encoded document.
    """
    root = ET.Element("students")

    for s in students:
        node = ET.SubElement(root, "student")
        # Element text must be a string, hence str() on the marks
        fields = (("name", s.name), ("email", s.email), ("marks", str(s.marks)))
        for tag, value in fields:
            ET.SubElement(node, tag).text = value

    document = ET.ElementTree(root)
    document.write(filename, encoding='utf-8', xml_declaration=True)
    print(f"Data saved to {filename}")

# Calling the function to save students to XML
students_to_xml(students)


Data saved to students.xml


In [None]:
#@title 5.HTML format

# Helper function to export students to HTML
def students_to_html(students, filename='students.html'):
    """Export students to an HTML page containing a single table.

    Field values are HTML-escaped before being written, so characters such
    as ``<``, ``>`` and ``&`` in a name or email are shown literally instead
    of being interpreted as markup.

    Args:
        students: Iterable of objects exposing ``name``, ``email`` and
            ``marks`` attributes.
        filename: Destination path; existing content is replaced.
    """
    # Imported locally because the notebook has no top-level html import
    from html import escape

    with open(filename, 'w') as file:
        # Write HTML header and opening tags
        file.write("<html>\n<head><title>Student Data</title></head>\n<body>\n")
        file.write("<h1>Student Data</h1>\n")
        file.write("<table border='1'>\n")
        file.write("<tr><th>Name</th><th>Email</th><th>Marks</th></tr>\n")

        # Write each student's data in a table row; str() lets non-string
        # values such as integer marks pass through escape()
        for student in students:
            cells = "".join(
                f"<td>{escape(str(value))}</td>"
                for value in (student.name, student.email, student.marks)
            )
            file.write(f"<tr>{cells}</tr>\n")

        # Write HTML closing tags
        file.write("</table>\n</body>\n</html>")
    print(f"Data saved to {filename}")


# Calling the function to save students to HTML
students_to_html(students)


Data saved to students.html


In [None]:
#@title 6.Pickle format
import pickle

# Helper function to export students to a binary file using pickle
def students_to_pickle(students, filename='students.pkl'):
    """Serialize ``students`` to a binary file with pickle.

    Args:
        students: The object to pickle (the student list in this notebook).
        filename: Destination path, opened in binary write mode.
    """
    with open(filename, mode='wb') as sink:
        pickle.Pickler(sink).dump(students)
    print(f"Data saved to {filename}")

# Calling the function to save students to a pickle file
students_to_pickle(students)


Data saved to students.pkl


In [None]:
#@title 7.Text format

# Helper Function to write students to a text file
def students_to_text(students, filename='students.txt'):
    """Write the ``str()`` form of each student on its own line.

    Args:
        students: Iterable of objects to write.
        filename: Destination path; existing content is replaced.
    """
    with open(filename, 'w') as out:
        out.writelines(str(entry) + '\n' for entry in students)
    print(f"Data saved to {filename}")

# Call the function to save students to a text file
students_to_text(students)

Data saved to students.txt


In [None]:
#@title 8.PDF format
# Helper Function to write students to a PDF file
!pip install reportlab
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas

def students_to_pdf(students, filename='students.pdf'):
    """Write a heading and one line per student to an A4 PDF.

    Lines are drawn top to bottom. When the next line would fall below the
    bottom margin, a new page is started, so a long roster is not drawn off
    the page.

    Args:
        students: Iterable of objects; each is rendered with ``str()``.
        filename: Path of the PDF file to create.
    """
    left_margin = 100
    top_offset = 50      # distance of the first line from the top edge
    bottom_margin = 50   # lowest y position a line may be drawn at
    line_height = 20

    c = canvas.Canvas(filename, pagesize=A4)
    _, height = A4
    y_position = height - top_offset  # Start position for writing text

    c.setFont("Helvetica", 12)
    c.drawString(left_margin, y_position, "Student Information:")
    y_position -= line_height  # Move down for the next line

    for student in students:
        if y_position < bottom_margin:
            # Out of vertical space: close this page and continue at the top
            # of a fresh one. The font is set again because showPage()
            # discards the current graphics state.
            c.showPage()
            c.setFont("Helvetica", 12)
            y_position = height - top_offset

        # Write each student's info to the PDF
        c.drawString(left_margin, y_position, str(student))
        y_position -= line_height  # Move down for each new student

    c.save()
    print(f"Data saved to {filename}")

# Call the function to save students to a PDF file
students_to_pdf(students)

Collecting reportlab
  Downloading reportlab-4.2.5-py3-none-any.whl.metadata (1.5 kB)
Downloading reportlab-4.2.5-py3-none-any.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━[0m [32m1.1/1.9 MB[0m [31m39.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: reportlab
Successfully installed reportlab-4.2.5
Data saved to students.pdf


In [None]:
#@title 9.DOCX format
!pip install python-docx
from docx import Document


# Helper Function to write students to a DOCX file
def students_to_docx(students, filename='students.docx'):
    """Write a level-1 heading and one paragraph per student to a DOCX file.

    Args:
        students: Iterable of objects; each is rendered with ``str()``.
        filename: Path the document is saved to.
    """
    document = Document()
    document.add_heading('Student Information', level=1)

    # One paragraph per student, in iteration order
    for line in map(str, students):
        document.add_paragraph(line)

    document.save(filename)
    print(f"Data saved to {filename}")

# Call the function to save students to a DOCX file
students_to_docx(students)


Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2
Data saved to students.docx


In [None]:
#@title 10.RTF format
# !pip install pyrtf-ng

# from pyrtf import *

# Helper Function to write students to an RTF file
def students_to_rtf(students, filename='students.rtf'):
    """Write a title line and one paragraph per student to an RTF file.

    Backslashes and braces in the student text are escaped, because RTF
    treats them as control characters and unescaped ones would corrupt the
    document structure.

    Args:
        students: Iterable of objects; each is rendered with ``str()``.
        filename: Destination path; existing content is replaced.
    """
    def escape(text):
        # Backslash first, so the backslashes added for braces are not doubled
        return text.replace("\\", "\\\\").replace("{", "\\{").replace("}", "\\}")

    pieces = [
        "{\\rtf1\\ansi\\ansicpg1252\\deff0\\nouicompat\\deflang1033\n",
        "{\\fonttbl{\\f0\\fnil\\fcharset0 Calibri;}}\n",
        "{\\*\\generator Riched20 10.0.18362;}\\viewkind4\\uc1 \n",
        "\\pard\\sa200\\sl276\\slmult1\\f0\\fs22\\lang9 Student Information\\par\n",
    ]
    pieces.extend(f"{escape(str(student))}\\par\n" for student in students)
    pieces.append("}")  # closes the group opened by {\rtf1

    with open(filename, 'w') as file:
        file.write("".join(pieces))
    print(f"Data saved to {filename}")

# Call the function to save students to an RTF file
students_to_rtf(students)

Data saved to students.rtf


In [None]:
#@title 11.SQL format
# Function to generate MySQL SQL insert statements
def students_to_mysql_sql(students, filename='students.sql'):
    """Write one MySQL ``INSERT`` statement per student to a script file.

    The statements target the ``name``, ``email`` and ``marks`` columns of
    the ``students`` table. Text values are written as quoted literals with
    embedded quotes and backslashes escaped; numbers are written bare.

    Args:
        students: Iterable of objects exposing ``name``, ``email`` and
            ``marks`` attributes.
        filename: Destination path; existing content is replaced.
    """
    def literal(value):
        # Render one Python value as a MySQL literal
        if value is None:
            return "NULL"
        if isinstance(value, bool):
            return "1" if value else "0"
        if isinstance(value, (int, float)):
            return str(value)
        # Double backslashes and single quotes so the text cannot end the
        # quoted literal early
        text = str(value).replace("\\", "\\\\").replace("'", "''")
        return f"'{text}'"

    with open(filename, 'w') as file:
        for student in students:
            values = ", ".join(
                literal(field)
                for field in (student.name, student.email, student.marks)
            )
            sql_statement = (
                "INSERT INTO `students` (`name`, `email`, `marks`) "
                f"VALUES ({values});\n"
            )
            file.write(sql_statement)
    print(f"MySQL SQL statements saved to {filename}")

# Call the function to save students as MySQL SQL insert statements
students_to_mysql_sql(students)

'''
CREATE TABLE students (
    id INT AUTO_INCREMENT PRIMARY KEY,
    name VARCHAR(100),
    email VARCHAR(100),
    marks INT
);

'''

MySQL SQL statements saved to students.sql


'\nCREATE TABLE students (\n    id INT AUTO_INCREMENT PRIMARY KEY,\n    name VARCHAR(100),\n    email VARCHAR(100),\n    marks INT\n);\n\n'

In [None]:
#@title 12.YAML format
import yaml

# Helper Function to write students to a YAML file
def students_to_yaml(students, filename='students.yaml'):
    """Export students to a YAML file as a list of mappings.

    Args:
        students: Iterable of objects exposing ``name``, ``email`` and
            ``marks`` attributes.
        filename: Destination path; existing content is replaced.
    """
    # Plain dicts are dumped instead of the student objects themselves
    records = []
    for s in students:
        records.append({'name': s.name, 'email': s.email, 'marks': s.marks})

    # default_flow_style=False selects block style rather than inline style
    with open(filename, 'w') as out:
        yaml.dump(records, out, default_flow_style=False)
    print(f"Data saved to {filename}")

# Call the function to save students as YAML
students_to_yaml(students)

Data saved to students.yaml


NameError: name 'lajdf' is not defined

# Reading data from different formats

In [None]:
#@title 1.CSV format
# Step 1: Helper function to read students from CSV file
def read_students_from_csv(filename='students.csv'):
    """Load students from a CSV file that starts with a header row.

    Each data row must hold three columns: name, email and marks. Blank
    rows are ignored, and an empty file yields an empty list.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of ``Student`` objects with ``marks`` converted to ``int``.

    Raises:
        ValueError: If a row does not have exactly three columns or its
            marks column is not an integer.
    """
    students = []
    with open(filename, mode='r', newline='') as file:
        reader = csv.reader(file)
        # Skip the header row; the None default keeps an empty file from
        # raising StopIteration
        next(reader, None)
        for row in reader:
            if not row:  # csv.reader yields [] for a blank line
                continue
            name, email, marks = row
            students.append(Student(name, email, int(marks)))
    return students

# Step 2: Add a new student
def add_student(students, student):
    """Append ``student`` to ``students`` in place.

    Args:
        students: Collection providing an ``append`` method; it is mutated.
        student: The record to add at the end.
    """
    students.append(student)

# Step 3: Print the student collection
def print_students(students):
    """Print every student on its own line, in iteration order.

    Args:
        students: Iterable of objects; each is rendered with ``str()``.
    """
    for line in map(str, students):
        print(line)

# Assuming there is a CSV file named 'students.csv' with the following content:
# Name,Email,Marks
# Alice,alice@example.com,20
# Bob,bob@example.com,21
# Charlie,charlie@example.com,19

# Step 4: Main execution
students = read_students_from_csv()  # Read existing students
# Create new student
new_student = Student("Siva Jasthi", "siva.jasthi@gmail.com", 35)
add_student(students, new_student)  # Add the new student
print_students(students)  # Print the student collection

Name: John Doe, Email: john.doe@example.com, Marks: 85
Name: Jane Smith, Email: jane.smith@example.com, Marks: 90
Name: Sam Wilson, Email: sam.wilson@example.com, Marks: 88
Name: Alice Johnson, Email: alice.johnson@example.com, Marks: 92
Name: Bob Brown, Email: bob.brown@example.com, Marks: 75
Name: Chris Evans, Email: chris.evans@example.com, Marks: 89
Name: Diana Prince, Email: diana.prince@example.com, Marks: 95
Name: Peter Parker, Email: peter.parker@example.com, Marks: 87
Name: Bruce Wayne, Email: bruce.wayne@example.com, Marks: 93
Name: Clark Kent, Email: clark.kent@example.com, Marks: 91
Name: Siva Jasthi, Email: siva.jasthi@gmail.com, Marks: 35


In [None]:
#@title 2.Excel format
from openpyxl import load_workbook

# Helper function to read students from an Excel file
def read_students_from_excel(filename='students.xlsx'):
    """Load the rows of the active worksheet as dictionaries.

    The first row supplies the keys. Every later row becomes one dict that
    maps each header to the cell value in the same column. Rows are returned
    as dicts, not as ``Student`` objects.

    Args:
        filename: Path of the workbook to read.

    Returns:
        A list with one dict per data row.
    """
    book = load_workbook(filename)
    ws = book.active

    # Row 1 holds the column titles
    column_names = [cell.value for cell in ws[1]]

    # min_row=2 starts after the header; values_only yields plain values
    return [
        dict(zip(column_names, values))
        for values in ws.iter_rows(min_row=2, values_only=True)
    ]

# Calling the function to read students from Excel
students = read_students_from_excel()
print("Students data loaded from Excel:")
for student in students:
    print(student)


In [None]:
#@title 3.JSON format
# Helper function to read students from a JSON file
def read_students_from_json(filename='students.json'):
    """Load students from a JSON file holding a list of objects.

    Each object must provide the keys ``name``, ``email`` and ``marks``.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        A list of ``Student`` objects in file order.
    """
    with open(filename, 'r') as source:
        records = json.load(source)

    # Rebuild one Student per decoded record
    roster = []
    for record in records:
        roster.append(Student(record['name'], record['email'], record['marks']))
    return roster

# Calling the function to read students from JSON
students = read_students_from_json()
print("Students data loaded from JSON:")
for student in students:
    print(student)


In [None]:
#@title 4.XML format
# Helper function to read students from an XML file
def read_students_from_xml(filename='students.xml'):
    """Load students from an XML file.

    Reads every ``<student>`` element directly under the root and takes the
    text of its ``<name>``, ``<email>`` and ``<marks>`` children.

    Args:
        filename: Path of the XML file to read.

    Returns:
        A list of ``Student`` objects with ``marks`` converted to ``int``.
    """
    root = ET.parse(filename).getroot()

    def text_of(node, tag):
        # Text content of the first child element called `tag`
        return node.find(tag).text

    return [
        Student(
            text_of(node, "name"),
            text_of(node, "email"),
            int(text_of(node, "marks")),
        )
        for node in root.findall("student")
    ]

# Calling the function to read students from XML
students = read_students_from_xml()
print("Students data loaded from XML:")
for student in students:
    print(student)


In [None]:
#@title 5.HTML format
from bs4 import BeautifulSoup

# Helper function to read students from an HTML file
def read_students_from_html(filename='students.html'):
    """Load students from the table rows of an HTML file.

    The first ``<tr>`` is treated as the header and skipped. In every later
    row, the first three ``<td>`` cells are read as name, email and marks.

    Args:
        filename: Path of the HTML file to read.

    Returns:
        A list of ``Student`` objects with ``marks`` converted to ``int``.
    """
    with open(filename, 'r') as source:
        soup = BeautifulSoup(source, 'html.parser')

    roster = []
    for table_row in soup.find_all('tr')[1:]:
        cells = table_row.find_all('td')
        # Surrounding whitespace is stripped from each cell's text
        name, email, marks_text = (cells[i].text.strip() for i in range(3))
        roster.append(Student(name, email, int(marks_text)))
    return roster

# Calling the function to read students from HTML
students = read_students_from_html()
print("Students data loaded from HTML:")
for student in students:
    print(student)


In [None]:
#@title 6.Pickle format
# Helper function to read students from a pickle file
def read_students_from_pickle(filename='students.pkl'):
    """Load and return the object stored in a pickle file.

    NOTE(review): unpickling can execute arbitrary code, so only use this
    on files from a trusted source.

    Args:
        filename: Path of the pickle file, opened in binary read mode.

    Returns:
        Whatever object was pickled into the file.
    """
    with open(filename, mode='rb') as source:
        return pickle.load(source)

# Calling the function to read students from pickle
students = read_students_from_pickle()
print("Students data loaded from Pickle:")
for student in students:
    print(student)


In [None]:
#@title 7.Text format
# Helper function to read students from a text file
def read_students_from_text(filename='students.txt'):
    """Load students from a text file with one student per line.

    Expected line format: ``Name: <name>, Email: <email>, Marks: <marks>``.
    Blank lines are ignored. The line is split from the right, so a name
    that itself contains ", " is kept intact.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of ``Student`` objects with ``marks`` converted to ``int``.

    Raises:
        IndexError: If a non-blank line does not have the three fields.
        ValueError: If the marks field is not an integer.
    """
    students = []
    with open(filename, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:  # e.g. a trailing empty line at the end of the file
                continue

            # Split at most twice from the right: email and marks never
            # contain ", ", so any extra separators belong to the name
            parts = line.rsplit(", ", 2)
            # maxsplit=1 keeps everything after the first "label: "
            name = parts[0].split(": ", 1)[1]
            email = parts[1].split(": ", 1)[1]
            marks = int(parts[2].split(": ", 1)[1])  # Convert marks to integer

            # Create a Student object and append it to the list
            students.append(Student(name, email, marks))
    return students

# Calling the function to read students from a text file
students = read_students_from_text()
print("Students data loaded from text file:")
for student in students:
    print(student)


In [None]:
#@title 8.PDF format
import pdfplumber

# Helper function to read students from a PDF file
def read_students_from_pdf(filename='students.pdf'):
    """Load students from the text of every page of a PDF file.

    Only lines containing ``Name:`` are parsed; they are expected to look
    like ``Name: <name>, Email: <email>, Marks: <marks>``. The line is split
    from the right, so a name that itself contains ", " is kept intact.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        A list of ``Student`` objects with ``marks`` converted to ``int``.

    Raises:
        IndexError: If a matching line does not have the three fields.
        ValueError: If the marks field is not an integer.
    """
    students = []
    with pdfplumber.open(filename) as pdf:
        # Loop through all pages of the PDF
        for page in pdf.pages:
            # Guard in case extract_text() yields None for a page that has
            # no extractable text
            text = page.extract_text() or ""

            # Split text by lines and process each line
            for line in text.split('\n'):
                # Skip anything that is not student data (e.g. the heading)
                if "Name:" not in line:
                    continue

                # Split at most twice from the right: email and marks never
                # contain ", ", so any extra separators belong to the name
                parts = line.rsplit(", ", 2)
                name = parts[0].split(": ", 1)[1]
                email = parts[1].split(": ", 1)[1]
                marks = int(parts[2].split(": ", 1)[1])  # Convert marks to integer

                # Create a Student object and append to the list
                students.append(Student(name, email, marks))
    return students

# Calling the function to read students from a PDF file
students = read_students_from_pdf()
print("Students data loaded from PDF:")
for student in students:
    print(student)


In [None]:
#@title 9.DOCX format
from docx import Document

# Helper function to read students from a DOCX file
def read_students_from_docx(filename='students.docx'):
    """Load students from the paragraphs of a DOCX file.

    The first paragraph is assumed to be the heading and is skipped. Each
    remaining non-empty paragraph is expected to look like
    ``Name: <name>, Email: <email>, Marks: <marks>``. The text is split from
    the right, so a name that itself contains ", " is kept intact.

    Args:
        filename: Path of the DOCX file to read.

    Returns:
        A list of ``Student`` objects with ``marks`` converted to ``int``.

    Raises:
        IndexError: If a paragraph does not have the three fields.
        ValueError: If the marks field is not an integer.
    """
    students = []
    doc = Document(filename)

    # Skip the heading and read paragraphs
    for para in doc.paragraphs[1:]:  # Assume the first paragraph is the heading
        line = para.text.strip()
        if not line:  # Skip empty lines
            continue

        # Split at most twice from the right: email and marks never contain
        # ", ", so any extra separators belong to the name
        parts = line.rsplit(", ", 2)
        name = parts[0].split(": ", 1)[1]
        email = parts[1].split(": ", 1)[1]
        marks = int(parts[2].split(": ", 1)[1])  # Convert marks to integer

        # Create a Student object and append to the list
        students.append(Student(name, email, marks))
    return students

# Calling the function to read students from a DOCX file
students = read_students_from_docx()
print("Students data loaded from DOCX:")
for student in students:
    print(student)


In [None]:
#@title 10.RTF format
# Helper function to read students from an RTF file
def read_students_from_rtf(filename='students.rtf'):
    """Load students from a simple RTF file with one student per paragraph.

    The content is split on ``\\par`` followed by a newline. Chunks that are
    empty, start with an RTF group (``{\\``) or are the closing ``}`` are
    skipped. The escaped control symbols ``\\\\``, ``\\{`` and ``\\}`` are
    decoded before the text is parsed as
    ``Name: <name>, Email: <email>, Marks: <marks>``.

    This is not a general RTF parser; it only understands the layout
    described above.

    Args:
        filename: Path of the RTF file to read.

    Returns:
        A list of ``Student`` objects with ``marks`` converted to ``int``.

    Raises:
        IndexError: If a paragraph does not have the three fields.
        ValueError: If the marks field is not an integer.
    """
    # Imported locally because the notebook has no top-level re import
    import re

    students = []
    with open(filename, 'r') as file:
        content = file.read()

    # Extract text data from RTF content (ignoring RTF markup)
    text_lines = content.split("\\par\n")  # RTF lines are separated by "\par"

    for line in text_lines:
        # Clean RTF formatting
        line = line.strip()
        if not line or line.startswith("{\\") or line == "}":  # Skip metadata
            continue

        # Decode \\, \{ and \} in a single pass so an escaped backslash is
        # never mistaken for the start of another escape
        line = re.sub(r"\\([\\{}])", r"\1", line)

        # Split at most twice from the right: email and marks never contain
        # ", ", so any extra separators belong to the name
        parts = line.rsplit(", ", 2)
        name = parts[0].split(": ", 1)[1]
        email = parts[1].split(": ", 1)[1]
        marks = int(parts[2].split(": ", 1)[1])  # Convert marks to integer

        # Create a Student object and append to the list
        students.append(Student(name, email, marks))
    return students

# Calling the function to read students from an RTF file
students = read_students_from_rtf()
print("Students data loaded from RTF:")
for student in students:
    print(student)


In [None]:
#@title 11.SQL format

In [None]:
#@title 12.YAML format
# Helper function to read students from a YAML file
def read_students_from_yaml(filename='students.yaml'):
    """Load students from a YAML file holding a list of mappings.

    Each mapping must provide the keys ``name``, ``email`` and ``marks``.

    Args:
        filename: Path of the YAML file to read.

    Returns:
        A list of ``Student`` objects in file order.
    """
    with open(filename, 'r') as source:
        records = yaml.safe_load(source)

    # Rebuild one Student per parsed mapping
    roster = []
    for record in records:
        roster.append(Student(record['name'], record['email'], record['marks']))
    return roster

# Calling the function to read students from a YAML file
students = read_students_from_yaml()
print("Students data loaded from YAML:")
for student in students:
    print(student)


# Reading and writing data using Pandas
# Read the data from a file and create a dataframe
# Add a new student to that dataframe
# Save the dataframe to Excel (csv_to_excel.xlsx, html_to_excel.xlsx, and so on)