### Reading Data from Different Sources

In [1]:
import pandas as pd
from io import StringIO

In [2]:
# Sample JSON document (one employee record with a nested job_profile list),
# assembled from adjacent string literals so that each top-level key is readable.
Data = (
    '{"employee_name" : "James", '
    '"email" : "jamie@gmail.com", '
    '"job_profile" : [{"title" : "Software Engineer", "department" : "Payments"}]}'
)

In [6]:
# The JSON text lives in memory, so it is wrapped in a StringIO buffer and
# handed to read_json as a file-like object.
df = pd.read_json(path_or_buf=StringIO(Data))
df  # display the parsed frame; the nested job_profile entry stays a dict in one cell

Unnamed: 0,employee_name,email,job_profile
0,James,jamie@gmail.com,"{'title': 'Software Engineer', 'department': '..."


In [7]:
# Default layout for a DataFrame, spelled out explicitly: {column -> {index -> value}}.
df.to_json(orient="columns")

'{"employee_name":{"0":"James"},"email":{"0":"jamie@gmail.com"},"job_profile":{"0":{"title":"Software Engineer","department":"Payments"}}}'

In [8]:
# Row-keyed layout: {index -> {column -> value}}.
df.to_json(orient="index")

'{"0":{"employee_name":"James","email":"jamie@gmail.com","job_profile":{"title":"Software Engineer","department":"Payments"}}}'

In [10]:
# Record layout: a JSON array with one {column -> value} object per row (index dropped).
df.to_json(orient="records")

'[{"employee_name":"James","email":"jamie@gmail.com","job_profile":{"title":"Software Engineer","department":"Payments"}}]'

In [15]:
# Read the UCI wine data straight from its URL. header=None tells pandas not to
# treat the first line as column names, so the columns are numbered 0..13.
online_csv = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",
    header=None,
)

In [17]:
# Preview the first three rows of the downloaded data.
online_csv.head(n=3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185


In [19]:
# Save the downloaded wine data to a local CSV file.
# index=False keeps the auto-generated 0..N-1 row index out of the file; with the
# default (index=True) it is written as an extra unnamed first column and comes
# back as "Unnamed: 0" when the CSV is read again.
online_csv.to_csv('wine.csv', index=False)

In [20]:
!pip install lxml

Collecting lxml
  Downloading lxml-6.0.0-cp312-cp312-win_amd64.whl.metadata (6.8 kB)
Downloading lxml-6.0.0-cp312-cp312-win_amd64.whl (4.0 MB)
   ---------------------------------------- 0.0/4.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/4.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/4.0 MB ? eta -:--:--
   -- ------------------------------------- 0.3/4.0 MB ? eta -:--:--
   ------------- -------------------------- 1.3/4.0 MB 3.5 MB/s eta 0:00:01
   ------------------------------- -------- 3.1/4.0 MB 5.6 MB/s eta 0:00:01
   ---------------------------------------- 4.0/4.0 MB 5.7 MB/s eta 0:00:00
Installing collected packages: lxml
Successfully installed lxml-6.0.0



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [21]:
# FDIC failed-bank list page to scrape.
url = "https://www.fdic.gov/bank-failures/failed-bank-list"
# read_html fetches the page and returns a *list* of DataFrames (one per HTML
# table it finds), so despite its name url_df is a list, not a single frame.
url_df = pd.read_html(io=url)

In [25]:
# Show the raw result of read_html: a Python list holding the parsed table(s) as DataFrames.
url_df

[                               Bank Name           City         State   Cert  \
 0           The Santa Anna National Bank     Santa Anna         Texas   5520   
 1                   Pulaski Savings Bank        Chicago      Illinois  28611   
 2     The First National Bank of Lindsay        Lindsay      Oklahoma   4134   
 3  Republic First Bank dba Republic Bank   Philadelphia  Pennsylvania  27332   
 4                          Citizens Bank       Sac City          Iowa   8758   
 5               Heartland Tri-State Bank        Elkhart        Kansas  25851   
 6                    First Republic Bank  San Francisco    California  59017   
 7                         Signature Bank       New York      New York  57053   
 8                    Silicon Valley Bank    Santa Clara    California  24735   
 9                      Almena State Bank         Almena        Kansas  15426   
 
                  Acquiring Institution      Closing Date  Fund  Sort ascending  
 0            Coleman Cou

In [29]:
# `match` filters by table content: only tables containing text that matches
# 'Chicago' (string or regex) are returned. It does not affect the header row.
# [0] picks the first matching table as a DataFrame.
pd.read_html(url, match = 'Chicago')[0]

Unnamed: 0,Bank Name,City,State,Cert,Acquiring Institution,Closing Date,Fund Sort ascending
0,The Santa Anna National Bank,Santa Anna,Texas,5520,Coleman County State Bank,"June 27, 2025",10549
1,Pulaski Savings Bank,Chicago,Illinois,28611,Millennium Bank,"January 17, 2025",10548
2,The First National Bank of Lindsay,Lindsay,Oklahoma,4134,"First Bank & Trust Co., Duncan, OK","October 18, 2024",10547
3,Republic First Bank dba Republic Bank,Philadelphia,Pennsylvania,27332,"Fulton Bank, National Association","April 26, 2024",10546
4,Citizens Bank,Sac City,Iowa,8758,Iowa Trust & Savings Bank,"November 3, 2023",10545
5,Heartland Tri-State Bank,Elkhart,Kansas,25851,"Dream First Bank, N.A.","July 28, 2023",10544
6,First Republic Bank,San Francisco,California,59017,"JPMorgan Chase Bank, N.A.","May 1, 2023",10543
7,Signature Bank,New York,New York,57053,"Flagstar Bank, N.A.","March 12, 2023",10540
8,Silicon Valley Bank,Santa Clara,California,24735,First Citizens Bank & Trust Company,"March 10, 2023",10539
9,Almena State Bank,Almena,Kansas,15426,Equity Bank,"October 23, 2020",10538


In [36]:
# Install openpyxl into the kernel's environment before the .xlsx file is read in the next cell.
# NOTE(review): presumably required as the pandas engine for .xlsx files — confirm against the pandas docs.
%pip install openpyxl

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [38]:
# Load the workbook from the notebook's working directory
# (read_excel reads the first sheet unless told otherwise).
df_excel = pd.read_excel(io="sample-data.xlsx")
# Preview the first five rows.
df_excel.head(n=5)

Unnamed: 0,plan_id,rider_id,plan_name,rider_name
0,14,33,AIA HSG Max A,AIA HSG Max A - AIA Max VitalHealth A - Value ...
1,14,32,AIA HSG Max A,AIA HSG Max A - AIA Max VitalHealth A - Value
2,14,88,AIA HSG Max A,Others
3,132,132,AIA HSG Max A,AIA HSG Max A - AIA Max VitalHealth A
4,42,75,AIA HSG Max A,AIA HSG Max A - AIA Max VitalHealth A - Value ...


#### Pickle File
Pickling is the process of converting a Python object into a byte stream so that it can be saved to a file or database and loaded back later.

In [39]:
# Serialize the DataFrame to a pickle file named 'df_excel' in the working directory.
df_excel.to_pickle(path="df_excel")

In [40]:
# Round-trip check: load the pickle file named 'df_excel' back into a DataFrame.
# Only unpickle files you trust, because loading a pickle can execute arbitrary code.
pd.read_pickle(filepath_or_buffer="df_excel")

Unnamed: 0,plan_id,rider_id,plan_name,rider_name
0,14,33,AIA HSG Max A,AIA HSG Max A - AIA Max VitalHealth A - Value ...
1,14,32,AIA HSG Max A,AIA HSG Max A - AIA Max VitalHealth A - Value
2,14,88,AIA HSG Max A,Others
3,132,132,AIA HSG Max A,AIA HSG Max A - AIA Max VitalHealth A
4,42,75,AIA HSG Max A,AIA HSG Max A - AIA Max VitalHealth A - Value ...
...,...,...,...,...
288,4,88,Singlife Shield Plan 3,Others
289,123,150,Singlife Shield Standard,Singlife Cancer Cover Plus II
290,123,88,Singlife Shield Standard,Others
291,5,12,Singlife Shield Standard,Singlife Cancer Cover Plus II
