1. Pandas: Read CSV

In [2]:
import pandas as pd

# Load the sample dataset from the working directory and preview the
# first five rows to confirm the columns parsed as expected.
df = pd.read_csv(filepath_or_buffer="sample_data.csv")
print(df.head(n=5))

   id     name    status  amount        date
0   1    Alice  Approved    1200  2023-01-10
1   2      Bob  Rejected     850  2023-02-15
2   3  Charlie  Approved     960  2023-03-12
3   4    Diana   Pending     740  2023-04-20
4   5      Eli  Approved    1320  2023-01-25


2. Filter using query

In [3]:
# Keep only the rows whose status is exactly 'Approved' (case-sensitive
# string comparison evaluated by DataFrame.query), then preview them.
approved_expr = "status == 'Approved'"
filtered = df.query(approved_expr)
print(filtered.head(n=5))

   id     name    status  amount        date
0   1    Alice  Approved    1200  2023-01-10
2   3  Charlie  Approved     960  2023-03-12
4   5      Eli  Approved    1320  2023-01-25
6   7   George  Approved    1500  2023-03-30
9  10     Jane  Approved    1100  2023-05-05


3. Count by Status

In [4]:
# Tally how many rows fall under each distinct status value.
status_column = df["status"]
status_count = status_column.value_counts()
print(status_count)

status
Approved    5
Rejected    3
Pending     2
Name: count, dtype: int64


4. Count by Month & Status

In [5]:
# Parse the date column into datetimes (this overwrites df['date'] in place),
# then count rows per (calendar month, status) pair.
# Note: only the month number is used as a key, so the same month from
# different years would land in one group.
df["date"] = pd.to_datetime(df["date"])
month_of_date = df["date"].dt.month
monthly_status = df.groupby([month_of_date, "status"]).size()
print(monthly_status)

date  status  
1     Approved    2
2     Rejected    2
3     Approved    2
4     Pending     2
5     Approved    1
      Rejected    1
dtype: int64


5. Create DataFrame with dynamic columns

In [6]:
# Build an empty DataFrame whose column labels are copied from df at runtime,
# rather than being hard-coded.
columns = [*df.columns]
custom_df = pd.DataFrame(data=None, columns=columns)
print("Empty DataFrame with same columns:\n", custom_df)

Empty DataFrame with same columns:
 Empty DataFrame
Columns: [id, name, status, amount, date]
Index: []


6. Inner Join

In [7]:
# Two small frames sharing an 'id' key; only id == 1 appears in both,
# so the inner join keeps a single row.
df1 = pd.DataFrame({"id": [1, 2], "val1": ["A", "B"]})
df2 = pd.DataFrame({"id": [1, 3], "val2": ["X", "Y"]})
joined = df1.merge(df2, on="id", how="inner")
print(joined)

   id val1 val2
0   1    A    X


7. Aggregation on Join

In [8]:
# Count the non-null ids within each val1 group of the joined frame.
by_val1 = joined.groupby("val1")
df_agg = by_val1.agg({"id": "count"})
print(df_agg)

      id
val1    
A      1


8. Sort DataFrame

In [9]:
# Order rows from the most recent date to the oldest; df itself is left
# untouched because sort_values returns a new frame here.
sorted_df = df.sort_values("date", ascending=False)
print(sorted_df.head(n=5))

   id    name    status  amount       date
8   9     Ian  Rejected     600 2023-05-22
9  10    Jane  Approved    1100 2023-05-05
3   4   Diana   Pending     740 2023-04-20
7   8  Hannah   Pending     900 2023-04-18
6   7  George  Approved    1500 2023-03-30


9. Write DataFrame to File

In [10]:
# Write the first ten rows to output.csv without the index column.
first_ten_rows = df.head(10)
first_ten_rows.to_csv("output.csv", index=False)

10. Write DataFrame to JSON

In [11]:
# Write the first five rows as JSON Lines (one JSON object per line).
# df['date'] holds datetimes by this point, and with orient="records"
# to_json would otherwise serialise them as epoch-millisecond integers;
# date_format="iso" keeps them as readable ISO 8601 strings.
df.head(5).to_json(
    "output.json",
    orient="records",
    lines=True,
    date_format="iso",
)

----

### Data Processing with Python

NumPy for Computing

In [12]:
import numpy as np

# Basic descriptive statistics on a small integer array.
# np.std uses the population formula (ddof=0) by default.
a = np.array([1, 2, 3])
print("Mean:", np.mean(a))
print("Standard Deviation:", np.std(a))

Mean: 2.0
Standard Deviation: 0.816496580927726


Exploring Dataset

In [13]:
# Summary statistics for the numeric and datetime columns.
print(df.describe())
# DataFrame.info() prints its report itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.info()

             id       amount                 date
count  10.00000    10.000000                   10
mean    5.50000   985.000000  2023-03-17 16:48:00
min     1.00000   600.000000  2023-01-10 00:00:00
25%     3.25000   767.500000  2023-02-11 06:00:00
50%     5.50000   930.000000  2023-03-21 00:00:00
75%     7.75000  1175.000000  2023-04-19 12:00:00
max    10.00000  1500.000000  2023-05-22 00:00:00
std     3.02765   291.404644                  NaN
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   id      10 non-null     int64         
 1   name    10 non-null     object        
 2   status  10 non-null     object        
 3   amount  10 non-null     int64         
 4   date    10 non-null     datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 532.0+ bytes
None


Data Cleaning & Preprocessing

In [14]:
# Remove every row containing a missing value (mutates df in place), then
# normalise status labels by trimming surrounding whitespace and lowercasing,
# so that e.g. ' Approved' and 'approved' group together.
df.dropna(axis=0, how="any", inplace=True)
status_trimmed = df["status"].str.strip()
df["status"] = status_trimmed.str.lower()

Simple Data Analysis

In [15]:
# Mean amount for each status label.
mean_amount_by_status = df.groupby("status")["amount"].mean()
print("Average Value by Status:\n", mean_amount_by_status)

Average Value by Status:
 status
approved    1216.0
pending      820.0
rejected     710.0
Name: amount, dtype: float64


Merge & Transform

In [16]:
# Outer join keeps ids present in either frame; the side that has no match
# is filled with NaN. Those gaps are replaced with empty strings before
# concatenating, so 'full' never becomes NaN.
df_merged = df1.merge(df2, on="id", how="outer")
val1_text = df_merged["val1"].fillna("")
val2_text = df_merged["val2"].fillna("")
df_merged["full"] = val1_text + val2_text
print(df_merged)

   id val1 val2 full
0   1    A    X   AX
1   2    B  NaN    B
2   3  NaN    Y    Y


Error Handling

In [17]:
# Demonstrates try / except / finally: the division raises, the handler
# reports it, and the finally clause runs regardless of the outcome.
numerator, denominator = 10, 0
try:
    result = numerator / denominator  # raises ZeroDivisionError; result is never bound
except ZeroDivisionError:
    print("Can't divide by zero")
finally:
    print("End of error handling")

Can't divide by zero
End of error handling
