In [1]:
import pandas as pd
import numpy as np

# Section 1: Data Preparation
# Task: Create a synthetic dataset with a mix of numerical, categorical, and datetime data.
# Fixed seed so a fresh "Restart Kernel -> Run All" rebuilds the exact same
# synthetic dataset (the draws were previously unseeded and changed every run).
SEED = 42
rng = np.random.default_rng(SEED)

# Synthetic dataset: 100 rows, one per consecutive day starting 2023-01-01.
data = {
    'ID': range(1, 101),                                    # sequential row id, 1..100
    'Category': rng.choice(['A', 'B', 'C'], size=100),      # categorical label
    'Value': rng.uniform(10, 100, size=100),                # float drawn from [10, 100)
    'Date': pd.date_range(start='2023-01-01', periods=100)  # daily timestamps
}

df = pd.DataFrame(data)
print("Sample Data:")
print(df.head())

# Section 2: Data Transformation
# Task: Perform the following transformations:
# 1. Add a new column that categorizes 'Value' into bins: Low (<30), Medium (30-70), High (>70).
# 2. Create a pivot table showing the average 'Value' for each 'Category' and month of 'Date'.

# Adding the 'Value_Category' column
def categorize_value(value):
    """Map a numeric value to its bin label.

    Bins: 'Low' for value < 30, 'Medium' for 30 <= value <= 70,
    'High' for value > 70.

    Args:
        value: A number to classify.

    Returns:
        'Low', 'Medium' or 'High'; None when ``value`` is NaN.
    """
    # NaN is the only value that is not equal to itself. Without this guard
    # NaN fails both comparisons below and would be mislabelled as 'High'.
    if value != value:
        return None
    if value < 30:
        return 'Low'
    elif value <= 70:
        return 'Medium'
    else:
        return 'High'

# Label every row with the bin its 'Value' falls into.
df['Value_Category'] = [categorize_value(amount) for amount in df['Value']]

# Mean 'Value' per category (rows) and calendar month of 'Date' (columns);
# category/month combinations with no rows are shown as 0.
month_of_date = df['Date'].dt.month
pivot_table = df.pivot_table(
    index='Category',
    columns=month_of_date,
    values='Value',
    aggfunc='mean',
    fill_value=0,
)

print("\nPivot Table:")
print(pivot_table)

# Section 3: Advanced Data Analysis
# Task: Perform advanced analysis to:
# 1. Identify the top 3 dates with the highest average 'Value' for each category.
# 2. Calculate the cumulative sum of 'Value' for each category over time.

# Top 3 dates with highest average 'Value' for each category:
# average per (Category, Date), order each category from highest to lowest
# average, then keep the first three rows of every category.
top_dates = df.groupby(['Category', 'Date'])['Value'].mean().reset_index()
top_dates = top_dates.sort_values(['Category', 'Value'], ascending=[True, False])
top_3_dates = top_dates.groupby('Category').head(3)

print("\nTop 3 Dates with Highest Average Value for Each Category:")
print(top_3_dates)

# Cumulative sum of 'Value' for each category over time.
# Sort by 'Date' explicitly (stable, so ties keep their current order) instead
# of relying on the frame happening to be in chronological order; the result
# keeps the original index, so the assignment aligns back onto df row by row.
df['Cumulative_Value'] = (
    df.sort_values('Date', kind='stable')
      .groupby('Category')['Value']
      .cumsum()
)

print("\nData with Cumulative Sum:")
print(df[['ID', 'Category', 'Value', 'Cumulative_Value']].head(10))

Sample Data:
   ID Category      Value       Date
0   1        A  26.642576 2023-01-01
1   2        A  35.276570 2023-01-02
2   3        B  56.824672 2023-01-03
3   4        B  56.554104 2023-01-04
4   5        A  67.231503 2023-01-05

Pivot Table:
Date              1          2          3          4
Category                                            
A         50.058524  50.413274  54.616295  85.310595
B         61.426883  71.302422  53.654784  52.043012
C         51.437159  66.159869  60.047393  67.393189

Top 3 Dates with Highest Average Value for Each Category:
   Category       Date      Value
30        A 2023-04-05  93.860380
18        A 2023-02-09  93.822089
26        A 2023-03-15  90.703653
45        B 2023-02-05  99.622698
74        B 2023-04-08  98.101678
58        B 2023-03-05  97.515082
99        C 2023-04-06  95.463327
86        C 2023-02-24  94.016364
83        C 2023-02-11  92.365093

Data with Cumulative Sum:
   ID Category      Value  Cumulative_Value
0   1        A  

In [2]:
# Keep only the people older than 25 from a list of dictionaries,
# shown two equivalent ways.
data = [
    {"name": "Alice", "age": 28},
    {"name": "Bob", "age": 24},
    {"name": "Charlie", "age": 30},
]

# Variant 1: list comprehension with an inline condition.
result = [entry for entry in data if 25 < entry["age"]]
print(result)

# Variant 2: filter() with a lambda predicate, materialized into a list.
result = [*filter(lambda record: record["age"] > 25, data)]
print(result)

[{'name': 'Alice', 'age': 28}, {'name': 'Charlie', 'age': 30}]
[{'name': 'Alice', 'age': 28}, {'name': 'Charlie', 'age': 30}]


In [4]:
# Use a list comprehension to flatten the matrix: the outer loop walks the
# rows, the inner loop walks the items of each row.
# (sum(matrix, []) gives the same result here, but it builds a brand-new list
# for every row, which is quadratic in the total number of items.)
matrix = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
flattened = [item for row in matrix for item in row]
print(flattened)

[1, 2, 3, 4, 5, 6, 7, 8, 9]


In [5]:
# Loop with enumerate() to add 5 bonus points to every grade,
# except the 5th grade (index 4), which gets 10 points.
grades = [88, 92, 78, 65, 50, 94]
grades = [
    score + {4: 10}.get(position, 5)  # index 4 -> +10, every other index -> +5
    for position, score in enumerate(grades)
]
print(grades)

[93, 97, 83, 70, 60, 99]


[93, 97, 83, 70, 60, 99]

In [6]:
# Filter elements by their position:
# use a list comprehension with enumerate() to keep the elements at even indexes.
data = [100, 200, 300, 400, 500]
result = [value for index, value in enumerate(data) if index % 2 == 0]
print(result)

[100, 300, 500]


[100, 300, 500]


In [7]:
# Build a dictionary from two parallel lists: zip() pairs each key with the
# value at the same position.
keys = ['name', 'age', 'grade']
values = ['Alice', 25, 'A']
paired = {key: value for key, value in zip(keys, values)}
print(paired)

{'name': 'Alice', 'age': 25, 'grade': 'A'}


{'name': 'Alice', 'age': 25, 'grade': 'A'}


In [17]:
from pprint import pprint

# Sort a list of student dictionaries by age (ascending) with a lambda key.
students = [
    {'name': "John", 'grade': "A", 'age': 20},
    {'name': "Jane", 'grade': "B", 'age': 21},
    {'name': "Joss", 'grade': "A+", 'age': 19},
    {'name': "Jack", 'grade': "A-", 'age': 16},
    {'name': "Dave", 'grade': "C", 'age': 25},
]
# Sort a shallow copy so the original list keeps its order;
# pass reverse=True to .sort() for descending order.
students_sorted = list(students)
students_sorted.sort(key=lambda student: student['age'])
# sort_dicts=False keeps each dictionary's keys in insertion order when printed.
pprint(students_sorted, width=60, sort_dicts=False)

[{'name': 'Jack', 'grade': 'A-', 'age': 16},
 {'name': 'Joss', 'grade': 'A+', 'age': 19},
 {'name': 'John', 'grade': 'A', 'age': 20},
 {'name': 'Jane', 'grade': 'B', 'age': 21},
 {'name': 'Dave', 'grade': 'C', 'age': 25}]


[{'name': 'Jack', 'grade': 'A-', 'age': 16},
 {'name': 'Joss', 'grade': 'A+', 'age': 19},
 {'name': 'John', 'grade': 'A', 'age': 20},
 {'name': 'Jane', 'grade': 'B', 'age': 21},
 {'name': 'Dave', 'grade': 'C', 'age': 25}]

In [18]:
from operator import itemgetter
# Sort by age first; employees with the same age are ordered by salary.
# The lambda returns an (age, salary) tuple, and tuples compare element by element.
employees = [
    {'name': 'Alice', 'age': 30, 'salary': 80000},
    {'name': 'Bob', 'age': 25, 'salary': 50000},
    {'name': 'Charlie', 'age': 35, 'salary': 120000},
]
sorted_employees = sorted(
    employees,
    key=lambda employee: (employee['age'], employee['salary']),
)
print(sorted_employees)

[{'name': 'Bob', 'age': 25, 'salary': 50000}, {'name': 'Alice', 'age': 30, 'salary': 80000}, {'name': 'Charlie', 'age': 35, 'salary': 120000}]


[{'name': 'Bob', 'age': 25, 'salary': 50000}, {'name': 'Alice', 'age': 30, 'salary': 80000}, {'name': 'Charlie', 'age': 35, 'salary': 120000}]


In [19]:
# Generators are highly useful in data-heavy applications:

# Reading Large Files: Use generators to read large files line by line without loading the entire file into memory.
# Data Streaming: Stream data entries for real-time data processing.
# Large Calculations: Break down massive calculations into smaller, more manageable chunks.
import random
import time
from datetime import datetime

# Reading Large Files: 
def reading_large(path, encoding='utf-8-sig'):
    """Lazily yield the lines of a text file, one at a time.

    The file is iterated line by line, so it is never loaded into memory as
    a whole.

    Args:
        path: Path of the text file to read.
        encoding: Text encoding used to decode the file. The default
            'utf-8-sig' reads UTF-8 and drops a leading byte-order mark, so
            the BOM does not end up glued to the first line.

    Yields:
        str: Each line with leading and trailing whitespace removed.
    """
    with open(path, encoding=encoding) as file:
        for line in file:
            yield line.strip()

def process_line(line):
    """Print a short progress message showing the first 15 characters of ``line``."""
    preview = line[:15]
    print("processing: {}...".format(preview))

# Feed the CSV through the generator so each line is handled as soon as it is
# read, without first collecting the lines into a list.
csv_path = '1026__Screen_Observations__daily.csv'
for line in reading_large(csv_path):
    process_line(line)

# Data Streaming:
def reading_sensor():
    """Produce one simulated sensor reading.

    Returns:
        dict with three keys:
            'timestamp'   - current local time as an ISO-8601 string,
            'temperature' - random value from 18.0-32.0 rounded to an int (Celsius),
            'humidity'    - random value from 30-90 rounded to an int (percentage).
    """
    # Values are computed in the same order as the dictionary keys so the
    # random draws happen temperature first, humidity second.
    stamp = datetime.now().isoformat()
    celsius = round(random.uniform(18.0, 32.0))
    humidity_pct = round(random.uniform(30, 90))
    return {
        'timestamp': stamp,
        'temperature': celsius,
        'humidity': humidity_pct,
    }

def sensor_data_stream(interval=1):
    """Yield simulated sensor readings forever, pausing between them.

    Args:
        interval: Seconds to sleep between readings. Defaults to 1, the
            previously hard-coded pause; pass 0 to stream without waiting.

    Yields:
        dict: The value returned by ``reading_sensor()``.
    """
    while True:
        yield reading_sensor()
        # The sleep runs when the consumer asks for the next reading, so the
        # very first reading is delivered without any delay.
        time.sleep(interval)

if __name__ == "__main__":
    print("Starting simulation")
    stream = sensor_data_stream()
    # Cap the demo so "Run All" finishes on its own; the previous unbounded
    # loop ran until the kernel was interrupted by hand, which kept every
    # statement after it from executing.
    max_readings = 5

    try:
        # zip() stops as soon as range() is exhausted, so exactly
        # max_readings values are pulled from the endless stream.
        for _, reading in zip(range(max_readings), stream):
            print(
                f"\n• Time: {reading['timestamp']}\n"
                f"• Temp: {reading['temperature']}°C\n"
                f"• Humidity: {reading['humidity']}%"
            )
    except KeyboardInterrupt:
        # Still allow stopping early with a manual interrupt.
        print("\nStream stopped")

# Large Calculations:

def growing_sequence(chunk_size=1000):
    """Yield an endless integer sequence in lists of ``chunk_size`` values.

    Starting from 1, each step doubles the current value while it is below
    10**100 and floor-divides it by 3 otherwise; every new value is appended
    to the chunk being built.

    The generator never finishes on its own: right after a division the
    value is still at least 10**100 // 3, so it can never fall back to 1.
    Callers must stop iterating themselves (e.g. with ``break``).

    Args:
        chunk_size: Number of values in each yielded list; must be >= 1.

    Yields:
        list[int]: The next ``chunk_size`` consecutive values of the sequence.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1. Because this is a
            generator, the error surfaces on the first iteration.
    """
    # A chunk length below 1 is never reached, which would spin forever
    # without yielding while the pending chunk keeps growing in memory.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size!r}")

    limit = 10**100  # computed once instead of on every step
    current = 1
    chunk = []
    while True:
        current = current * 2 if current < limit else current // 3
        chunk.append(current)
        if len(chunk) == chunk_size:
            yield chunk
            chunk = []  # start a fresh list; the yielded one belongs to the caller

# Preview only the first three chunks: pairing the generator with
# range(1, 4) numbers the chunks from 1 and ends the loop after the third.
for i, chunk in zip(range(1, 4), growing_sequence()):
    print(f"Chunk {i}: First={chunk[0]:,}, Last={chunk[-1]:,}, Len={len(chunk):,}")


processing: ﻿Observation ti...
processing: 1942-03-31T21:0...
processing: 1942-04-01T21:0...
processing: 1942-04-02T21:0...
processing: 1942-04-03T21:0...
processing: 1942-04-04T21:0...
processing: 1942-04-05T21:0...
processing: 1942-04-06T21:0...
processing: 1942-04-07T21:0...
processing: 1942-04-08T21:0...
processing: 1942-04-09T21:0...
processing: 1942-04-10T21:0...
processing: 1942-04-11T21:0...
processing: 1942-04-12T21:0...
processing: 1942-04-13T21:0...
processing: 1942-04-14T21:0...
processing: 1942-04-15T21:0...
processing: 1942-04-16T21:0...
processing: 1942-04-17T21:0...
processing: 1942-04-18T21:0...
processing: 1942-04-19T21:0...
processing: 1942-04-20T21:0...
processing: 1942-04-21T21:0...
processing: 1942-04-22T21:0...
processing: 1942-04-23T21:0...
processing: 1942-04-24T21:0...
processing: 1942-04-25T21:0...
processing: 1942-04-26T21:0...
processing: 1942-04-27T21:0...
processing: 1942-04-28T21:0...
processing: 1942-04-29T21:0...
processing: 1942-04-30T21:0...
processi