In [13]:
import pandas as pd
import numpy as np

# Section 1: Data Preparation
# Task: Create a synthetic dataset with a mix of numerical, categorical, and datetime data.

# A seeded local generator makes the synthetic rows reproducible: re-running
# this cell yields the same frame, so every result derived from it is stable.
SEED = 42
N_ROWS = 100
rng = np.random.default_rng(SEED)

# Note: the draw order below (Category first, then Value) determines which
# random numbers each column receives; keep it stable for reproducibility.
data = {
    'ID': range(1, N_ROWS + 1),                                  # sequential key 1..N_ROWS
    'Category': rng.choice(['A', 'B', 'C'], size=N_ROWS),        # categorical label
    'Value': rng.uniform(10, 100, size=N_ROWS),                  # float drawn from [10, 100)
    'Date': pd.date_range(start='2023-01-01', periods=N_ROWS)    # consecutive daily dates
}

df = pd.DataFrame(data)
print("Sample Data:")
print(df.head())

Sample Data:
   ID Category      Value       Date
0   1        A  27.233823 2023-01-01
1   2        C  22.951585 2023-01-02
2   3        C  35.309145 2023-01-03
3   4        A  86.008976 2023-01-04
4   5        B  32.980169 2023-01-05


In [17]:
# Section 2: Data Transformation
# Task: Perform the following transformations:
# 1. Add a new column that categorizes 'Value' into bins: Low (<30), Medium (30-70), High (>70).
# 2. Create a pivot table showing the average 'Value' for each 'Category' and month of 'Date'.

# Adding the 'Value_Category' column
def categorize_value(value, low=30, high=70):
    """Bucket a numeric value into 'Low', 'Medium', or 'High'.

    Args:
        value: The number to classify.
        low: Values strictly below this are 'Low'. Defaults to 30.
        high: Values from ``low`` up to and including this are 'Medium';
            anything greater is 'High'. Defaults to 70.

    Returns:
        'Low', 'Medium', or 'High', or None when ``value`` is missing
        (None / NaN / NaT).
    """
    # NaN fails every comparison, so without this guard a missing value would
    # fall through both checks and be mislabelled as 'High'.
    if pd.isna(value):
        return None
    if value < low:
        return 'Low'
    if value <= high:
        return 'Medium'
    return 'High'
# Label every row with its bin by running the categorizer over the 'Value' column.
value_labels = df['Value'].apply(categorize_value)
df['Value_Category'] = value_labels

# Pivot: one row per category, one column per calendar month taken from 'Date';
# each cell is the mean 'Value', and combinations with no rows are shown as 0.
month_of_date = df['Date'].dt.month
pivot_table = pd.pivot_table(
    df,
    values='Value',
    index='Category',
    columns=month_of_date,
    aggfunc='mean',
    fill_value=0,
)

print("\nPivot Table:")
print(pivot_table)


Pivot Table:
Date              1          2          3          4
Category                                            
A         62.120115  56.890140  61.767726  52.847438
B         66.527037  58.129378  45.954888  59.712109
C         62.793077  55.797798  61.731941  63.717811


In [20]:
# Section 3: Advanced Data Analysis
# Task: Perform advanced analysis to:
# 1. Identify the top 3 dates with the highest average 'Value' for each category.
# 2. Calculate the cumulative sum of 'Value' for each category over time.

# Mean 'Value' per (Category, Date), ordered by category and then by that mean
# from highest to lowest, so the first rows of each category are its best dates.
top_dates = (
    df.groupby(['Category', 'Date'])['Value']
    .mean()
    .reset_index()
    .sort_values(['Category', 'Value'], ascending=[True, False])
)
# The first three rows of each category group are its three highest averages.
top_3_dates = top_dates.groupby('Category').head(3)

print("\nTop 3 Dates with Highest Average Value for Each Category:")
print(top_3_dates)

# Running total of 'Value' within each category, accumulated in the frame's
# current row order.
values_by_category = df.groupby('Category')['Value']
df['Cumulative_Value'] = values_by_category.cumsum()

print("\nData with Cumulative Sum:")
print(df[['ID', 'Category', 'Value', 'Cumulative_Value']].head(10))


Top 3 Dates with Highest Average Value for Each Category:
   Category       Date      Value
3         A 2023-01-14  99.871627
34        A 2023-03-31  97.032069
32        A 2023-03-26  95.958277
57        B 2023-03-01  95.513430
68        B 2023-04-04  94.774351
50        B 2023-02-16  93.642752
82        C 2023-02-06  99.975870
83        C 2023-02-07  98.823185
71        C 2023-01-06  96.173516

Data with Cumulative Sum:
   ID Category      Value  Cumulative_Value
0   1        A  27.233823         27.233823
1   2        C  22.951585         22.951585
2   3        C  35.309145         58.260731
3   4        A  86.008976        113.242799
4   5        B  32.980169         32.980169
5   6        C  96.173516        154.434247
6   7        C  72.491735        226.925982
7   8        B  92.117814        125.097983
8   9        C  70.435996        297.361978
9  10        A  20.372385        133.615184


In [1]:
# Keep only the records whose "age" is greater than 25.
data = [
    {"name": "Alice", "age": 28},
    {"name": "Bob", "age": 24},
    {"name": "Charlie", "age": 30},
]
extracted = list(filter(lambda entry: entry["age"] > 25, data))
print(extracted)

[{'name': 'Alice', 'age': 28}, {'name': 'Charlie', 'age': 30}]


In [3]:
# Flatten the nested list row by row with a two-level list comprehension:
# the outer loop visits each row, the inner loop visits each cell in that row.
matrix = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
]
flattened = [cell for row in matrix for cell in row]
print(flattened)

[1, 2, 3, 4, 5, 6, 7, 8, 9]


In [4]:
import numpy as np
# Same flattening with NumPy: build an array from the nested list, then
# collapse it into a one-dimensional copy.
matrix_array = np.array(matrix)
res = matrix_array.flatten()
print(res)

[1 2 3 4 5 6 7 8 9]


In [5]:
# Walk the grades with enumerate(): every grade gains 5 points, except the
# fifth one (index 4), which gains 10. Assigning by index updates the list
# in place while iterating.
grades = [88, 92, 78, 65, 50, 94]
for position, score in enumerate(grades):
    bonus = 10 if position == 4 else 5
    grades[position] = score + bonus
print(grades)

[93, 97, 83, 70, 60, 99]


In [7]:
# Select elements by position: pair each element with its 0-based index via
# enumerate() and keep only those at even indices (0, 2, 4, ...).
data = [100, 200, 300, 400, 500]
new_list = [value for index, value in enumerate(data) if index % 2 == 0]
print(new_list)

[100, 300, 500]


In [8]:
# Build a dictionary by pairing each key with the value at the same position;
# zip() yields the (key, value) pairs.
keys = ['name', 'age', 'grade']
values = ['Alice', 25, 'A']
dic = {key: value for key, value in zip(keys, values)}
print(dic)

{'name': 'Alice', 'age': 25, 'grade': 'A'}


In [1]:
# Order the list of student records by ascending age, using a lambda as the
# sort key.
students = [
    {'name': "John", 'grade': "A", 'age': 20},
    {'name': "Jane", 'grade': "B", 'age': 21},
    {'name': "Joss", 'grade': "A+", 'age': 19},
    {'name': "Jack", 'grade': "A-", 'age': 16},
    {'name': "Dave", 'grade': "C", 'age': 25},
]
students = sorted(students, key=lambda student: student["age"])
print(students)

[{'name': 'Jack', 'grade': 'A-', 'age': 16}, {'name': 'Joss', 'grade': 'A+', 'age': 19}, {'name': 'John', 'grade': 'A', 'age': 20}, {'name': 'Jane', 'grade': 'B', 'age': 21}, {'name': 'Dave', 'grade': 'C', 'age': 25}]


In [2]:
# Order employees by age; the lambda returns an (age, salary) tuple so that
# employees of equal age are further ordered by salary.
employees = [
    {'name': 'Alice', 'age': 30, 'salary': 80000},
    {'name': 'Bob', 'age': 25, 'salary': 50000},
    {'name': 'Charlie', 'age': 35, 'salary': 120000},
]
employees = sorted(
    employees,
    key=lambda employee: (employee["age"], employee['salary']),
)
print(employees)

[{'name': 'Bob', 'age': 25, 'salary': 50000}, {'name': 'Alice', 'age': 30, 'salary': 80000}, {'name': 'Charlie', 'age': 35, 'salary': 120000}]


In [45]:
# Generators are highly useful in data-heavy applications:

# Reading Large Files: Use generators to read large files line by line without loading the entire file into memory.
# Data Streaming: Stream data entries for real-time data processing.
# Large Calculations: Break down massive calculations into smaller, more manageable chunks.