In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings

from IPython.display import display

import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
# Set visualization styles
plt.style.use('fivethirtyeight')
sns.set(style="darkgrid")
%matplotlib inline


In [5]:
#---------- 1. BASICS ----------#

# A Series is a one-dimensional array of values paired with an index of labels
s = pd.Series(data=[1, 2, 3, 4], index=list('abcd'))
print("Series example:", s, sep="\n")

Series example:
a    1
b    2
c    3
d    4
dtype: int64


In [24]:

# Build a DataFrame from a dict that maps each column name to its list of values
data = dict(
    Name=['John', 'Anna', 'Peter', 'Linda'],
    Age=[28, 34, 29, 42],
    City=['New York', 'Paris', 'Berlin', 'London'],
    Salary=[50000, 60000, 55000, 70000],
)
df = pd.DataFrame(data=data)
print("\nDataFrame example:")
display(df)



DataFrame example:


Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,50000
1,Anna,34,Paris,60000
2,Peter,29,Berlin,55000
3,Linda,42,London,70000


In [None]:
from io import StringIO

# Raw CSV text: one header row followed by eight product rows.
# strip() removes the newlines that the triple-quoted literal adds at both ends.
csvdata = """
product_id,product_name,category,price,in_stock
1001,Laptop,Electronics,1200.50,True
1002,Desk Chair,Furniture,150.75,True
1003,Coffee Maker,Appliances,89.99,True
1004,Wireless Mouse,Electronics,24.99,False
1005,Bookshelf,Furniture,175.00,True
1006,Blender,Appliances,49.95,False
1007,Monitor,Electronics,299.99,True
1008,Office Desk,Furniture,225.50,True
""".strip()

# Wrap the text in an in-memory, file-like buffer so pandas can read it like a file
simulated_file = StringIO(initial_value=csvdata)
print(csvdata)

# Parse the buffer exactly as a CSV file on disk would be parsed
df_products = pd.read_csv(simulated_file)
print("\nSimulated CSV file loaded as DataFrame:")
display(df_products)

# Other readers follow the same pattern for real data sources:
# df = pd.read_csv('data.csv')
# df = pd.read_excel('data.xlsx')
# df = pd.read_json('data.json')
# df = pd.read_sql('SELECT * FROM table', connection)



# 2. DATA INSPECTION 

In [8]:
# Show the people DataFrame again before inspecting its properties
display(df)

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,50000
1,Anna,34,Paris,60000
2,Peter,29,Berlin,55000
3,Linda,42,London,70000


In [12]:

# Basic properties
# df.shape is a (rows, columns) tuple, so it can be unpacked directly
n_rows, n_cols = df.shape
print("\nBasic DataFrame properties:")
print(f"Shape: {(n_rows, n_cols)}")

# Reminder of Python's built-in container literals:
# [4,5,6,] - list
# (4,5,6) - tuple
# {4,5,6} - set
# {'key':4, 'key2': 5, 'key3': 6} - dict

# rows x columns = total number of cells in the frame
print(f"we have {n_rows * n_cols} number of values")




Basic DataFrame properties:
Shape: (4, 4)
we have 16 number of values


In [None]:
# Get information about the DataFrame
print("\nDataFrame info:")
# info() writes its report (index range, per-column non-null counts and dtypes,
# memory usage) straight to stdout, so it is not wrapped in print/display.
df.info()


DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    4 non-null      object
 1   Age     4 non-null      int64 
 2   City    4 non-null      object
 3   Salary  4 non-null      int64 
dtypes: int64(2), object(2)
memory usage: 260.0+ bytes


In [16]:

print(f"Column names: {df.columns.tolist()}")
print(df, end="\n\n")

# Work on an independent copy so that relabelling it leaves df's own column names alone
spanish_columns = ['Nombre', 'Edad', 'Ciudad', 'Salario']
df_in_espanol = df.copy()
df_in_espanol.columns = spanish_columns  # positional: one new label per existing column
print(f"Column names: {df_in_espanol.columns.tolist()}")
# Print both frames to compare the relabelled copy with the original
print(df_in_espanol, df, sep="\n")

Column names: ['Nombre', 'Edad', 'Ciudad', 'Salario']
  Nombre  Edad    Ciudad  Salario
0   John    28  New York    50000
1   Anna    34     Paris    60000
2  Peter    29    Berlin    55000
3  Linda    42    London    70000

Column names: ['Nombre', 'Edad', 'Ciudad', 'Salario']
  Nombre  Edad    Ciudad  Salario
0   John    28  New York    50000
1   Anna    34     Paris    60000
2  Peter    29    Berlin    55000
3  Linda    42    London    70000
  Nombre  Edad    Ciudad  Salario
0   John    28  New York    50000
1   Anna    34     Paris    60000
2  Peter    29    Berlin    55000
3  Linda    42    London    70000


In [17]:
row_labels = df.index.tolist()
print(f"Row names: {row_labels}")
print(df, end="\n\n")

# Assigning to .index relabels the rows of the existing frame (one label per row)
df_in_espanol.index = ['uno', 'dos', 'tres', 'cuatro']
print(df_in_espanol)

Row names: [0, 1, 2, 3]
  Nombre  Edad    Ciudad  Salario
0   John    28  New York    50000
1   Anna    34     Paris    60000
2  Peter    29    Berlin    55000
3  Linda    42    London    70000

       Nombre  Edad    Ciudad  Salario
uno      John    28  New York    50000
dos      Anna    34     Paris    60000
tres    Peter    29    Berlin    55000
cuatro  Linda    42    London    70000


In [18]:
# df.dtypes lists the data type of every column
print(f"Data types:\n{df.dtypes}")


Data types:
Nombre     object
Edad        int64
Ciudad     object
Salario     int64
dtype: object


In [20]:
# Peek at both ends of the frame: head(n) gives the first n rows, tail(n) the last n
first_rows, last_rows = df.head(2), df.tail(2)
print("\nFirst 2 rows:", first_rows, sep="\n")
print("\nLast 2 rows:", last_rows, sep="\n")


First 2 rows:
  Nombre  Edad    Ciudad  Salario
0   John    28  New York    50000
1   Anna    34     Paris    60000

Last 2 rows:
  Nombre  Edad  Ciudad  Salario
2  Peter    29  Berlin    55000
3  Linda    42  London    70000


In [21]:

# Summary statistics
# Show the data first, then the count/mean/std/min/quartiles/max table that describe() builds
display(df)
numeric_summary = df.describe()
print("\nSummary statistics:")
display(numeric_summary)


Unnamed: 0,Nombre,Edad,Ciudad,Salario
0,John,28,New York,50000
1,Anna,34,Paris,60000
2,Peter,29,Berlin,55000
3,Linda,42,London,70000



Summary statistics:


Unnamed: 0,Edad,Salario
count,4.0,4.0
mean,33.25,58750.0
std,6.396614,8539.125638
min,28.0,50000.0
25%,28.75,53750.0
50%,31.5,57500.0
75%,36.0,62500.0
max,42.0,70000.0


In [25]:
# Reusable selections: two numeric columns, and a row mask for two cities
age_and_salary = df[["Age", "Salary"]]
in_ny_or_paris = df["City"].isin(["New York", "Paris"])

# describe() on a single column (a Series) returns a Series of statistics
print("\nSummary statistics for specific column:")
display(df["Age"].describe())

# describe() on a column subset (a DataFrame) returns one statistics column per input column
print("\nSummary statistics for specific columnS:")
display(age_and_salary.describe())

# rename() with a mapping returns a relabelled frame; df itself is not modified
print("\nSummary statistics for specific columnS with custom names:")
df_renamed = age_and_salary.rename(columns={"Age": "Edad", "Salary": "Salario"})
display(df_renamed.describe())

# Filter the rows first, then summarise only what is left
print("\nSummary statistics for cities New York and Paris:")
display(df[in_ny_or_paris].describe())


Summary statistics for specific column:


count     4.000000
mean     33.250000
std       6.396614
min      28.000000
25%      28.750000
50%      31.500000
75%      36.000000
max      42.000000
Name: Age, dtype: float64


Summary statistics for specific columnS:


Unnamed: 0,Age,Salary
count,4.0,4.0
mean,33.25,58750.0
std,6.396614,8539.125638
min,28.0,50000.0
25%,28.75,53750.0
50%,31.5,57500.0
75%,36.0,62500.0
max,42.0,70000.0



Summary statistics for specific columnS with custom names:


Unnamed: 0,Edad,Salario
count,4.0,4.0
mean,33.25,58750.0
std,6.396614,8539.125638
min,28.0,50000.0
25%,28.75,53750.0
50%,31.5,57500.0
75%,36.0,62500.0
max,42.0,70000.0



Summary statistics for cities New York and Paris:


Unnamed: 0,Age,Salary
count,2.0,2.0
mean,31.0,55000.0
std,4.242641,7071.067812
min,28.0,50000.0
25%,29.5,52500.0
50%,31.0,55000.0
75%,32.5,57500.0
max,34.0,60000.0


In [26]:

# Info method: reports the index range, each column's non-null count and dtype, and memory usage
print("\nDataFrame info:")
df.info() # info() prints its report itself, so wrapping it in print/display is unnecessary



DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    4 non-null      object
 1   Age     4 non-null      int64 
 2   City    4 non-null      object
 3   Salary  4 non-null      int64 
dtypes: int64(2), object(2)
memory usage: 260.0+ bytes



# 3. DATA SELECTION


In [27]:

# Column selection: single brackets with one label return that column as a Series
name_column = df['Name']
print("\nSelecting a column:")
print(type(name_column))  # Series
print(name_column)



Selecting a column:
<class 'pandas.core.series.Series'>
0     John
1     Anna
2    Peter
3    Linda
Name: Name, dtype: object


In [None]:

# Multiple columns: indexing with a list of labels (double brackets) returns a DataFrame
name_and_age = df[['Name', 'Age']]
print("\nSelecting multiple columns:")
print(type(name_and_age))  # DataFrame
# Display the same two-column selection whose type was just printed
display(name_and_age)
# A one-element list still goes through the list-of-labels path, unlike df['Name']
display(df[['Name']])


In [37]:

# Row selection with iloc (integer-based)
# iloc selects rows and columns by integer position,
# counting from the top/left of the frame
# regardless of what the index labels are
# note: positions are zero-based and the end of a slice is excluded
# iloc[0:2] selects the rows at positions 0 and 1
print("\nSelecting rows with iloc:")
display(df.iloc[0:2])  # First two rows



Selecting rows with iloc:


Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,50000
1,Anna,34,Paris,60000


In [36]:

# Row selection with loc (label-based)
# loc selects rows and columns by their labels
# (index labels for rows, column names for columns)
# rather than by position
# Note: unlike iloc, a loc slice includes its end label
# so df.loc[0:2] will include rows with index 0, 1, and 2
print("\nSelecting rows with loc:")
display(df.loc[0:2, ["Age", "City"]])  # Rows labelled 0, 1, 2; only the Age and City columns



Selecting rows with loc:


Unnamed: 0,Age,City
0,28,New York
1,34,Paris
2,29,Berlin


In [None]:

# Conditional selection
# The comparison produces one True/False per row; indexing with it keeps the True rows
over_30 = df['Age'] > 30
print("\nConditional selection:")
display(df[over_30])


In [None]:

# Conditional selection with multiple conditions
# Boolean masks combine element-wise: & is AND, | is OR, ~ is NOT
over_30 = df['Age'] > 30
in_paris = df['City'] == 'Paris'

print("\nConditional selection with multiple conditions:")
display(df[over_30 & in_paris])  # both conditions hold
display(df[over_30 | in_paris])  # at least one condition holds
display(df[~over_30])            # negation: rows where Age > 30 is False


In [None]:

# Conditional selection with isin
# isin() is True for rows whose value appears in the given list
in_ny_or_berlin = df['City'].isin(['New York', 'Berlin'])
print("\nConditional selection with isin:")
display(df[in_ny_or_berlin])
display(df[~in_ny_or_berlin])  # Not in the list


In [None]:

# Conditional selection with between
# between(30, 40) builds a True/False mask from a lower and an upper bound
age_30_to_40 = df['Age'].between(30, 40)
print("\nConditional selection with between:")
display(df[age_30_to_40])   # Age between 30 and 40
display(df[~age_30_to_40])  # Not between 30 and 40


In [None]:

# Conditional selection with query
# query() takes the filter as a string in which column names are written bare;
# the keywords and / or can be used as well as & and |,
# and != expresses "not equal" without needing ~
# Note: query is a bit slower than the above methods
print("\nConditional selection with query:")
query_expressions = [
    'Age > 30 & City == "Paris"',
    'Age > 30 | City == "Paris"',
    'Age > 30 and City == "Paris"',  # Logical AND
    'Age > 30 or City == "Paris"',   # Logical OR
    'Age > 30 and City != "Paris"',  # Logical AND with NOT
]
for expression in query_expressions:
    print(df.query(expression))



# 4. DATA CLEANING


In [None]:

# Create a dataframe with missing values (np.nan marks each missing cell)
df_missing = pd.DataFrame(dict(
    A=[1, 2, np.nan, 4],
    B=[5, np.nan, np.nan, 8],
    C=[9, 10, 11, 12],
))


In [None]:

# Show the frame that contains the np.nan entries, before any cleaning is applied
print("\nDataFrame with missing values:")
display(df_missing)


In [None]:

# Check for missing values
# isnull() flags every missing cell; summing the flags counts them per column
missing_per_column = df_missing.isnull().sum()
print("\nMissing value check:")
print(missing_per_column)


In [None]:

# Fill missing values
# Each strategy returns a new frame; df_missing itself is not modified
fill_demos = (
    ("\nFilling missing values:", df_missing.fillna(0)),  # Fill with 0
    ("\nForward fill:", df_missing.ffill()),              # Forward fill
    ("\nBackward fill:", df_missing.bfill()),             # Backward fill
)
for heading, filled in fill_demos:
    print(heading)
    print(filled)


In [None]:

# Drop missing values
# dropna() works on rows by default; axis=1 switches it to columns
rows_without_nan = df_missing.dropna()
columns_without_nan = df_missing.dropna(axis=1)
print("\nDropping rows with any missing values:", rows_without_nan, sep="\n")
print("\nDropping columns with any missing values:", columns_without_nan, sep="\n")


In [None]:

# Removing duplicates
# Rows 0 and 1 are identical (A=1, B=5), so one of them is redundant
df_dup = pd.DataFrame(dict(A=[1, 1, 2, 3], B=[5, 5, 7, 8]))
deduplicated = df_dup.drop_duplicates()

print("\nDataFrame with duplicates:")
display(df_dup)
print("\nAfter removing duplicates:")
display(deduplicated)



# 5. DATA MANIPULATION


In [None]:

# Rebuild the people DataFrame so this section starts from the four original columns
data = dict(
    Name=['John', 'Anna', 'Peter', 'Linda'],
    Age=[28, 34, 29, 42],
    City=['New York', 'Paris', 'Berlin', 'London'],
    Salary=[50000, 60000, 55000, 70000],
)
df = pd.DataFrame(data=data)
display(df)

In [None]:

# Add a new column
# Assigning a list to a new column label adds that column (one value per row)
manual_categories = ['Medium', 'High', 'Medium', 'High']
df['Income_Category'] = manual_categories
print("\nAdding a new column:")
display(df)


In [None]:
# Adding a new column using a function
def categorize_income(income, medium_limit=60000, high_limit=70000):
    """Map a single income value to an income-category label.

    Parameters
    ----------
    income : int or float
        The income to classify. A missing value (None / NaN) is allowed.
    medium_limit : int or float, default 60000
        Incomes strictly below this limit are labelled 'Medium'.
    high_limit : int or float, default 70000
        Incomes from ``medium_limit`` up to, but excluding, this limit are
        labelled 'High'; anything at or above it is 'Very High'.

    Returns
    -------
    str or None
        'Medium', 'High' or 'Very High'; None when ``income`` is missing.
    """
    # NaN compares False against every limit, so without this guard a missing
    # income would fall through both checks and be labelled 'Very High'.
    if pd.isna(income):
        return None
    if income < medium_limit:
        return 'Medium'
    if income < high_limit:
        return 'High'
    return 'Very High'
    
# Run every salary through categorize_income and store the labels as the column
salary_categories = df['Salary'].apply(categorize_income)
df['Income_Category'] = salary_categories
print("\nAdding a new column using a function:")
display(df)

In [None]:

# Sorting
# sort_values orders the rows by the given column, smallest value first by default
youngest_first = df.sort_values(by='Age')
print("\nSorting by Age (ascending):")
display(youngest_first)


In [None]:
# ascending=False reverses the order: largest Age first
oldest_first = df.sort_values(by='Age', ascending=False)
print("\nSorting by Age (descending):")
display(oldest_first)


In [None]:
# With a list of columns, rows are ordered by the first column and ties are broken by the next
sort_keys = ['Income_Category', 'Age']
print("\nSorting by multiple columns:")
display(df.sort_values(by=sort_keys))


In [None]:

# Grouping and aggregation
# Split the rows by income category, then average the numeric columns within each group
by_income_category = df.groupby('Income_Category')
print("\nGrouping by Income Category (mean):")
display(by_income_category[['Age', 'Salary']].mean())


In [None]:
# agg() accepts a column -> aggregation(s) mapping; a list requests several statistics for that column
aggregations = {
    'Age': 'mean',
    'Salary': ['min', 'max', 'mean'],
}
print("\nGrouping with multiple aggregations:")
display(df.groupby('Income_Category').agg(aggregations))


In [None]:

# Pivot tables
# Rows = income category, columns = city, cells = mean salary for that combination
print("\nPivot table example:")
pivot = df.pivot_table(
    values='Salary',
    index='Income_Category',
    columns='City',
    aggfunc='mean',
)
display(pivot)


In [None]:
# Pivot table with multiple values
# Passing a list for values aggregates Age and Salary in the same table
print("\nPivot table with multiple values:")
pivot_multi = df.pivot_table(
    values=['Age', 'Salary'],
    index='Income_Category',
    columns='City',
    aggfunc='mean',
)
display(pivot_multi)


# 6. DATA ANALYSIS


In [None]:

# Apply functions
def _age_group(age):
    """Label an age as 'Young' when it is under 30, otherwise 'Older'."""
    return 'Young' if age < 30 else 'Older'

# apply() calls the function once per value and collects the results
print("\nApplying a function to a column:")
display(df['Age'].apply(_age_group))


In [None]:

# Create sample time series data
# Six consecutive daily timestamps starting 2021-01-01 form the index
dates = pd.date_range('20210101', periods=6)
# Draw from a locally seeded generator so the table is identical on every run
# and NumPy's global random state is left untouched.
rng = np.random.default_rng(42)
ts_df = pd.DataFrame(rng.standard_normal((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
print("\nTime series data:")
display(ts_df)


In [None]:

# Resampling
# Downsample the daily rows into weekly bins ('W') and average the values that fall in each bin
weekly_mean = ts_df.resample('W').mean()
print("\nResampling (daily to weekly):")
print(weekly_mean)


In [None]:

# Rolling statistics
# A window of 2 averages each row together with the row before it
two_day_mean = ts_df.rolling(window=2).mean()
print("\nRolling mean (window=2):")
display(two_day_mean)


In [178]:
# Headline counts for the products table
n_products = len(df_products)
n_in_stock = df_products['in_stock'].sum()  # summing booleans counts the True values

display(df_products.head())
print(f"Total number of products: {n_products}")
print(f"Total number of categories: {df_products['category'].nunique()}")
print(f"Total number of products in stock: {n_in_stock}")
print(f"Total number of products out of stock: {n_products - n_in_stock}")


Unnamed: 0,product_id,product_name,category,price,in_stock
0,1001,Laptop,Electronics,1200.5,True
1,1002,Desk Chair,Furniture,150.75,True
2,1003,Coffee Maker,Appliances,89.99,True
3,1004,Wireless Mouse,Electronics,24.99,False
4,1005,Bookshelf,Furniture,175.0,True


Total number of products: 5
Total number of categories: 3
Total number of products in stock: 4
Total number of products out of stock: 1


In [181]:
# Display the original products DataFrame, i.e. before any grouping is applied to it
display(df_products)


Unnamed: 0,product_id,product_name,category,price,in_stock
0,1001,Laptop,Electronics,1200.5,True
1,1002,Desk Chair,Furniture,150.75,True
2,1003,Coffee Maker,Appliances,89.99,True
3,1004,Wireless Mouse,Electronics,24.99,False
4,1005,Bookshelf,Furniture,175.0,True


In [184]:

# Count how many products fall into each category.
# Selecting one column from the groupby before aggregating gives a Series:
# the categories become its index and 'product_id' becomes its name.
products_by_category = df_products.groupby('category')
grouped = products_by_category['product_id'].count()
print("Grouped result (without reset_index):", grouped, type(grouped), sep="\n")  # type is Series

Grouped result (without reset_index):
category
Appliances     1
Electronics    2
Furniture      2
Name: product_id, dtype: int64
<class 'pandas.core.series.Series'>


In [185]:

# Try to filter categories with at least 2 products
# 'grouped' is a Series, so grouped['product_id'] is a lookup of the index label
# 'product_id' rather than a column selection. The index holds category names,
# so the lookup fails; the exception is caught and printed on purpose to show the problem.
try:
    filtered = grouped[grouped['product_id'] >= 2]
except Exception as e:
    print("\n❌ Error: Cannot use 'product_id' as a column here because it's now a Series:")
    print(e)



❌ Error: Cannot use 'product_id' as a column here because it's now a Series:
'product_id'


In [None]:

# Now fix it with reset_index()
# reset_index() turns the Series into a DataFrame: the 'category' index
# becomes an ordinary column and the counts sit in a 'product_id' column,
# which makes column-based filtering possible
grouped_fixed = grouped.reset_index()
print("\nGrouped result (with reset_index):", grouped_fixed, sep="\n")



Grouped result (with reset_index):
      category  product_id
0   Appliances           1
1  Electronics           2
2    Furniture           2


In [187]:

# Filtering works now: 'product_id' is a real column of grouped_fixed
has_two_or_more = grouped_fixed['product_id'] >= 2
filtered_fixed = grouped_fixed[has_two_or_more]
print("\n✅ Filtered result (with reset_index):")
print(filtered_fixed)


✅ Filtered result (with reset_index):
      category  product_id
1  Electronics           2
2    Furniture           2



# 7. VISUALIZATION

In [None]:

# Basic plotting
display(df_products.head(10))

print("\nCreating some visualizations...")
# Example: average product price per category as a bar chart, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(12, 5))
mean_price_by_category = df_products.groupby('category')['price'].mean()
mean_price_by_category.plot(kind='bar', ax=ax, title='Average Product Price by Category')
ax.set_ylabel('Average Price')
ax.set_xlabel('Category')
plt.tight_layout()
plt.show()


In [None]:

# Sample data for visualization
# Seed NumPy's global generator so the random columns are the same on every run
np.random.seed(42)
n_points = 40
# Value1 is drawn before Value2, so each column receives the same part of the random stream
value1 = np.random.normal(0, 1, n_points)
value2 = np.random.normal(5, 1, n_points)
data = {
    'Category': ['A', 'B'] * (n_points // 2),  # alternating A, B, A, B, ...
    'Value1': value1,
    'Value2': value2,
    'Date': pd.date_range('2021-01-01', periods=n_points),
}

viz_df = pd.DataFrame(data=data)
display(viz_df.head(10))

In [None]:

# Two panels side by side: mean Value2 per category (bar) and Value2 over time (line)
fig, (ax_bar, ax_line) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: group by 'Category', average 'Value2', one bar colour per category
mean_value2_by_category = viz_df.groupby('Category')['Value2'].mean()
mean_value2_by_category.plot(kind='bar', color=['#1f77b4', '#ff7f0e'], ax=ax_bar)
ax_bar.set_title('Average Value2 by Category')
ax_bar.set_ylabel('Average Value2')
ax_bar.set_xlabel('Category')

# Right panel: Value2 plotted against Date as a line
viz_df.plot(x='Date', y='Value2', ax=ax_line, legend=False, title='Value2 Over Time')
ax_line.set_ylabel('Value2')
ax_line.set_xlabel('Date')

plt.tight_layout()
plt.show()


In [None]:

# Seaborn plots
# Two panels: Value1 spread per category (box plot) and the Value2 distribution (histogram + KDE)
fig, (ax_box, ax_hist) = plt.subplots(1, 2, figsize=(14, 6))

sns.boxplot(x='Category', y='Value1', data=viz_df, ax=ax_box)
ax_box.set_title('Box Plot by Category')

sns.histplot(data=viz_df, x='Value2', kde=True, ax=ax_hist)
ax_hist.set_title('Value Distribution')

plt.tight_layout()
plt.show()


In [None]:

# Scatter plot of Value1 vs Value2, coloured by Category, with one regression line on top
fig, ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(data=viz_df, x='Value1', y='Value2', hue='Category', ax=ax)
ax.set_title('Scatter Plot by Category')
# scatter=False draws only the fitted line, so the points are not plotted a second time
sns.regplot(data=viz_df, x='Value1', y='Value2', scatter=False, ax=ax)
plt.show()


In [None]:

# Example dashboard: a 2x2 grid of Axes, one visualization per cell
# (change the first two arguments of plt.subplots to get a different grid)
fig, axs = plt.subplots(2, 2, figsize=(14, 10))
(ax_dist, ax_ts), (ax_bar, ax_corr) = axs  # top-left, top-right, bottom-left, bottom-right

# Title for the whole figure
fig.suptitle('Sample Dashboard', fontsize=16)

# 1. Top-left: distribution of Value1 (histogram with a KDE curve)
sns.histplot(viz_df['Value1'], ax=ax_dist, kde=True)
ax_dist.set_title('Distribution')

# 2. Top-right: Value2 over time
viz_df.plot(x='Date', y='Value2', ax=ax_ts, title='Time Series')

# 3. Bottom-left: average Value1 per category, for comparing the categories
mean_value1_by_category = viz_df.groupby('Category')['Value1'].mean()
mean_value1_by_category.plot(kind='bar', ax=ax_bar, title='Category Comparison')

# 4. Bottom-right: heatmap of the correlation between Value1 and Value2
corr = viz_df[['Value1', 'Value2']].corr()
sns.heatmap(corr, annot=True, cmap='viridis', ax=ax_corr)
ax_corr.set_title('Correlation')

plt.tight_layout()
plt.show()