# Pandas

### Import Pandas

In [1]:
import pandas as pd
import numpy as np # Often used alongside Pandas

### 1. Series Creation

In [2]:
# 1) Series built from a plain list -- pandas supplies a default 0..n-1 integer index.
list_values = [10, 20, 30, 40, 50]
s = pd.Series(data=list_values)
print(f"Series from list:\n{s}")

# 2) Series built from values plus explicit labels (one label per value).
index_labels = ['a', 'b', 'c']
s_indexed = pd.Series(data=[1, 2, 3], index=index_labels)
print(f"\nSeries with custom index:\n{s_indexed}")

# 3) Series built from a dict -- the keys become the index labels.
data_dict = dict(apple=100, banana=150, orange=80)
s_dict = pd.Series(data=data_dict)
print(f"\nSeries from dictionary:\n{s_dict}")

Series from list:
0    10
1    20
2    30
3    40
4    50
dtype: int64

Series with custom index:
a    1
b    2
c    3
dtype: int64

Series from dictionary:
apple     100
banana    150
orange     80
dtype: int64


### 2. DataFrame Creation


In [3]:
# Creating a DataFrame from a dictionary of lists (most common way).
# Each key becomes a column name and each list supplies that column's values,
# so all lists must have the same length.
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'Age': [24, 27, 22, 32, 29],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Miami'],
    'Salary': [70000, 85000, 60000, 95000, 80000]
}
df = pd.DataFrame(data)
print(f"DataFrame from dictionary of lists:\n{df}")

# Creating a DataFrame from a list of dictionaries (one dictionary per row).
data_rows = [
    {'Name': 'Frank', 'Age': 35, 'City': 'Boston'},
    {'Name': 'Grace', 'Age': 28, 'City': 'Seattle'}
]
df_rows = pd.DataFrame(data_rows)
print(f"\nDataFrame from list of dictionaries:\n{df_rows}")

# Creating a DataFrame from a NumPy array.
# The values come from a seeded Generator so that re-running the notebook
# (Restart Kernel -> Run All) always shows the same numbers.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)
np_array = rng.random((3, 4))  # 3 rows x 4 columns of floats in [0, 1)
df_np = pd.DataFrame(np_array, columns=['ColA', 'ColB', 'ColC', 'ColD'])
print(f"\nDataFrame from NumPy array:\n{df_np}")


DataFrame from dictionary of lists:
      Name  Age         City  Salary
0    Alice   24     New York   70000
1      Bob   27  Los Angeles   85000
2  Charlie   22      Chicago   60000
3    David   32      Houston   95000
4      Eva   29        Miami   80000

DataFrame from list of dictionaries:
    Name  Age     City
0  Frank   35   Boston
1  Grace   28  Seattle

DataFrame from NumPy array:
       ColA      ColB      ColC      ColD
0  0.659843  0.418218  0.927876  0.666191
1  0.007772  0.650945  0.917329  0.735345
2  0.003663  0.128350  0.523684  0.811685


### 3. Data Loading (Example: CSV)

In [4]:
# With an actual file on disk the load would look like this:
#     df_csv = pd.read_csv('your_data.csv')
#     print(f"Loaded DataFrame from CSV (if file existed):\n{df_csv.head()}")

# For the demo, an in-memory text buffer stands in for the file:
# pd.read_csv accepts a file-like object as well as a path.
from io import StringIO

csv_data = (
    "Name,Age,City,Score\n"
    "Alice,25,NYC,90\n"
    "Bob,30,LA,85\n"
    "Charlie,35,CHI,92\n"
    "David,28,NYC,88\n"
)
csv_buffer = StringIO(csv_data)
df_loaded = pd.read_csv(csv_buffer)
print(f"DataFrame loaded from 'CSV' string (StringIO):\n{df_loaded}")

DataFrame loaded from 'CSV' string (StringIO):
      Name  Age City  Score
0    Alice   25  NYC     90
1      Bob   30   LA     85
2  Charlie   35  CHI     92
3    David   28  NYC     88


### 4. Data Inspection

In [5]:
# Show the whole frame first. The frame is interpolated into the string rather
# than passed as a second print() argument: print() puts a space between its
# arguments, which would push the header row one column to the right of the
# data rows.
print(f"Original DataFrame (df):\n{df}")

# head(n) / tail(n): the first / last n rows.
print(f"\nFirst 3 rows (df.head(3)):\n{df.head(3)}")
print(f"\nLast 2 rows (df.tail(2)):\n{df.tail(2)}")

# df.info() prints its own report (column dtypes, non-null counts, memory use),
# so it is called as a statement after the label instead of being interpolated.
print("\nDataFrame Info (df.info()):")
df.info()

# describe() summarises the numerical columns.
print(f"\nDescriptive Statistics (df.describe()):\n{df.describe()}")

# Basic structural attributes.
print(f"\nDataFrame shape (rows, columns): {df.shape}")
print(f"DataFrame columns: {df.columns.tolist()}")
print(f"DataFrame index: {df.index.tolist()}")

Original DataFrame (df):
       Name  Age         City  Salary
0    Alice   24     New York   70000
1      Bob   27  Los Angeles   85000
2  Charlie   22      Chicago   60000
3    David   32      Houston   95000
4      Eva   29        Miami   80000

First 3 rows (df.head(3)):
      Name  Age         City  Salary
0    Alice   24     New York   70000
1      Bob   27  Los Angeles   85000
2  Charlie   22      Chicago   60000

Last 2 rows (df.tail(2)):
    Name  Age     City  Salary
3  David   32  Houston   95000
4    Eva   29    Miami   80000

DataFrame Info (df.info()):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    5 non-null      object
 1   Age     5 non-null      int64 
 2   City    5 non-null      object
 3   Salary  5 non-null      int64 
dtypes: int64(2), object(2)
memory usage: 292.0+ bytes

Descriptive Statistics (df.describe()):
             Ag

### 5. Data Selection (Indexing and Slicing)

In [6]:
# Plain brackets select columns: one label yields a Series,
# a list of labels yields a DataFrame.
name_column = df['Name']
name_and_city = df[['Name', 'City']]
print(f"Accessing a single column ('Name'):\n{name_column}")
print(f"\nAccessing multiple columns (['Name', 'City']):\n{name_and_city}")

# .loc selects by label; both ends of a label slice are included.
rows_by_label = df.loc[[0, 2]]
city_of_row_1 = df.loc[1, 'City']
label_slice = df.loc[0:2, 'Age':'City']
print(f"\nAccessing rows by label (index 0 and 2 using .loc):\n{rows_by_label}")
print(f"\nAccessing specific cell (row 1, 'City' using .loc): {city_of_row_1}")
print(f"\nSlicing rows and columns using .loc (rows 0-2, 'Age' to 'City'):\n{label_slice}")

# .iloc selects by integer position; the end of a positional slice is excluded,
# as with ordinary Python slicing.
rows_by_position = df.iloc[[1, 3]]
top_left_cell = df.iloc[0, 0]
position_slice = df.iloc[0:2, 1:3]
print(f"\nAccessing rows by integer position (index 1 and 3 using .iloc):\n{rows_by_position}")
print(f"\nAccessing specific cell (row 0, column 0 using .iloc): {top_left_cell}")
print(f"\nSlicing rows and columns using .iloc (rows 0-1, columns 1-2):\n{position_slice}")

Accessing a single column ('Name'):
0      Alice
1        Bob
2    Charlie
3      David
4        Eva
Name: Name, dtype: object

Accessing multiple columns (['Name', 'City']):
      Name         City
0    Alice     New York
1      Bob  Los Angeles
2  Charlie      Chicago
3    David      Houston
4      Eva        Miami

Accessing rows by label (index 0 and 2 using .loc):
      Name  Age      City  Salary
0    Alice   24  New York   70000
2  Charlie   22   Chicago   60000

Accessing specific cell (row 1, 'City' using .loc): Los Angeles

Slicing rows and columns using .loc (rows 0-2, 'Age' to 'City'):
   Age         City
0   24     New York
1   27  Los Angeles
2   22      Chicago

Accessing rows by integer position (index 1 and 3 using .iloc):
    Name  Age         City  Salary
1    Bob   27  Los Angeles   85000
3  David   32      Houston   95000

Accessing specific cell (row 0, column 0 using .iloc): Alice

Slicing rows and columns using .iloc (rows 0-1, columns 1-2):
   Age         City


### 6. Data Filtering (Conditional Selection)

In [7]:
# Filter rows where Age > 25
older_than_25 = df[df['Age'] > 25]
print(f"Rows where Age > 25:\n{older_than_25}")

# Filter with multiple conditions (Age > 25 AND Salary < 90000)
# Use & for AND, | for OR, enclose each condition in parentheses
complex_filter = df[(df['Age'] > 25) & (df['Salary'] < 90000)]
print(f"\nRows where Age > 25 AND Salary < 90000:\n{complex_filter}")

# Filter using .isin()
cities_of_interest = ['New York', 'Chicago']
filtered_cities = df[df['City'].isin(cities_of_interest)]
print(f"\nRows where City is 'New York' or 'Chicago':\n{filtered_cities}")

Rows where Age > 25:
    Name  Age         City  Salary
1    Bob   27  Los Angeles   85000
3  David   32      Houston   95000
4    Eva   29        Miami   80000

Rows where Age > 25 AND Salary < 90000:
  Name  Age         City  Salary
1  Bob   27  Los Angeles   85000
4  Eva   29        Miami   80000

Rows where City is 'New York' or 'Chicago':
      Name  Age      City  Salary
0    Alice   24  New York   70000
2  Charlie   22   Chicago   60000


### 7. Handling Missing Data (NaN)

In [8]:
# Work on a separate frame so that `df` itself is left untouched.
# 'Salary' holds integers and NaN is a float, so the column is cast to float64
# up front (astype returns a new frame) rather than relying on pandas to upcast
# it implicitly during the assignment below -- newer pandas releases deprecate
# that silent upcast.
df_missing = df.astype({'Salary': 'float64'})
df_missing.loc[1, 'Salary'] = np.nan  # Introduce a missing numeric value
df_missing.loc[3, 'City'] = np.nan    # Introduce a missing text value
print(f"DataFrame with introduced NaN values:\n{df_missing}")

# isnull() flags every missing cell; summing the flags counts them per column.
print(f"\nCheck for missing values (df_missing.isnull()):\n{df_missing.isnull()}")
print(f"\nNumber of missing values per column (df_missing.isnull().sum()):\n{df_missing.isnull().sum()}")

# Drop rows with any missing values
df_dropped_na = df_missing.dropna()
print(f"\nDataFrame after dropping rows with any NA:\n{df_dropped_na}")

# Fill missing 'Salary' with its mean (mean() skips NaN by default).
# Passing a dict to fillna restricts the fill to the named column.
mean_salary = df_missing['Salary'].mean()
df_filled_salary = df_missing.fillna({'Salary': mean_salary})
print(f"\nDataFrame after filling 'Salary' NA with mean:\n{df_filled_salary}")

# Fill missing 'City' with a specific value
df_filled_city = df_missing.fillna({'City': 'Unknown'})
print(f"\nDataFrame after filling 'City' NA with 'Unknown':\n{df_filled_city}")

DataFrame with introduced NaN values:
      Name  Age         City   Salary
0    Alice   24     New York  70000.0
1      Bob   27  Los Angeles      NaN
2  Charlie   22      Chicago  60000.0
3    David   32          NaN  95000.0
4      Eva   29        Miami  80000.0

Check for missing values (df_missing.isnull()):
    Name    Age   City  Salary
0  False  False  False   False
1  False  False  False    True
2  False  False  False   False
3  False  False   True   False
4  False  False  False   False

Number of missing values per column (df_missing.isnull().sum()):
Name      0
Age       0
City      1
Salary    1
dtype: int64

DataFrame after dropping rows with any NA:
      Name  Age      City   Salary
0    Alice   24  New York  70000.0
2  Charlie   22   Chicago  60000.0
4      Eva   29     Miami  80000.0

DataFrame after filling 'Salary' NA with mean:
      Name  Age         City   Salary
0    Alice   24     New York  70000.0
1      Bob   27  Los Angeles  76250.0
2  Charlie   22      Chica

### 8. Grouping and Aggregation (.groupby())

In [9]:
# Mean salary within each city: group the rows by 'City', pick the 'Salary'
# column of each group, then average it.
salary_grouped_by_city = df.groupby('City')['Salary']
avg_salary_by_city = salary_grouped_by_city.mean()
print(f"Average Salary by City:\n{avg_salary_by_city}")

# Several named aggregations over the same single grouping key ('City').
# Each entry is output_column -> (source column, aggregation function).
city_aggregations = {
    'Avg_Age': ('Age', 'mean'),
    'Min_Salary': ('Salary', 'min'),
    'Count': ('Name', 'size'),
}
multi_agg = df.groupby('City').agg(**city_aggregations)
print(f"\nMultiple aggregations by City:\n{multi_agg}")

Average Salary by City:
City
Chicago        60000.0
Houston        95000.0
Los Angeles    85000.0
Miami          80000.0
New York       70000.0
Name: Salary, dtype: float64

Multiple aggregations by City:
             Avg_Age  Min_Salary  Count
City                                   
Chicago         22.0       60000      1
Houston         32.0       95000      1
Los Angeles     27.0       85000      1
Miami           29.0       80000      1
New York        24.0       70000      1


### 9. Merging/Joining DataFrames

In [10]:
# Lookup table: department per employee. There is no row for 'Eva'.
df_info = pd.DataFrame(
    [
        ['Alice', 'HR'],
        ['Bob', 'IT'],
        ['Charlie', 'Finance'],
        ['David', 'HR'],
    ],
    columns=['Name', 'Department'],
)
print(f"df_info:\n{df_info}")
print(f"Original df:\n{df}")

# Inner join on 'Name': only names present in BOTH frames are kept.
merged_df = df.merge(df_info, on='Name', how='inner')
print(f"\nMerged DataFrame (df with df_info on 'Name'):\n{merged_df}")

# Left join: every row of the left frame is kept; rows without a match on the
# right get NaN in the right-hand columns.
# NOTE(review): 'Eve' here does not match 'Eva' in df, so that project row
# finds no partner and Eva's 'Project' is NaN -- confirm the spelling is
# intentional.
df_projects = pd.DataFrame(
    [
        ['Alice', 'Alpha'],
        ['Bob', 'Beta'],
        ['Eve', 'Gamma'],
    ],
    columns=['Name', 'Project'],
)
left_merged_df = df.merge(df_projects, on='Name', how='left')
print(f"\nLeft Merged DataFrame (df left merge with df_projects):\n{left_merged_df}")

df_info:
      Name Department
0    Alice         HR
1      Bob         IT
2  Charlie    Finance
3    David         HR
Original df:
      Name  Age         City  Salary
0    Alice   24     New York   70000
1      Bob   27  Los Angeles   85000
2  Charlie   22      Chicago   60000
3    David   32      Houston   95000
4      Eva   29        Miami   80000

Merged DataFrame (df with df_info on 'Name'):
      Name  Age         City  Salary Department
0    Alice   24     New York   70000         HR
1      Bob   27  Los Angeles   85000         IT
2  Charlie   22      Chicago   60000    Finance
3    David   32      Houston   95000         HR

Left Merged DataFrame (df left merge with df_projects):
      Name  Age         City  Salary Project
0    Alice   24     New York   70000   Alpha
1      Bob   27  Los Angeles   85000    Beta
2  Charlie   22      Chicago   60000     NaN
3    David   32      Houston   95000     NaN
4      Eva   29        Miami   80000     NaN


### 10. Basic Operations and Transformations

In [11]:
# New column derived from an existing one: 'Bonus' is 10% of 'Salary'.
# Note that this adds the column to `df` itself.
df['Bonus'] = df['Salary'].mul(0.10)
print(f"DataFrame with 'Bonus' column:\n{df}")


# Element-wise transformation of a column with .apply().
def _age_category(age):
    """Label an age as 'Adult' (18 or older) or 'Minor'."""
    return 'Adult' if age >= 18 else 'Minor'


df['Age_Category'] = df['Age'].apply(_age_category)
print(f"\nDataFrame with 'Age_Category' column:\n{df}")

# Sorting: by a single key (ascending is the default order) ...
df_sorted_age = df.sort_values('Age')
print(f"\nDataFrame sorted by 'Age' (ascending):\n{df_sorted_age}")

# ... and by several keys, each with its own direction.
sort_keys = ['Salary', 'City']
sort_ascending = [False, True]
df_sorted_salary_city = df.sort_values(by=sort_keys, ascending=sort_ascending)
print(f"\nDataFrame sorted by 'Salary' (desc) then 'City' (asc):\n{df_sorted_salary_city}")

# Renaming columns via an old-name -> new-name mapping (returns a new frame).
column_renames = {'Name': 'Full Name', 'Age': 'Years Old'}
df_renamed = df.rename(columns=column_renames)
print(f"\nDataFrame with renamed columns:\n{df_renamed}")

# Dropping a column (returns a new frame; `df` keeps its 'Bonus' column).
df_no_bonus = df.drop('Bonus', axis=1)
print(f"\nDataFrame after dropping 'Bonus' column:\n{df_no_bonus}")

# Dropping a row by its index label works the same way:
# df_no_row1 = df.drop(index=1)
# print(f"\nDataFrame after dropping row with index 1:\n{df_no_row1}")

DataFrame with 'Bonus' column:
      Name  Age         City  Salary   Bonus
0    Alice   24     New York   70000  7000.0
1      Bob   27  Los Angeles   85000  8500.0
2  Charlie   22      Chicago   60000  6000.0
3    David   32      Houston   95000  9500.0
4      Eva   29        Miami   80000  8000.0

DataFrame with 'Age_Category' column:
      Name  Age         City  Salary   Bonus Age_Category
0    Alice   24     New York   70000  7000.0        Adult
1      Bob   27  Los Angeles   85000  8500.0        Adult
2  Charlie   22      Chicago   60000  6000.0        Adult
3    David   32      Houston   95000  9500.0        Adult
4      Eva   29        Miami   80000  8000.0        Adult

DataFrame sorted by 'Age' (ascending):
      Name  Age         City  Salary   Bonus Age_Category
2  Charlie   22      Chicago   60000  6000.0        Adult
0    Alice   24     New York   70000  7000.0        Adult
1      Bob   27  Los Angeles   85000  8500.0        Adult
4      Eva   29        Miami   80000  80