# Pandas notes

## Pandas Series (and its advantages over a list)

In [1]:
# Labeled indexing: a pd.Series carries an index of labels, so a value can be
# looked up by its label instead of only by integer position.
import pandas as pd
s = pd.Series([1, 2, 3], index=list('abc'))  # without `index=`, the labels default to 0, 1, 2, ...
print(s['a'])  # prints 1, the value stored under label 'a'

1


In [2]:
# Data alignment: arithmetic between two Series pairs values by index label,
# not by position. A label that exists in only one operand yields NaN in the
# result, which is why the printed result has dtype float64.
s1 = pd.Series({'a': 1, 'b': 2, 'c': 3})
s2 = pd.Series({'b': 4, 'c': 5, 'd': 6})
aligned_sum = s1 + s2
print(aligned_sum)

a    NaN
b    6.0
c    8.0
d    NaN
dtype: float64


In [3]:
# Vectorized operations: arithmetic on a Series is applied to every element at
# once, so there is no need to loop over the values explicitly.
s = pd.Series([1, 2, 3])
doubled = s * 2
print(doubled)

0    2
1    4
2    6
dtype: int64


In [4]:
# Boolean masking: comparing a Series with a scalar produces a Series of
# booleans with the same index; passing that mask to [] keeps only the entries
# where the mask is True.
s3 = pd.Series(range(10), index=range(10))
above_five = s3 > 5
print(s3)
print(above_five)        # True where the value is > 5, False where it is <= 5
print(s3[above_five])    # only the values > 5

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64
0    False
1    False
2    False
3    False
4    False
5    False
6     True
7     True
8     True
9     True
dtype: bool
6    6
7    7
8    8
9    9
dtype: int64


In [5]:
# A dictionary converts directly into a Series: each key becomes an index
# label and each value becomes the entry stored under that label.
data = {
    'a': 1,
    'b': 2,
    'c': 3,
    'd': 4,
}
series = pd.Series(data)
print(series)

a    1
b    2
c    3
d    4
dtype: int64


## Pandas DataFrame

In [6]:
# A dictionary of equal-length lists converts into a DataFrame: each key
# becomes a column name and each list supplies that column's values.
names = ['Alice', 'Bob', 'Charlie']
ages = [25, 30, 35]
cities = ['New York', 'Los Angeles', 'Chicago']
data = {'Name': names, 'Age': ages, 'City': cities}

df = pd.DataFrame(data)

# Rows get the default 0, 1, 2, ... index because none was supplied.
print(df)

      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago


In [7]:
# Add one more record to the source dictionary and rebuild the DataFrame.
# The lists inside `data` are extended in place, so without the guard below,
# re-running this cell would append a duplicate 'David' row every time.
new_row = {'Name': 'David', 'Age': 40, 'City': 'Boston'}
if new_row['Name'] not in data['Name']:    # makes the cell safe to re-run
    for column, value in new_row.items():
        data[column].append(value)
df = pd.DataFrame(data)
print(df)


      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
3    David   40       Boston


In [8]:
# Selecting data from a DataFrame - three approaches
# 1. Square brackets
#    A list of column names inside [] selects those columns. That is where the
#    "double" brackets come from: the outer pair is the indexer, the inner
#    pair is an ordinary Python list.
age_city_cols = ['Age', 'City']
filtered_df = df[age_city_cols]
print(filtered_df)

#    A boolean Series inside [] keeps only the rows where it is True.
is_over_30 = df['Age'] > 30
filtered_df = df[is_over_30]
print(filtered_df)

#    A slice inside [] selects rows, so a single pair of brackets is enough.
filtered_df = df[:2]    # first two rows
print(filtered_df)

#    Both can be combined: slice the rows first, then pick columns with a list.
first_two_rows = df[0:2]
filtered_df = first_two_rows[['Age', 'Name']]
print(filtered_df)

# 2. Dot notation - does not work if the column name has spaces or special characters
print(df.Age)

# 3. .loc (label-based) and .iloc (position-based)
print(df.loc[0])    # the row with index label 0 (the first row here), as a Series

   Age         City
0   25     New York
1   30  Los Angeles
2   35      Chicago
3   40       Boston
      Name  Age     City
2  Charlie   35  Chicago
3    David   40   Boston
    Name  Age         City
0  Alice   25     New York
1    Bob   30  Los Angeles
   Age   Name
0   25  Alice
1   30    Bob
0    25
1    30
2    35
3    40
Name: Age, dtype: int64
Name       Alice
Age           25
City    New York
Name: 0, dtype: object
