# 2 Pandas Data Structure Basics

## Create Your Own Data

### Create a Series

In [2]:

import pandas as pd

In [3]:
# With no explicit index, pandas labels the values 0, 1, ...
# Mixing a string and an integer gives the generic `object` dtype.
s = pd.Series(data=['banana', 42])
print(s)

0    banana
1        42
dtype: object


In [4]:
# Supply custom row labels through `index`: one label per value,
# given as a plain Python list in the same order as the data.
s = pd.Series(['Wes McKinney', 'Creator of Pandas'], index=['Person', 'Who'])

print(s)

Person         Wes McKinney
Who       Creator of Pandas
dtype: object


### Create a DataFrame

In [5]:
# Build a DataFrame column by column: every key is a column name and
# every list holds that column's values, one entry per row.
# No index is given, so the rows are labelled 0, 1, ...
scientists = pd.DataFrame(
    data=dict(
        Name=["Rosaline Franklin", "William Gosset"],
        Occupation=["Chemist", "Statistician"],
        Born=["1920-07-25", "1876-06-13"],
        Died=["1958-04-16", "1937-10-16"],
        Age=[37, 61],
    )
)

print(scientists)

                Name    Occupation        Born        Died  Age
0  Rosaline Franklin       Chemist  1920-07-25  1958-04-16   37
1     William Gosset  Statistician  1876-06-13  1937-10-16   61


In [6]:
# Same data, but the scientists' names are now the row labels (`index`)
# rather than a regular column; `columns` spells out the column order.
scientists = pd.DataFrame(
    data=dict(
        Occupation=["Chemist", "Statistician"],
        Born=["1920-07-25", "1876-06-13"],
        Died=["1958-04-16", "1937-10-16"],
        Age=[37, 61],
    ),
    index=["Rosaline Franklin", "William Gosset"],
    columns=["Occupation", "Born", "Died", "Age"],
)

print(scientists)

                     Occupation        Born        Died  Age
Rosaline Franklin       Chemist  1920-07-25  1958-04-16   37
William Gosset     Statistician  1876-06-13  1937-10-16   61


## The Series

In [7]:
# Rebuild the example DataFrame for this section, using each
# scientist's name as the row index label.
scientists = pd.DataFrame(
    data={
        "Occupation": ["Chemist", "Statistician"],
        "Born": ["1920-07-25", "1876-06-13"],
        "Died": ["1958-04-16", "1937-10-16"],
        "Age": [37, 61],
    },
    columns=["Occupation", "Born", "Died", "Age"],
    index=["Rosaline Franklin", "William Gosset"],
)

print(scientists)

                     Occupation        Born        Died  Age
Rosaline Franklin       Chemist  1920-07-25  1958-04-16   37
William Gosset     Statistician  1876-06-13  1937-10-16   61


In [8]:
# select by row index label: .loc looks a row up by its label
# NOTE: despite the variable name, "William Gosset" is the second row
# of `scientists` as built above; the name is kept because the
# following cells refer to it.
first_row = scientists.loc["William Gosset"]
print(type(first_row))

<class 'pandas.core.series.Series'>


In [9]:
# a single row comes back as a Series: the DataFrame's column names
# become the Series index and the row label becomes its Name
print(first_row)

Occupation    Statistician
Born            1876-06-13
Died            1937-10-16
Age                     61
Name: William Gosset, dtype: object


In [10]:
# .index is an attribute holding the labels of the Series
print(first_row.index)

Index(['Occupation', 'Born', 'Died', 'Age'], dtype='object')


In [11]:
# .values is an attribute holding the data itself, without the labels
print(first_row.values)

['Statistician' '1876-06-13' '1937-10-16' np.int64(61)]


In [12]:
# .keys() is a method (note the parentheses) that gives back
# the same labels as the .index attribute
print(first_row.keys())

Index(['Occupation', 'Born', 'Died', 'Age'], dtype='object')


In [13]:
# get the first index label using an attribute (.index),
# then position 0 within it
print(first_row.index[0])

Occupation


In [15]:
# get the first index label using a method (.keys()),
# then position 0 within the result
print(first_row.keys()[0])

Occupation


### The Series Is ndarray-like

#### Series Methods

In [16]:
# get the 'Age' column; selecting one column by name gives a Series
# that keeps the DataFrame's row labels
ages = scientists['Age']
print(ages)

Rosaline Franklin    37
William Gosset       61
Name: Age, dtype: int64


In [17]:
# calculate the mean (arithmetic average) of the ages
print(ages.mean())

49.0


In [18]:
# calculate the minimum (smallest age in the Series)
print(ages.min())

37


In [19]:
# calculate the maximum (largest age in the Series)
print(ages.max())

61


In [20]:
# calculate the standard deviation
# the result below matches the sample formula (divide by n - 1):
# for the two ages 37 and 61 that gives ~16.97, not 12
print(ages.std())

16.97056274847714


### Boolean Subsetting: Series

In [21]:
# load the scientists data set from a CSV file (path is relative to the
# working directory); this rebinds `scientists`, replacing the
# two-row example frame used in the cells above
scientists = pd.read_csv("data/scientists.csv")

In [22]:
# pull out the 'Age' column of the loaded data as a Series;
# `ages` now refers to this column, not the earlier two-value one
ages = scientists["Age"]
print(ages)

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64


In [23]:
# get basic stats in one call: count, mean, std, min, quartiles, max
print(ages.describe())

count     8.000000
mean     59.125000
std      18.325918
min      37.000000
25%      44.000000
50%      58.500000
75%      68.750000
max      90.000000
Name: Age, dtype: float64


In [24]:
# mean of all ages; used as the cut-off for the boolean subsetting below
print(ages.mean())

59.125


In [25]:
# Build a True/False mask first, then use it to keep only the
# rows whose age is greater than the mean age.
older_than_mean = ages > ages.mean()
print(ages[older_than_mean])

1    61
2    90
3    66
7    77
Name: Age, dtype: int64


In [26]:
# the comparison alone gives one True/False per row,
# keeping the original index labels
print(ages > ages.mean())

0    False
1     True
2     True
3     True
4    False
5    False
6    False
7     True
Name: Age, dtype: bool


In [27]:
# the result of the comparison is itself a Series (of booleans)
print(type(ages > ages.mean()))

<class 'pandas.core.series.Series'>


In [28]:
# Hand-written boolean mask, one entry per row (positions 0 through 7):
# True keeps the row, False drops it -> keeps index 0, 1, 4, 5, and 7
manual_bool_values = [True, True, False, False, True, True, False, True]

print(ages[manual_bool_values])

0    37
1    61
4    56
5    45
7    77
Name: Age, dtype: int64


### Operations Are Automatically Aligned and Vectorized (Broadcasting)

#### Vectors of the Same Length

In [29]:
# adding two Series of the same length works element by element
print(ages + ages)

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64


In [30]:
# multiplication is element by element too: each age is squared
print(ages * ages)

0    1369
1    3721
2    8100
3    4356
4    3136
5    2025
6    1681
7    5929
Name: Age, dtype: int64


#### Vectors With Integers

In [31]:
# a single number is applied to every element of the Series
print(ages + 100)

0    137
1    161
2    190
3    166
4    156
5    145
6    141
7    177
Name: Age, dtype: int64


In [32]:
# every element is doubled; same values as ages + ages above
print(ages * 2)

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64


#### Vectors With Different Lengths

In [33]:
# The shorter Series only has labels 0 and 1. Rows of `ages` with no
# matching label have nothing to add to, so they come out as NaN
# (and the result is float64, as the output shows).
shorter = pd.Series([1, 100])
print(ages + shorter)

0     38.0
1    161.0
2      NaN
3      NaN
4      NaN
5      NaN
6      NaN
7      NaN
dtype: float64


#### Vectors With Common Index Labels (Automatic Alignment)

In [34]:
# ages as they appear in the data (index labels ascending, 0 to 7)
print(ages)

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64


In [35]:
# sort by index label in descending order: same label/value pairs,
# but the rows now run from label 7 down to label 0
rev_ages = ages.sort_index(ascending=False)
print(rev_ages)

7    77
6    41
5    45
4    56
3    66
2    90
1    61
0    37
Name: Age, dtype: int64


In [36]:
# reference output to show index label alignment:
# doubling each age gives the values to compare against in the next cell
print(ages * 2)

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64


In [37]:
# note how we get the same values
# even though the vector is reversed
print(ages + rev_ages)

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64
