### Pandas Package

In [2]:
import numpy as np
import pandas as pd

#### Pandas Series

In [3]:
# They are built upon numpy arrays, that's why it is important to understand them

In [4]:
# Let us have a list and an array
list_1 = [1,2,3,4]
array_1 = np.arange(1,5)
print(list_1)
print(array_1)

[1, 2, 3, 4]
[1 2 3 4]


In [5]:
# Let's make a series out of the list and the array
series_1 = pd.Series(list_1)
series_2 = pd.Series(array_1)
print(series_1)
print(series_2)
# The series are the same

0    1
1    2
2    3
3    4
dtype: int64
0    1
1    2
2    3
3    4
dtype: int64


In [6]:
# Basically series are an array concatenated with another array that acts as an index
# We can replace the index for the values of a list of our choice
list_new_index = ['first', 'second', 'third', 'last']
series_3 = pd.Series(list_1, index = list_new_index)
print('Index: ', series_1.index)
print('-')
print(list_1)
print('-')
print(series_1)
print('-')
print(series_3)

Index:  RangeIndex(start=0, stop=4, step=1)
-
[1, 2, 3, 4]
-
0    1
1    2
2    3
3    4
dtype: int64
-
first     1
second    2
third     3
last      4
dtype: int64


### Pandas Dataframe

In [7]:
# We can conceptualize teh DataFrame as a series of lists, that share an index and
# therefore are a Series. Then, the series which share the same index are put together in a DataFrame
# where the index acts as the row identifier and the series as columns which are attributes for the object in 
# the row
# Let's reivew an example:
# 3 Lists with five items each
car_models = ['BMW X1', 'BMW X2', 'BMW X3', 'BMW X4', 'BMWX5']
car_prices = [10000, 15000, 20000, 30000, 76500]
car_weights = [1200, 1400, 1600, 1650, 2000]
# Let's create two series, with models as the index
car_prices_series = pd.Series(car_prices, index = car_models)
car_weights_series = pd.Series(car_weights, index = car_models)
print(car_prices_series)
print('-')
print(car_weights_series)

BMW X1    10000
BMW X2    15000
BMW X3    20000
BMW X4    30000
BMWX5     76500
dtype: int64
-
BMW X1    1200
BMW X2    1400
BMW X3    1600
BMW X4    1650
BMWX5     2000
dtype: int64


In [8]:
# Now, let's create a DataFrame out of the two series
# Note that it takes a dictionary as an input, where each key is the name of a column.
df_cars_info = pd.DataFrame({'Price' : car_prices_series, 'Weight' : car_weights_series})
print(df_cars_info)

        Price  Weight
BMW X1  10000    1200
BMW X2  15000    1400
BMW X3  20000    1600
BMW X4  30000    1650
BMWX5   76500    2000


In [9]:
# Also a DataFrame can be created from an array
# For example:
chess_squares = np.arange(1, 65).reshape(8,8)
chess_df = pd.DataFrame(chess_squares, index=range(1,9), columns=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'])
print(chess_df)

    A   B   C   D   E   F   G   H
1   1   2   3   4   5   6   7   8
2   9  10  11  12  13  14  15  16
3  17  18  19  20  21  22  23  24
4  25  26  27  28  29  30  31  32
5  33  34  35  36  37  38  39  40
6  41  42  43  44  45  46  47  48
7  49  50  51  52  53  54  55  56
8  57  58  59  60  61  62  63  64


Beautiful!

In [10]:
# Selection in a Series & DataFrames
# There are 3 ways of doing it
# Via the index
element_1 = car_prices_series['BMW X2']
print("Selection via index: ", element_1)
# With the loc method => The loc method selects based upon labels for both columns & rows
chess_squares_loc = chess_df.loc[1 : 4, 'B' : 'E', ]
print('Selection with the loc method: ', chess_squares_loc)
# With the iloc method => The same can be done but instead you are working with integer baswed position for both columns and rows
chess_squares_iloc = chess_df.iloc[:4,3:] # Note that iloc uses [] instead of ()
print('Selection with the iloc method: ', chess_squares_iloc)

Selection via index:  15000
Selection with the loc method:      B   C   D   E
1   2   3   4   5
2  10  11  12  13
3  18  19  20  21
4  26  27  28  29
Selection with the iloc method:      D   E   F   G   H
1   4   5   6   7   8
2  12  13  14  15  16
3  20  21  22  23  24
4  28  29  30  31  32


Once again, beautiful!

In [11]:
# Also, for DataFrames we can select 3 different attributes
# Index, columns, values
# Zum beispiel
chess_df_index = chess_df.index
chess_df_columns = chess_df.columns
chess_df_values =chess_df.values
print('Index: ', chess_df.index)
print('Columns: ', chess_df_columns)
print('Values: ', chess_df_values) # Values is an array of arrays.

Index:  RangeIndex(start=1, stop=9, step=1)
Columns:  Index(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'], dtype='object')
Values:  [[ 1  2  3  4  5  6  7  8]
 [ 9 10 11 12 13 14 15 16]
 [17 18 19 20 21 22 23 24]
 [25 26 27 28 29 30 31 32]
 [33 34 35 36 37 38 39 40]
 [41 42 43 44 45 46 47 48]
 [49 50 51 52 53 54 55 56]
 [57 58 59 60 61 62 63 64]]


In [20]:
# To Select a column
column_selection = chess_df['A']
print('Column Selection: ', column_selection)
# To select a row
row_selection = chess_df.values[0]
print('Selection of a row: ', row_selection)
alternate_row_selection = chess_df.loc[1]
print('Alternate selection of a row: ', alternate_row_selection)
# Condition based selection
condition_row_selection = chess_df.loc[chess_df.values > 32, 'E']
print('Condition based selection: ', condition_row_selection)

Column Selection:  1     1
2     9
3    17
4    25
5    33
6    41
7    49
8    57
Name: A, dtype: int64
Selection of a row:  [1 2 3 4 5 6 7 8]
Alternate selection of a row:  A    1
B    2
C    3
D    4
E    5
F    6
G    7
H    8
Name: 1, dtype: int64
Condition based selection:  5    37
5    37
5    37
5    37
5    37
5    37
5    37
5    37
6    45
6    45
6    45
6    45
6    45
6    45
6    45
6    45
7    53
7    53
7    53
7    53
7    53
7    53
7    53
7    53
8    61
8    61
8    61
8    61
8    61
8    61
8    61
8    61
Name: E, dtype: int64


Functions applied to Pandas DataFrames

In [28]:
df2 = pd.DataFrame(np.arange(10, 34, 2).reshape(2,6))
print(df2)
# Now we will transpose the array
print('Transposed array:', df2.T)


    0   1   2   3   4   5
0  10  12  14  16  18  20
1  22  24  26  28  30  32
Transposed array:     0   1
0  10  22
1  12  24
2  14  26
3  16  28
4  18  30
5  20  32


In [29]:
# Vectorized functions
numbers = range(50, 70, 2)
number_places_list = ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh', 'eighth', 'ninth', 'tenth']
numbers_series = pd.Series(numbers, index = number_places_list)
numbers_series

first      50
second     52
third      54
fourth     56
fifth      58
sixth      60
seventh    62
eighth     64
ninth      66
tenth      68
dtype: int64

In [34]:
# Let's create a new DataFrame
length = [3950, 4000, 4155, 4325, 4500]
df_cars_info_2 = pd.DataFrame({'Price' : car_prices_series})
df_cars_info_2['Length'] = length
print(df_cars_info_2)
# Now, let's measure the length as a percentage of the length of the first car
print((df_cars_info_2 / df_cars_info_2.iloc[0])*100)

        Price  Length
BMW X1  10000    3950
BMW X2  15000    4000
BMW X3  20000    4155
BMW X4  30000    4325
BMWX5   76500    4500
        Price      Length
BMW X1  100.0  100.000000
BMW X2  150.0  101.265823
BMW X3  200.0  105.189873
BMW X4  300.0  109.493671
BMWX5   765.0  113.924051


In [37]:
# Let's talk about index conservation
numbers_2 = range(100, 120, 2)
numbers_2_series = pd.Series(numbers_2, index = number_places_list)
combined_number_series = numbers_series + numbers_2_series
print('Number Series: ', numbers_series)
print('Number Series 2: ', numbers_2_series)
print('Combined Number Series: ', combined_number_series)
print('Observe that indexes are conserved when two series of equal lengths are added together.')

Number Series:  first      50
second     52
third      54
fourth     56
fifth      58
sixth      60
seventh    62
eighth     64
ninth      66
tenth      68
dtype: int64
Number Series 2:  first      100
second     102
third      104
fourth     106
fifth      108
sixth      110
seventh    112
eighth     114
ninth      116
tenth      118
dtype: int64
Combined Number Series:  first      150
second     154
third      158
fourth     162
fifth      166
sixth      170
seventh    174
eighth     178
ninth      182
tenth      186
dtype: int64
Observe that indexes are conserved when two series of equal lengths are added together.


In [45]:
numbers_series_shortSlice = numbers_series[1:7]
numbers_2_series_shortSlice = numbers_2_series[6:8]
print('A slice of the numbers series: ', numbers_series_shortSlice)
print('A slice of the numbers 2 series: ', numbers_2_series_shortSlice)
print('A sum of the previous slices: ')
print(numbers_series_shortSlice.add(numbers_2_series_shortSlice))
print('Observe that values which do not share an index are treated as Nan')

A slice of the numbers series:  second     52
third      54
fourth     56
fifth      58
sixth      60
seventh    62
dtype: int64
A slice of the numbers 2 series:  seventh    112
eighth     114
dtype: int64
A sum of the previous slices: 
eighth       NaN
fifth        NaN
fourth       NaN
second       NaN
seventh    174.0
sixth        NaN
third        NaN
dtype: float64
Observe that values which do not share an index are treated as Nan


In [53]:
# Almost all function in Python accept a parameter to indicate what to do with possible NaN values
# Zum beispiel:
numbers_series_3 = numbers_series_shortSlice.add(numbers_2_series_shortSlice)
numbers_series_3

eighth       NaN
fifth        NaN
fourth       NaN
second       NaN
seventh    174.0
sixth        NaN
third        NaN
dtype: float64

In [56]:
# Nan values in Python
nan_value = np.NaN
print(nan_value)
print('Type of a nan value: ', type(nan_value))
print("Operation with a NaN value: ", 2 * nan_value)
# There are several ways to deal with NaN values in Python
print(numbers_series_3.isnull())
print(numbers_series_3.fillna(0))
print(numbers_series_3.dropna())
print(numbers_series_3)

nan
Type of a nan value:  <class 'float'>
Operation with a NaN value:  nan
eighth      True
fifth       True
fourth      True
second      True
seventh    False
sixth       True
third       True
dtype: bool
eighth       0.0
fifth        0.0
fourth       0.0
second       0.0
seventh    174.0
sixth        0.0
third        0.0
dtype: float64
seventh    174.0
dtype: float64
eighth       NaN
fifth        NaN
fourth       NaN
second       NaN
seventh    174.0
sixth        NaN
third        NaN
dtype: float64
