### A Revision to pandas DataFrames

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Small 2x3 integer array used as raw material for the DataFrame examples that follow.
array_a = np.array([
    [3, 2, 1],
    [6, 3, 2],
])
array_a

array([[3, 2, 1],
       [6, 3, 2]])

In [3]:
# Wrap the NumPy array in a DataFrame; with no labels supplied, rows and columns
# receive the default integer labels visible in the output.
pd.DataFrame(array_a)

Unnamed: 0,0,1,2
0,3,2,1
1,6,3,2


In [4]:
# Confirm the class of the object produced by the DataFrame constructor.
type(pd.DataFrame(array_a))

pandas.core.frame.DataFrame

In [5]:
# Same array, now with explicit column labels; row labels remain the default integers.
df = pd.DataFrame(
    array_a,
    columns=['Column 1', 'Column 2', 'Column 3'],
)
df

Unnamed: 0,Column 1,Column 2,Column 3
0,3,2,1
1,6,3,2


In [6]:
# Supply both column labels and row labels when building the frame.
df = pd.DataFrame(
    array_a,
    columns=['Column 1', 'Column 2', 'Column 3'],
    index=['Row 1', 'Row 2'],
)
df

Unnamed: 0,Column 1,Column 2,Column 3
Row 1,3,2,1
Row 2,6,3,2


In [7]:
# Load the lending dataset with the 'LoanID' column as the row index.
# TODO: open question kept from the original author - could "StringID" be used as the index instead?
data = pd.read_csv(
    'Lending-company.csv',
    index_col='LoanID',
)
# Work on a copy so that `data` keeps the frame exactly as it was loaded.
lending_co_data = data.copy()
lending_co_data.head()

Unnamed: 0_level_0,StringID,Product,CustomerGender,Location,Region,TotalPrice,StartDate,Deposit,DailyRate,TotalDaysYr,AmtPaid36,AmtPaid60,AmtPaid360,LoanStatus
LoanID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,LoanID_1,Product B,Female,Location 3,Region 2,17600.0,04/07/2018,2200,45,365,3221,4166,14621,Active
2,LoanID_2,Product D,Female,Location 6,Region 6,,02/01/2019,2200,45,365,3161,4096,16041,Active
3,LoanID_3,Product B,Male,Location 8,Region 3,16600.0,08/12/2016,1000,45,365,2260,3205,16340,
4,LoanID_4,Product A,Male,Location 26,Region 2,17600.0,,2200,45,365,3141,4166,16321,Active
5,LoanID_5,Product B,Female,Location 34,Region 3,21250.0,28/10/2017,2200,55,365,3570,4745,14720,Active


In [8]:
# Confirm that the object loaded from the CSV (and then copied) is a DataFrame.
type(lending_co_data)

pandas.core.frame.DataFrame

### Common Attributes for Working with DataFrames

In [9]:
# Reload the lending dataset for this section, again indexed by 'LoanID'.
# TODO: open question kept from the original author - could "StringID" be used as the index instead?
data = pd.read_csv(
    'Lending-company.csv',
    index_col='LoanID',
)
# Work on a copy so that `data` keeps the frame exactly as it was loaded.
lending_co_data = data.copy()
lending_co_data.head()

Unnamed: 0_level_0,StringID,Product,CustomerGender,Location,Region,TotalPrice,StartDate,Deposit,DailyRate,TotalDaysYr,AmtPaid36,AmtPaid60,AmtPaid360,LoanStatus
LoanID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,LoanID_1,Product B,Female,Location 3,Region 2,17600.0,04/07/2018,2200,45,365,3221,4166,14621,Active
2,LoanID_2,Product D,Female,Location 6,Region 6,,02/01/2019,2200,45,365,3161,4096,16041,Active
3,LoanID_3,Product B,Male,Location 8,Region 3,16600.0,08/12/2016,1000,45,365,2260,3205,16340,
4,LoanID_4,Product A,Male,Location 26,Region 2,17600.0,,2200,45,365,3141,4166,16321,Active
5,LoanID_5,Product B,Female,Location 34,Region 3,21250.0,28/10/2017,2200,55,365,3570,4745,14720,Active


In [10]:
# Row labels of the frame: the 'LoanID' values chosen as the index when loading.
lending_co_data.index

Int64Index([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
            ...
            1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043],
           dtype='int64', name='LoanID', length=1043)

In [11]:
# The row index is a pandas object in its own right; check which class represents it.
type(lending_co_data.index)

pandas.core.indexes.numeric.Int64Index

In [12]:
# Column labels of the frame; the index column 'LoanID' is not listed among them.
lending_co_data.columns

Index(['StringID', 'Product', 'CustomerGender', 'Location', 'Region',
       'TotalPrice', 'StartDate', 'Deposit', 'DailyRate', 'TotalDaysYr',
       'AmtPaid36', 'AmtPaid60', 'AmtPaid360', 'LoanStatus'],
      dtype='object')

In [13]:
# Column labels are held in an Index object as well.
type(lending_co_data.columns)

pandas.core.indexes.base.Index

In [14]:
# Both label sets at once: a list holding the row index followed by the column index.
lending_co_data.axes

[Int64Index([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
             ...
             1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043],
            dtype='int64', name='LoanID', length=1043),
 Index(['StringID', 'Product', 'CustomerGender', 'Location', 'Region',
        'TotalPrice', 'StartDate', 'Deposit', 'DailyRate', 'TotalDaysYr',
        'AmtPaid36', 'AmtPaid60', 'AmtPaid360', 'LoanStatus'],
       dtype='object')]

In [15]:
# Data type stored in each column, listed per column name.
lending_co_data.dtypes

StringID           object
Product            object
CustomerGender     object
Location           object
Region             object
TotalPrice        float64
StartDate          object
Deposit             int64
DailyRate           int64
TotalDaysYr         int64
AmtPaid36           int64
AmtPaid60           int64
AmtPaid360          int64
LoanStatus         object
dtype: object

In [16]:
# The frame's data as a NumPy array; with text and numeric columns mixed, the
# array comes back with dtype=object (see the output).
lending_co_data.values

array([['LoanID_1', 'Product B', 'Female', ..., 4166, 14621, 'Active'],
       ['LoanID_2', 'Product D', 'Female', ..., 4096, 16041, 'Active'],
       ['LoanID_3', 'Product B', 'Male', ..., 3205, 16340, nan],
       ...,
       ['LoanID_1041', 'Product A', 'NotSpecified', ..., 5143, 16617,
        'Finished Payment'],
       ['LoanID_1042', 'Product B', 'Female', ..., 3462, 15617,
        'Finished Payment'],
       ['LoanID_1043', 'Product A', 'NotSpecified', ..., 4743, 16617,
        'Finished Payment']], dtype=object)

In [17]:
# Check the class of the object returned by the .values attribute.
type(lending_co_data.values)

numpy.ndarray

In [18]:
# Method form of the same conversion; the output matches what .values displays.
lending_co_data.to_numpy()

array([['LoanID_1', 'Product B', 'Female', ..., 4166, 14621, 'Active'],
       ['LoanID_2', 'Product D', 'Female', ..., 4096, 16041, 'Active'],
       ['LoanID_3', 'Product B', 'Male', ..., 3205, 16340, nan],
       ...,
       ['LoanID_1041', 'Product A', 'NotSpecified', ..., 5143, 16617,
        'Finished Payment'],
       ['LoanID_1042', 'Product B', 'Female', ..., 3462, 15617,
        'Finished Payment'],
       ['LoanID_1043', 'Product A', 'NotSpecified', ..., 4743, 16617,
        'Finished Payment']], dtype=object)

In [19]:
# Check the class of the object returned by .to_numpy().
type(lending_co_data.to_numpy())

numpy.ndarray

In [20]:
# Dimensions of the frame as a (rows, columns) tuple.
lending_co_data.shape

(1043, 14)

In [21]:
# Number of columns, counted from the column index; matches the second entry of .shape.
len(lending_co_data.columns)

14

In [22]:
# Read only the 'Location' column and collapse the resulting one-column
# DataFrame into a Series.
# The `squeeze` keyword of read_csv was deprecated in pandas 1.4 and removed in
# pandas 2.0, so the squeeze is applied to the returned frame instead; this
# form works on both older and current pandas versions.
location_data = pd.read_csv('Lending-company.csv', usecols=['Location']).squeeze('columns')
location_data

0        Location 3
1        Location 6
2        Location 8
3       Location 26
4       Location 34
           ...     
1038    Location 73
1039    Location 82
1040    Location 11
1041    Location 26
1042    Location 94
Name: Location, Length: 1043, dtype: object

In [23]:
# A single squeezed column is a Series rather than a DataFrame.
type(location_data)

pandas.core.series.Series

In [24]:
# A Series is one-dimensional, so its shape is a 1-tuple holding only the length.
location_data.shape

(1043,)

### Data Selection in pandas DataFrames

In [25]:
import pandas as pd

In [None]:
# Reload the data, this time using the text column 'StringID' as the row index,
# and work on a copy of the loaded frame.
data = pd.read_csv('Lending-company.csv', index_col = 'StringID')
lending_co_data = data.copy()
lending_co_data.head()

In [None]:
# Attribute (dot) access to a single column.
lending_co_data.Product

In [None]:
lending_co_data.Location

In [None]:
# Bracket access with the column label; selects the same column as the dot form.
lending_co_data['Product']

In [None]:
lending_co_data['Location']

In [None]:
# Deliberate demonstration: column labels are case-sensitive and the column is
# named 'Location', so this lowercase lookup is expected to raise a KeyError.
lending_co_data['location']

In [None]:
# Single brackets with one label: the result is expected to be a Series.
type(lending_co_data['Location'])

In [None]:
# Double brackets pass a list of labels, so the one-column result stays a DataFrame.
lending_co_data[['Location']]

In [None]:
type(lending_co_data[['Location']])

In [None]:
# A list of labels selects several columns, in the order listed.
lending_co_data[['Location', 'Product']].head()

In [None]:
# The list of labels can be stored in a variable and reused.
prod_loc = ['Location', 'Product']
lending_co_data[prod_loc].head()

In [None]:
# Deliberate demonstration: without the inner list the two labels form a tuple
# key, which is looked up as one column label; expected to raise a KeyError.
lending_co_data['Product', 'Location']

### Data Selection - Indexing Data with .iloc[]

In [None]:
import pandas as pd

In [None]:
# Reload the data with the text column 'StringID' as the row index and work on a copy.
data = pd.read_csv('Lending-company.csv', index_col = 'StringID')
lending_co_data = data.copy()
lending_co_data.head()

In [None]:
# Deliberate demonstration: plain brackets look up column labels, not positions.
# No column is labelled 1, so this is expected to raise a KeyError.
lending_co_data[1]

In [None]:
# Deliberate demonstration: NumPy-style [row, col] indexing is not supported by
# plain brackets; the tuple (0, 1) is treated as a column label (KeyError expected).
lending_co_data[0,1]

In [None]:
# Plain brackets do work with an existing column label.
lending_co_data['Product']

In [None]:
# .iloc is position-based: the row at integer position 1 (the second row).
lending_co_data.iloc[1]

In [None]:
# Row position 1, column position 3: a single cell.
lending_co_data.iloc[1, 3]

In [None]:
# Same row as .iloc[1], with the "all columns" slice written out explicitly.
lending_co_data.iloc[1,:]

In [None]:
# All rows of the column at position 3.
lending_co_data.iloc[:, 3]

In [None]:
# Type of a single cell (a scalar value is expected).
type(lending_co_data.iloc[1, 3])

In [None]:
# Type of a single row (a Series is expected).
type(lending_co_data.iloc[1, :])

In [None]:
# Type of a single column (a Series is expected).
type(lending_co_data.iloc[:, 3])

In [None]:
# A list of positions selects several rows: positions 1 and 3, all columns.
lending_co_data.iloc[[1, 3], :]

In [None]:
# A list of positions selects several columns, in the order given (3 first, then 1).
lending_co_data.iloc[:, [3, 1]]

### Data Selection - Indexing Data with .loc[]

In [None]:
import pandas as pd

In [None]:
# Reload the data with the text column 'StringID' as the row index and work on a copy.
data = pd.read_csv('Lending-company.csv', index_col = 'StringID')
lending_co_data = data.copy()
lending_co_data

In [None]:
# .loc is label-based: the row whose index label is 'LoanID_3'.
lending_co_data.loc['LoanID_3']

In [None]:
# Same row, with the "all columns" slice written out explicitly.
lending_co_data.loc['LoanID_3', :]

In [None]:
# Row label plus column label: a single cell.
lending_co_data.loc['LoanID_3', 'Region']

In [None]:
# Plain brackets treat the label as a column name.
lending_co_data['Location']

In [None]:
# Deliberate demonstration: a single label given to .loc is matched against the
# row index, and no row is labelled 'Location'; expected to raise a KeyError.
lending_co_data.loc['Location']

In [None]:
# To select a column with .loc, give the row slice first and the column label second.
lending_co_data.loc[:, 'Location']

In [None]:
# Deliberate demonstration: 'Locations' (plural) is not a column label;
# expected to raise a KeyError.
lending_co_data.loc[:, 'Locations']

### A Few Comments on Using .loc[] and .iloc[]

In [None]:
import pandas as pd

In [None]:
# Reload the data with the integer column 'LoanID' as the row index and work on a copy.
data = pd.read_csv('Lending-company.csv', index_col = 'LoanID')
lending_co_data = data.copy()
lending_co_data

In [None]:
# Dimensions of the frame; earlier in the notebook this same load displayed (1043, 14),
# i.e. valid row positions 0..1042 and valid column positions 0..13.
lending_co_data.shape

In [None]:
# Deliberate demonstration: positions are zero-based, so row position 1043 is one
# past the last row; expected to raise an IndexError.
lending_co_data.iloc[1043, :]

In [None]:
# Deliberate demonstration: far out of range; expected to raise an IndexError.
lending_co_data.iloc[10000, :]

In [None]:
# Deliberate demonstration: column position 14 is one past the last column;
# expected to raise an IndexError.
lending_co_data.iloc[:, 14]

In [None]:
# Position 13 is the last column.
lending_co_data.iloc[:, 13]

In [None]:
# Negative positions count from the end, so -1 also addresses the last column.
lending_co_data.iloc[:, -1]

In [None]:
lending_co_data.head()

In [None]:
# Incorrect: a single column is one-dimensional, so .iloc accepts only one indexer
# here; supplying two is expected to raise an error.
lending_co_data['TotalPrice'].iloc[0, :]

In [None]:
# Correct: one positional indexer for the one-dimensional column.
lending_co_data['TotalPrice'].iloc[0]

In [None]:
# AVOID when the index is made of integer labels: plain [] on the column is then
# label-based, and the LoanID labels start at 1, so 0 is expected to raise a KeyError.
lending_co_data['TotalPrice'][0]

In [None]:
# AVOID: with an integer index this returns the row labelled 1, not the row at
# position 1; the intent is ambiguous to a reader.
lending_co_data['TotalPrice'][1]

In [None]:
# Preferred: .loc states explicitly that 1 is a label.
lending_co_data['TotalPrice'].loc[1]

In [None]:
# Switch to the text column 'StringID' as the row index and work on a copy.
data = pd.read_csv('Lending-company.csv', index_col = 'StringID')
lending_co_data = data.copy()
lending_co_data.head()

In [None]:
# Explicit position-based access to the first value of the column.
lending_co_data['TotalPrice'].iloc[0]

In [None]:
# Explicit label-based access to the value for row 'LoanID_1'.
lending_co_data['TotalPrice'].loc['LoanID_1']

In [None]:
# AVOID: with a non-integer index, an integer inside plain [] is presumably
# interpreted as a position, so the meaning of [0] changes with the index type.
# NOTE(review): newer pandas releases are understood to deprecate this fallback -
# confirm against the installed version.
lending_co_data['TotalPrice'][0]

In [None]:
# AVOID: same ambiguity as above (position 1 here, but label 1 under an integer index).
lending_co_data['TotalPrice'][1]

In [None]:
# AVOID: plain [] hides whether 'LoanID_1' is meant as a label or a position;
# use .loc to make the label lookup explicit.
lending_co_data['TotalPrice']['LoanID_1']

In [None]:
# AVOID: chained indexing (row first, then a second [] on the result);
# a single .iloc call with both positions is clearer.
lending_co_data.iloc[0][5]

In [None]:
# Preferred: row and column positions in one .iloc call.
lending_co_data.iloc[0, 5]

In [None]:
# AVOID: relies on the implicit "all columns" default; easy to misread as [row, col].
lending_co_data.iloc[[0, 5]]

In [None]:
# Preferred: rows at positions 0 and 5, with the column slice written out.
lending_co_data.iloc[[0, 5], :]

In [None]:
# AVOID: chained column-then-row lookup with plain brackets.
lending_co_data['TotalPrice']['LoanID_1']

In [None]:
# Preferred: row label and column label in one .loc call.
lending_co_data.loc['LoanID_1', 'TotalPrice']

In [None]:
# AVOID: relies on the implicit "all columns" default.
lending_co_data.loc[['LoanID_1', 'LoanID_6']]

In [None]:
# Preferred: the listed row labels, with the column slice written out.
lending_co_data.loc[['LoanID_1', 'LoanID_6'], :]

In [None]:
# AVOID: attribute access combined with a chained plain-bracket lookup.
lending_co_data.TotalPrice['LoanID_1']

In [None]:
# AVOID: the column is picked with plain brackets before the positional lookup.
lending_co_data['TotalPrice'].iloc[[0, 5]]

In [None]:
# Form recommended by this notebook: select the column explicitly with .loc,
# then take positions 0 and 5 with .iloc.
lending_co_data.loc[:, 'TotalPrice'].iloc[[0,5]]