## Import the libraries

In [2]:
import pandas as pd
import numpy as np

## Read the CSV data

In [5]:
# Load raw_data/nls97.csv into a DataFrame; the path is relative to the
# working directory the notebook is run from.
df = pd.read_csv("raw_data/nls97.csv")

## Check the index

In [9]:
# Inspect the row index; straight after read_csv it is the default RangeIndex.
df.index

RangeIndex(start=0, stop=8984, step=1)

## Overview of the dataframe

In [14]:
# Print the per-column summary: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8984 entries, 0 to 8983
Data columns (total 89 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   personid               8984 non-null   int64  
 1   gender                 8984 non-null   object 
 2   birthmonth             8984 non-null   int64  
 3   birthyear              8984 non-null   int64  
 4   highestgradecompleted  6663 non-null   float64
 5   maritalstatus          6672 non-null   object 
 6   childathome            4791 non-null   float64
 7   childnotathome         4791 non-null   float64
 8   wageincome             5091 non-null   float64
 9   weeklyhrscomputer      5792 non-null   object 
 10  weeklyhrstv            6711 non-null   object 
 11  nightlyhrssleep        6706 non-null   float64
 12  satverbal              1406 non-null   float64
 13  satmath                1407 non-null   float64
 14  gpaoverall             6004 non-null   float64
 15  gpae

## Set the personid column as the index

In [17]:
# Promote personid from a regular column to the row index. Rebinding df
# (instead of inplace=True) keeps the call chainable and makes the state
# change explicit; the resulting frame is the same.
df = df.set_index("personid")

In [19]:
# Confirm that the index now holds the personid values.
df.index

Index([100061, 100139, 100284, 100292, 100583, 100833, 100931, 101089, 101122,
       101132,
       ...
       998997, 999031, 999053, 999087, 999103, 999291, 999406, 999543, 999698,
       999963],
      dtype='int64', name='personid', length=8984)

## Convert all columns with "object" datatype to "category"

In [22]:
# Cast every object-dtype column to "category" in a single astype call.
# The mapping only names the object columns, so all other columns keep
# their dtype; an empty mapping (no object columns left) is a no-op cast.
object_cols = df.select_dtypes(include="object").columns
df = df.astype(dict.fromkeys(object_cols, "category"))

In [24]:
# Re-check the summary: the former object columns are now reported as category.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8984 entries, 100061 to 999963
Data columns (total 88 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   gender                 8984 non-null   category
 1   birthmonth             8984 non-null   int64   
 2   birthyear              8984 non-null   int64   
 3   highestgradecompleted  6663 non-null   float64 
 4   maritalstatus          6672 non-null   category
 5   childathome            4791 non-null   float64 
 6   childnotathome         4791 non-null   float64 
 7   wageincome             5091 non-null   float64 
 8   weeklyhrscomputer      5792 non-null   category
 9   weeklyhrstv            6711 non-null   category
 10  nightlyhrssleep        6706 non-null   float64 
 11  satverbal              1406 non-null   float64 
 12  satmath                1407 non-null   float64 
 13  gpaoverall             6004 non-null   float64 
 14  gpaenglish             5798 non-null  

## Select a column using "[ ]"

In [27]:
# Select the gender column by label using single brackets.
gender_data = df['gender']

In [29]:
# Check what kind of object the single-bracket selection produced.
type(gender_data)

pandas.core.series.Series

### Get the column data as a dataframe

In [32]:
# Passing a list of labels (here a one-element list) inside the brackets
# selects the same column but as a DataFrame.
gender_data_as_df = df[['gender']]

In [34]:
# Check what kind of object the list-based selection produced.
type(gender_data_as_df)

pandas.core.frame.DataFrame

## Select a column using the "loc" indexer

In [38]:
# Label-based selection: ":" keeps every row, the list names the columns to keep.
gender_data_with_loc = df.loc[:,['gender']]

In [40]:
# Check what kind of object the .loc selection produced.
type(gender_data_with_loc)

pandas.core.frame.DataFrame

## Select a column using the "iloc" indexer

In [46]:
# Position-based selection: ":" keeps every row, [0] picks the first column,
# which is gender now that personid has been moved into the index.
gender_data_with_iloc = df.iloc[:,[0]]

In [48]:
# Summarize the positional selection to confirm which column was picked.
gender_data_with_iloc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8984 entries, 100061 to 999963
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   gender  8984 non-null   category
dtypes: category(1)
memory usage: 79.1 KB


## Select multiple columns from dataframe using "loc"

In [51]:
# Label-based selection of several columns at once; ":" keeps every row.
gen_ms_hg_cols = ["gender", "maritalstatus", "highestgradecompleted"]
df_gen_ms_hg_with_loc = df.loc[:, gen_ms_hg_cols]

In [53]:
# Summarize the three-column selection.
df_gen_ms_hg_with_loc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8984 entries, 100061 to 999963
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   gender                 8984 non-null   category
 1   maritalstatus          6672 non-null   category
 2   highestgradecompleted  6663 non-null   float64 
dtypes: category(2), float64(1)
memory usage: 158.2 KB


## Select multiple columns based on a list of columns

In [56]:
# Columns of interest, kept in a named list so the selection can be reused.
req_cols = [
    "gender",
    "maritalstatus",
    "highestgradecompleted",
    "wageincome",
    "gpaoverall",
    "weeksworked17",
]
df_req = df[req_cols]

In [58]:
# Summarize the frame built from the req_cols list.
df_req.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8984 entries, 100061 to 999963
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   gender                 8984 non-null   category
 1   maritalstatus          6672 non-null   category
 2   highestgradecompleted  6663 non-null   float64 
 3   wageincome             5091 non-null   float64 
 4   gpaoverall             6004 non-null   float64 
 5   weeksworked17          6670 non-null   float64 
dtypes: category(2), float64(4)
memory usage: 368.8 KB


## Select columns by using "filter"

In [61]:
# List all column labels.
df.columns

Index(['gender', 'birthmonth', 'birthyear', 'highestgradecompleted',
       'maritalstatus', 'childathome', 'childnotathome', 'wageincome',
       'weeklyhrscomputer', 'weeklyhrstv', 'nightlyhrssleep', 'satverbal',
       'satmath', 'gpaoverall', 'gpaenglish', 'gpamath', 'gpascience',
       'highestdegree', 'govprovidejobs', 'govpricecontrols', 'govhealthcare',
       'govelderliving', 'govindhelp', 'govunemp', 'govincomediff',
       'govcollegefinance', 'govdecenthousing', 'govprotectenvironment',
       'weeksworked00', 'weeksworked01', 'weeksworked02', 'weeksworked03',
       'weeksworked04', 'weeksworked05', 'weeksworked06', 'weeksworked07',
       'weeksworked08', 'weeksworked09', 'weeksworked10', 'weeksworked11',
       'weeksworked12', 'weeksworked13', 'weeksworked14', 'weeksworked15',
       'weeksworked16', 'weeksworked17', 'colenrfeb97', 'colenroct97',
       'colenrfeb98', 'colenroct98', 'colenrfeb99', 'colenroct99',
       'colenrfeb00', 'colenroct00', 'colenrfeb01', 'col

In [63]:
# like= keeps the columns whose label contains the substring "weeksworked".
work_df = df.filter(like = "weeksworked")

In [65]:
# Summarize the weeksworked* columns selected by the filter.
work_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8984 entries, 100061 to 999963
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   weeksworked00  8603 non-null   float64
 1   weeksworked01  8564 non-null   float64
 2   weeksworked02  8556 non-null   float64
 3   weeksworked03  8490 non-null   float64
 4   weeksworked04  8458 non-null   float64
 5   weeksworked05  8403 non-null   float64
 6   weeksworked06  8340 non-null   float64
 7   weeksworked07  8272 non-null   float64
 8   weeksworked08  8186 non-null   float64
 9   weeksworked09  8146 non-null   float64
 10  weeksworked10  8054 non-null   float64
 11  weeksworked11  7968 non-null   float64
 12  weeksworked12  7747 non-null   float64
 13  weeksworked13  7680 non-null   float64
 14  weeksworked14  7612 non-null   float64
 15  weeksworked15  7389 non-null   float64
 16  weeksworked16  7068 non-null   float64
 17  weeksworked17  6670 non-null   float64
dtypes: flo

## Select columns based on "datatype"

In [90]:
# Count how many columns share each dtype. "category" shows up on several
# rows of the output - presumably because categorical dtypes with different
# category sets are treated as distinct dtypes (TODO confirm).
df.dtypes.value_counts()

float64     29
category    29
category    12
category    10
int64        2
category     1
category     1
category     1
category     1
category     1
category     1
Name: count, dtype: int64

In [92]:
# Keep only the columns whose dtype is categorical.
df_category = df.select_dtypes(include="category")

In [94]:
# Summarize the categorical-only frame.
df_category.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8984 entries, 100061 to 999963
Data columns (total 57 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   gender                 8984 non-null   category
 1   maritalstatus          6672 non-null   category
 2   weeklyhrscomputer      5792 non-null   category
 3   weeklyhrstv            6711 non-null   category
 4   highestdegree          8953 non-null   category
 5   govprovidejobs         1833 non-null   category
 6   govpricecontrols       1859 non-null   category
 7   govhealthcare          1874 non-null   category
 8   govelderliving         1872 non-null   category
 9   govindhelp             1815 non-null   category
 10  govunemp               1811 non-null   category
 11  govincomediff          1775 non-null   category
 12  govcollegefinance      1875 non-null   category
 13  govdecenthousing       1847 non-null   category
 14  govprotectenvironment  1860 non-null  

In [96]:
# Keep only the numeric columns (the int64 and float64 ones in this frame).
df_numeric = df.select_dtypes(include="number")

In [98]:
# Summarize the numeric-only frame.
df_numeric.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8984 entries, 100061 to 999963
Data columns (total 31 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   birthmonth             8984 non-null   int64  
 1   birthyear              8984 non-null   int64  
 2   highestgradecompleted  6663 non-null   float64
 3   childathome            4791 non-null   float64
 4   childnotathome         4791 non-null   float64
 5   wageincome             5091 non-null   float64
 6   nightlyhrssleep        6706 non-null   float64
 7   satverbal              1406 non-null   float64
 8   satmath                1407 non-null   float64
 9   gpaoverall             6004 non-null   float64
 10  gpaenglish             5798 non-null   float64
 11  gpamath                5766 non-null   float64
 12  gpascience             5684 non-null   float64
 13  weeksworked00          8603 non-null   float64
 14  weeksworked01          8564 non-null   float64
 15  we

## Select columns using a "regex" pattern

In [102]:
# regex= keeps the columns whose label matches the pattern; the unanchored
# pattern "income" matches anywhere in the label (wageincome, govincomediff).
df_income = df.filter(regex = "income")

In [104]:
# Summarize the columns matched by the regex filter.
df_income.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8984 entries, 100061 to 999963
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   wageincome     5091 non-null   float64 
 1   govincomediff  1775 non-null   category
dtypes: category(1), float64(1)
memory usage: 149.3 KB
