## Import libraries

In [39]:
import pandas as pd

## Read the data

In [42]:
df = pd.read_csv("raw_data/nls97.csv")

## Set index

In [45]:
df.set_index("personid", inplace = True)

In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8984 entries, 100061 to 999963
Data columns (total 88 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   gender                 8984 non-null   object 
 1   birthmonth             8984 non-null   int64  
 2   birthyear              8984 non-null   int64  
 3   highestgradecompleted  6663 non-null   float64
 4   maritalstatus          6672 non-null   object 
 5   childathome            4791 non-null   float64
 6   childnotathome         4791 non-null   float64
 7   wageincome             5091 non-null   float64
 8   weeklyhrscomputer      5792 non-null   object 
 9   weeklyhrstv            6711 non-null   object 
 10  nightlyhrssleep        6706 non-null   float64
 11  satverbal              1406 non-null   float64
 12  satmath                1407 non-null   float64
 13  gpaoverall             6004 non-null   float64
 14  gpaenglish             5798 non-null   float64
 15  gp

## Convert datatype from Object to category

In [54]:
for col in df.select_dtypes(['object']).columns:
    df[col] = df[col].astype('category')

In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8984 entries, 100061 to 999963
Data columns (total 88 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   gender                 8984 non-null   category
 1   birthmonth             8984 non-null   int64   
 2   birthyear              8984 non-null   int64   
 3   highestgradecompleted  6663 non-null   float64 
 4   maritalstatus          6672 non-null   category
 5   childathome            4791 non-null   float64 
 6   childnotathome         4791 non-null   float64 
 7   wageincome             5091 non-null   float64 
 8   weeklyhrscomputer      5792 non-null   category
 9   weeklyhrstv            6711 non-null   category
 10  nightlyhrssleep        6706 non-null   float64 
 11  satverbal              1406 non-null   float64 
 12  satmath                1407 non-null   float64 
 13  gpaoverall             6004 non-null   float64 
 14  gpaenglish             5798 non-null  

## Display the columns with datatype as category

In [61]:
cat_cols = df.select_dtypes(include = ["category"]).columns


In [63]:
df[cat_cols].isnull().sum()

gender                      0
maritalstatus            2312
weeklyhrscomputer        3192
weeklyhrstv              2273
highestdegree              31
govprovidejobs           7151
govpricecontrols         7125
govhealthcare            7110
govelderliving           7112
govindhelp               7169
govunemp                 7173
govincomediff            7209
govcollegefinance        7109
govdecenthousing         7137
govprotectenvironment    7124
colenrfeb97              7734
colenroct97               483
colenrfeb98               483
colenroct98                96
colenrfeb99               119
colenroct99               133
colenrfeb00               164
colenroct00               179
colenrfeb01               198
colenroct01               226
colenrfeb02               252
colenroct02               286
colenrfeb03               326
colenroct03               362
colenrfeb04               406
colenroct04               438
colenrfeb05               476
colenroct05               513
colenrfeb0

## Show the frequencies for marital status column

In [66]:
df.maritalstatus.value_counts()

maritalstatus
Married          3066
Never-married    2766
Divorced          663
Separated         154
Widowed            23
Name: count, dtype: int64

## Show the percentages instead of counts

In [68]:
df.maritalstatus.value_counts(normalize = True, sort = False)

maritalstatus
Divorced         0.099371
Married          0.459532
Never-married    0.414568
Separated        0.023082
Widowed          0.003447
Name: proportion, dtype: float64

## Show the percentages of all government responsibility columns

In [72]:
df.filter(like = "gov").apply(pd.value_counts, normalize = True)

  df.filter(like = "gov").apply(pd.value_counts, normalize = True)
  df.filter(like = "gov").apply(pd.value_counts, normalize = True)
  df.filter(like = "gov").apply(pd.value_counts, normalize = True)
  df.filter(like = "gov").apply(pd.value_counts, normalize = True)
  df.filter(like = "gov").apply(pd.value_counts, normalize = True)
  df.filter(like = "gov").apply(pd.value_counts, normalize = True)
  df.filter(like = "gov").apply(pd.value_counts, normalize = True)
  df.filter(like = "gov").apply(pd.value_counts, normalize = True)
  df.filter(like = "gov").apply(pd.value_counts, normalize = True)
  df.filter(like = "gov").apply(pd.value_counts, normalize = True)


Unnamed: 0,govprovidejobs,govpricecontrols,govhealthcare,govelderliving,govindhelp,govunemp,govincomediff,govcollegefinance,govdecenthousing,govprotectenvironment
1. Definitely,0.247681,0.541689,0.665422,0.700321,0.42865,0.218112,0.324507,0.7344,0.442339,0.668817
2. Probably,0.336607,0.334051,0.271078,0.247863,0.411019,0.403092,0.284507,0.2304,0.433676,0.286559
3. Probably not,0.252046,0.086606,0.045358,0.037927,0.119008,0.262838,0.228732,0.026667,0.100162,0.02957
4. Definitely not,0.163666,0.037655,0.018143,0.013889,0.041322,0.115958,0.162254,0.008533,0.023822,0.015054


## Find percentages for all government responsibility columns of people who are married

In [79]:
df[df.maritalstatus == 'Married'].filter(like = 'gov').apply(pd.value_counts, normalize = True)

  df[df.maritalstatus == 'Married'].filter(like = 'gov').apply(pd.value_counts, normalize = True)
  df[df.maritalstatus == 'Married'].filter(like = 'gov').apply(pd.value_counts, normalize = True)
  df[df.maritalstatus == 'Married'].filter(like = 'gov').apply(pd.value_counts, normalize = True)
  df[df.maritalstatus == 'Married'].filter(like = 'gov').apply(pd.value_counts, normalize = True)
  df[df.maritalstatus == 'Married'].filter(like = 'gov').apply(pd.value_counts, normalize = True)
  df[df.maritalstatus == 'Married'].filter(like = 'gov').apply(pd.value_counts, normalize = True)
  df[df.maritalstatus == 'Married'].filter(like = 'gov').apply(pd.value_counts, normalize = True)
  df[df.maritalstatus == 'Married'].filter(like = 'gov').apply(pd.value_counts, normalize = True)
  df[df.maritalstatus == 'Married'].filter(like = 'gov').apply(pd.value_counts, normalize = True)
  df[df.maritalstatus == 'Married'].filter(like = 'gov').apply(pd.value_counts, normalize = True)


Unnamed: 0,govprovidejobs,govpricecontrols,govhealthcare,govelderliving,govindhelp,govunemp,govincomediff,govcollegefinance,govdecenthousing,govprotectenvironment
1. Definitely,0.173315,0.459864,0.56445,0.632293,0.372549,0.147265,0.25889,0.695418,0.356354,0.644505
2. Probably,0.328748,0.380952,0.359566,0.310719,0.445378,0.395512,0.273115,0.261456,0.493094,0.313433
3. Probably not,0.314993,0.112925,0.05156,0.037992,0.142857,0.328191,0.284495,0.033693,0.120166,0.028494
4. Definitely not,0.182944,0.046259,0.024423,0.018996,0.039216,0.129032,0.183499,0.009434,0.030387,0.013569


## Find the frequencies and percentages for all category columns in the dataframe

In [85]:
freq_out = open('views/frequencies.txt', 'w')

for col in df.select_dtypes(include = ['category']):
    print (col, "---------------", 
           "frequencies",df[col].value_counts(sort= False), 
           "percentages", 
           df[col].value_counts(normalize = True, sort = False),
           sep = "\n\n",
           end = "\n\n\n",
           file = freq_out)

freq_out.close()