## Import libraries

In [2]:
import pandas as pd
import numpy as np


## Read the data

In [5]:
# Load the raw CSV into a DataFrame (the path is relative to the working directory).
df = pd.read_csv("raw_data/nls97b.csv")

In [7]:
# Make personid the row index. Reassigning instead of using inplace=True keeps the
# step chainable and avoids mutating an object that earlier cells already displayed.
df = df.set_index("personid")

In [9]:
# List the column labels available in df.
df.columns

Index(['gender', 'birthmonth', 'birthyear', 'highestgradecompleted',
       'maritalstatus', 'childathome', 'childnotathome', 'wageincome',
       'weeklyhrscomputer', 'weeklyhrstv', 'nightlyhrssleep', 'satverbal',
       'satmath', 'gpaoverall', 'gpaenglish', 'gpamath', 'gpascience',
       'highestdegree', 'govprovidejobs', 'govpricecontrols', 'govhealthcare',
       'govelderliving', 'govindhelp', 'govunemp', 'govincomediff',
       'govcollegefinance', 'govdecenthousing', 'govprotectenvironment',
       'weeksworked00', 'weeksworked01', 'weeksworked02', 'weeksworked03',
       'weeksworked04', 'weeksworked05', 'weeksworked06', 'weeksworked07',
       'weeksworked08', 'weeksworked09', 'weeksworked10', 'weeksworked11',
       'weeksworked12', 'weeksworked13', 'weeksworked14', 'weeksworked15',
       'weeksworked16', 'weeksworked17', 'colenrfeb97', 'colenroct97',
       'colenrfeb98', 'colenroct98', 'colenrfeb99', 'colenroct99',
       'colenrfeb00', 'colenroct00', 'colenrfeb01', 'col

## Descriptive statistics of gpaoverall column

In [12]:
# Pull the overall-GPA column out as its own object for the cells below.
gpa_overall = df["gpaoverall"]

In [14]:
# Confirm the type of the object selected from the DataFrame.
type(gpa_overall)

pandas.core.series.Series

In [20]:
# Summary statistics (count, mean, std, min, quartiles, max), shown to two decimals.
gpa_summary = gpa_overall.describe()
gpa_summary.round(2)

count    6004.00
mean        2.82
std         0.62
min         0.10
25%         2.43
50%         2.86
75%         3.26
max         4.17
Name: gpaoverall, dtype: float64

## Getting quantiles 

In [23]:
# Deciles 0.1 ... 1.0. np.linspace fixes the number of points and pins the endpoint at
# exactly 1.0; np.arange with a fractional step has a rounding-dependent length, and any
# value above 1.0 would make quantile() raise.
gpa_overall.quantile(np.linspace(0.1, 1.0, 10))

0.1    2.02
0.2    2.31
0.3    2.52
0.4    2.70
0.5    2.86
0.6    3.01
0.7    3.17
0.8    3.36
0.9    3.60
1.0    4.17
Name: gpaoverall, dtype: float64

## Show descriptives for a subset of the series

### Filtering the data

In [26]:
# Boolean mask: GPA in [3, 3.5] (between() includes both endpoints by default).
in_range = gpa_overall.between(3, 3.5)
# Preview the first five matching rows (head() defaults to 5).
gpa_overall[in_range].head()

personid
100061    3.06
100292    3.45
101526    3.37
101527    3.26
102125    3.14
Name: gpaoverall, dtype: float64

### Getting the sum

In [31]:
# Total of all GPAs that fall in [3, 3.5].
gpa_3_to_3_5 = gpa_overall[gpa_overall.between(3, 3.5)]
gpa_3_to_3_5.sum()

5416.26

### Getting the count

In [34]:
# Number of non-missing GPAs that fall in [3, 3.5].
between_3_and_3_5 = gpa_overall.between(3, 3.5)
gpa_overall[between_3_and_3_5].count()

1679

### GPA overall less than 2 or greater than 4

In [38]:
# Flag GPAs below 2 or above 4, then draw a reproducible sample of five of them.
is_extreme = gpa_overall.lt(2) | gpa_overall.gt(4)
gpa_overall[is_extreme].sample(n=5, random_state=2)

personid
932782    1.90
561335    1.82
850001    4.10
292455    1.97
644271    1.97
Name: gpaoverall, dtype: float64

## Getting the stats of GPA overall above the 99th percentile

In [41]:
# Cut-off at the 0.99 quantile, then summarise only the values strictly above it.
p99_cutoff = gpa_overall.quantile(0.99)
top_gpas = gpa_overall[gpa_overall.gt(p99_cutoff)]
top_gpas.agg(["count", "min", "max", "sum", "mean"])

count     60.000000
min        3.980000
max        4.170000
sum      240.560000
mean       4.009333
Name: gpaoverall, dtype: float64

## Testing conditions across all values

### Check if any GPA values are greater than 4

In [47]:
# True if at least one GPA exceeds 4.
gpa_overall.gt(4).any()

True

### Check whether all GPA values are above or equal to zero

In [50]:
# A comparison against a missing value (NaN) evaluates to False, so this returns False
# whenever the series has missing GPAs, even if every recorded GPA is non-negative.
gpa_overall.ge(0).all()

False

### Number of people with GPA >= 0

In [53]:
# Summing the boolean mask counts the True entries, i.e. GPAs that are >= 0.
gpa_overall.ge(0).sum()

6004

### Number of people with GPA = 0

In [56]:
# Count GPAs that are exactly zero.
gpa_overall.eq(0).sum()

0

### Number of people with missing values in GPA

In [58]:
# Count missing GPA values (isna is the same check as isnull).
gpa_overall.isna().sum()

2980

## Show stats for a subset of the series based on values in a different column

In [63]:
# Mean overall GPA for people whose wage income is above the 75th percentile.
wage_q3 = df["wageincome"].quantile(0.75)
df.loc[df["wageincome"].gt(wage_q3), "gpaoverall"].mean()

3.080417101147028

In [65]:
# Mean overall GPA for people whose wage income is below the 25th percentile.
wage_q1 = df["wageincome"].quantile(0.25)
df.loc[df["wageincome"].lt(wage_q1), "gpaoverall"].mean()

2.7201434159061284

## Show descriptives and frequencies for a series containing a categorical variable

In [68]:
# For an object-dtype column, describe() reports count, unique, top and freq.
df["maritalstatus"].describe()

count        6672
unique          5
top       Married
freq         3066
Name: maritalstatus, dtype: object

### Frequencies

In [71]:
# Frequency of each marital-status category, most common first.
df["maritalstatus"].value_counts()

maritalstatus
Married          3066
Never-married    2766
Divorced          663
Separated         154
Widowed            23
Name: count, dtype: int64