## Import libraries


In [3]:
import pandas as pd
import numpy as np

## Read the data

In [5]:
df = pd.read_csv("raw_data/nls97b.csv")

In [7]:
df.set_index("personid", inplace=True)

In [10]:
df.columns

Index(['gender', 'birthmonth', 'birthyear', 'highestgradecompleted',
       'maritalstatus', 'childathome', 'childnotathome', 'wageincome',
       'weeklyhrscomputer', 'weeklyhrstv', 'nightlyhrssleep', 'satverbal',
       'satmath', 'gpaoverall', 'gpaenglish', 'gpamath', 'gpascience',
       'highestdegree', 'govprovidejobs', 'govpricecontrols', 'govhealthcare',
       'govelderliving', 'govindhelp', 'govunemp', 'govincomediff',
       'govcollegefinance', 'govdecenthousing', 'govprotectenvironment',
       'weeksworked00', 'weeksworked01', 'weeksworked02', 'weeksworked03',
       'weeksworked04', 'weeksworked05', 'weeksworked06', 'weeksworked07',
       'weeksworked08', 'weeksworked09', 'weeksworked10', 'weeksworked11',
       'weeksworked12', 'weeksworked13', 'weeksworked14', 'weeksworked15',
       'weeksworked16', 'weeksworked17', 'colenrfeb97', 'colenroct97',
       'colenrfeb98', 'colenroct98', 'colenrfeb99', 'colenroct99',
       'colenrfeb00', 'colenroct00', 'colenrfeb01', 'col

## Edit all values based on a scalar

### Multiply overall gpa by 100

In [14]:
df.gpaoverall.head()

personid
100061    3.06
100139     NaN
100284     NaN
100292    3.45
100583    2.91
Name: gpaoverall, dtype: float64

In [16]:
gpaoverall_100 = df['gpaoverall']*100

In [20]:
gpaoverall_100.head()

personid
100061    306.0
100139      NaN
100284      NaN
100292    345.0
100583    291.0
Name: gpaoverall, dtype: float64

## Setting values using index labels

In [23]:
df.loc[[100061], "gpaoverall"] = 3

df.loc[[100139,100284,100292], 'gpaoverall'] = 0

df.gpaoverall.head()

personid
100061    3.00
100139    0.00
100284    0.00
100292    0.00
100583    2.91
Name: gpaoverall, dtype: float64

## Set values using an operator on more than one series

### Using + operator to calculate total number of children from 2 columns - childathome and childnotathome

In [29]:
df.maritalstatus.value_counts()

maritalstatus
Married          3066
Never-married    2766
Divorced          663
Separated         154
Widowed            23
Name: count, dtype: int64

In [33]:
df['totalchildren'] = df.childathome = df.childnotathome

df.totalchildren.value_counts().sort_index()

totalchildren
0.0     3711
1.0      575
2.0      292
3.0      119
4.0       53
5.0       25
6.0        8
7.0        4
8.0        3
12.0       1
Name: count, dtype: int64

### Checking the data of children for married people

In [36]:
df.loc[df.maritalstatus == 'Married',['childathome', 'childnotathome', 'totalchildren']]

Unnamed: 0_level_0,childathome,childnotathome,totalchildren
personid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100061,0.0,0.0,0.0
100139,0.0,0.0,0.0
100583,0.0,0.0,0.0
100833,0.0,0.0,0.0
101288,0.0,0.0,0.0
...,...,...,...
998472,,,
998556,0.0,0.0,0.0
998997,5.0,5.0,5.0
999291,,,


## Set the values of some values for the mean 

In [42]:
df.loc[100061:100292, 'gpaoverall'] = df.gpaoverall.mean().round(2)

df.gpaoverall.head()

personid
100061    2.82
100139    2.82
100284    2.82
100292    2.82
100583    2.91
Name: gpaoverall, dtype: float64

## Set the values using position

In [46]:
df.iloc[0,13] =2

df.iloc[1:4 , 13] = 1

df.gpaoverall.head()

personid
100061    2.00
100139    1.00
100284    1.00
100292    1.00
100583    2.91
Name: gpaoverall, dtype: float64

## Set the gpa values after filtering

### Change all the gpa values over 4 to 4

In [52]:
df.gpaoverall.nlargest()

personid
312410    4.17
639701    4.11
850001    4.10
279096    4.08
620216    4.07
Name: gpaoverall, dtype: float64

In [54]:
df.loc[df.gpaoverall >4, 'gpaoverall'] = 4

In [56]:
df.gpaoverall.nlargest()

personid
112756    4.0
119784    4.0
160193    4.0
250666    4.0
271961    4.0
Name: gpaoverall, dtype: float64