# Working with Data

In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame,Series

## Duplicates in DataFrames

In [3]:
# Build a small two-column frame that contains repeated rows,
# so the de-duplication calls below have something to act on.
dframe = DataFrame({
    'key1': ['A', 'A', 'B', 'B', 'B'],
    'key2': [2, 2, 2, 3, 3],
})
dframe

Unnamed: 0,key1,key2
0,A,2
1,A,2
2,B,2
3,B,3
4,B,3


In [4]:
# duplicated() returns a boolean Series: True marks a row that repeats an
# earlier row (keep='first' is the default, so the first occurrence is False).
dframe.duplicated(keep='first')

0    False
1     True
2    False
3    False
4     True
dtype: bool

In [5]:
# drop_duplicates() returns a new frame without the repeated rows;
# the first occurrence of each row is the one retained (the default).
dframe.drop_duplicates(keep='first')

Unnamed: 0,key1,key2
0,A,2
2,B,2
3,B,3


In [6]:
# Restrict the duplicate check to a single column: rows are compared on
# 'key1' only, so one row per distinct 'key1' value survives.
dframe.drop_duplicates(subset=['key1'])

Unnamed: 0,key1,key2
0,A,2
2,B,2


In [14]:
# The first occurrence is kept by default; keep='last' retains the
# final occurrence of each 'key1' value instead.
dframe.drop_duplicates(subset=['key1'], keep='last')

Unnamed: 0,key1,key2
1,A,2
4,B,3


## Mapping

* Add columns to DataFrame

In [15]:
import pandas as pd
import numpy as np
from pandas import DataFrame,Series

In [16]:
# Sample frame for the mapping demo (highest-elevation cities in the USA):
# one row per city, with its altitude.
dframe = DataFrame(dict(
    city=['Alma', 'Brian Head', 'Fox Park'],
    altitude=[3158, 3000, 2762],
))

# Display the frame
dframe

Unnamed: 0,city,altitude
0,Alma,3158
1,Brian Head,3000
2,Fox Park,2762


In [17]:
# To add a 'state' column we first describe the relationship as a plain
# dict: each city name (key) points to the state it belongs to (value).
state_map = {
    'Alma': 'Colorado',
    'Brian Head': 'Utah',
    'Fox Park': 'Wyoming',
}

In [19]:
# Look up every value of the 'city' column in state_map and store the
# result as a new 'state' column on the frame (na_action=None is the default).
dframe['state'] = dframe['city'].map(state_map, na_action=None)
dframe

Unnamed: 0,city,altitude,state
0,Alma,3158,Colorado
1,Brian Head,3000,Utah
2,Fox Park,2762,Wyoming


In [20]:
# Mapping is a great way to do element-wise transformations and other data cleaning operations!

## Replace

In [21]:
import numpy as np
import pandas as pd 
from pandas import Series, DataFrame

In [22]:
# Let's make a Series in which the values 1-4 appear twice
ser1 = Series([1, 2, 3, 4] * 2)
ser1

0    1
1    2
2    3
3    4
4    1
5    2
6    3
7    4
dtype: int64

In [23]:
# replace(to_replace, value): every element equal to 1 becomes NaN
# in the returned Series; ser1 itself is left unchanged.
ser1.replace(to_replace=1, value=np.nan)

0    NaN
1    2.0
2    3.0
3    4.0
4    NaN
5    2.0
6    3.0
7    4.0
dtype: float64

In [24]:
# Lists are matched position by position: 1 -> 100 and 4 -> 400
ser1.replace(to_replace=[1, 4], value=[100, 400])

0    100
1      2
2      3
3    400
4    100
5      2
6      3
7    400
dtype: int64

In [25]:
# A dict works too: each key is the value to find, its value is the replacement
ser1.replace(to_replace={4: np.nan})

0    1.0
1    2.0
2    3.0
3    NaN
4    1.0
5    2.0
6    3.0
7    NaN
dtype: float64