In [1]:
import numpy as np
import pandas as pd

### This notebook covers the following topics:
- Working with text data
- Indexing and Selecting data
- Statistical functions

#### Working with text data:

In [2]:
# Sample values mixing plain words, a space, a symbol, digits and a missing entry,
# so that each .str method below has something interesting to act on.
names = ['Tom', 'William Rick', 'John', 'Alber@t', np.nan, '1234', 'SteveSmith']
s = pd.Series(names)
s

0             Tom
1    William Rick
2            John
3         Alber@t
4             NaN
5            1234
6      SteveSmith
dtype: object

In [4]:
# str.lower: convert every string element to lower case (the missing entry stays NaN).
lowercased = s.str.lower()
lowercased

0             tom
1    william rick
2            john
3         alber@t
4             NaN
5            1234
6      stevesmith
dtype: object

In [5]:
# str.upper: convert every string element to upper case (the missing entry stays NaN).
uppercased = s.str.upper()
uppercased

0             TOM
1    WILLIAM RICK
2            JOHN
3         ALBER@T
4             NaN
5            1234
6      STEVESMITH
dtype: object

In [6]:
# str.len: number of characters in each element (NaN for the missing entry).
lengths = s.str.len()
lengths

0     3.0
1    12.0
2     4.0
3     7.0
4     NaN
5     4.0
6    10.0
dtype: float64

In [12]:
# Names padded with leading/trailing spaces, used to demonstrate str.strip.
padded_names = ['  John      ', 'Cena  ', 'Dwayne Johnson     ']
st = pd.Series(padded_names)
print(st)

0             John      
1                 Cena  
2    Dwayne Johnson     
dtype: object


In [13]:
# str.strip: remove leading and trailing whitespace from each element.
stripped = st.str.strip()
stripped

0              John
1              Cena
2    Dwayne Johnson
dtype: object

In [25]:
# str.split: break each string into a list of tokens on the given separator.
sp = pd.Series(['John Cena Randy Orton'])
tokens = sp.str.split(' ')
print(sp, '\n')
print(tokens)

0    John Cena Randy Orton
dtype: object 

0    [John, Cena, Randy, Orton]
dtype: object


In [28]:
# str.get_dummies: build one 0/1 indicator column per distinct value.
du = pd.Series(['John', 'Cena', 'Randy', 'Orton'])
print(du)
indicator_frame = du.str.get_dummies()
indicator_frame

0     John
1     Cena
2    Randy
3    Orton
dtype: object


Unnamed: 0,Cena,John,Orton,Randy
0,0,1,0,0
1,1,0,0,0
2,0,0,0,1
3,0,0,1,0


In [33]:
# str.contains: test each element for the pattern.
# The non-string entry (the integer 1) produces NaN rather than True/False.
co = pd.Series(['Jeff', 'Hardy', 1, 'Matt'])
has_jeff = co.str.contains('Jeff')
has_jeff

0     True
1    False
2      NaN
3    False
dtype: object

In [38]:
# replace(a, b): substitute occurrences of `a` with `b` in each element.
# NOTE: the variable name `re` shadows the standard-library `re` module; it is
# kept because the repeat/count cells further down refer to this Series by name.

re = pd.Series(['Jeff', 'Hardy', 'Randy', 'Matt'])
print(re)
# regex=False requests a literal (non-regex) replacement explicitly, so the
# result does not depend on the pandas version's default for `regex`
# (the default changed from True to False in pandas 2.0).
re.str.replace('Randy', 'Orton', regex=False)

0     Jeff
1    Hardy
2    Randy
3     Matt
dtype: object


0     Jeff
1    Hardy
2    Orton
3     Matt
dtype: object

In [40]:
# str.repeat(n): concatenate each string with itself n times.

print(re)
doubled = re.str.repeat(2)
doubled

0     Jeff
1    Hardy
2    Randy
3     Matt
dtype: object


0      JeffJeff
1    HardyHardy
2    RandyRandy
3      MattMatt
dtype: object

In [45]:
# str.count(pattern): number of occurrences of the pattern in each element.

print(re)
a_counts = re.str.count('a')
a_counts

0     Jeff
1    Hardy
2    Randy
3     Matt
dtype: object


0    0
1    1
2    1
3    1
dtype: int64

In [70]:
# str.find(sub): lowest index at which `sub` occurs in each string.
# A result of -1 means the substring was not found.

fi = pd.Series(['Prajwal', 'Nikhil', 'Samiksh1', 'Naveen', 200, '356'])
first_l_position = fi.str.find('l')
first_l_position

0    6.0
1    5.0
2   -1.0
3   -1.0
4    NaN
5   -1.0
dtype: float64

In [71]:
# str.findall(pattern): list of all matches of the pattern in each element.
a_matches = fi.str.findall('a')
a_matches

0    [a, a]
1        []
2       [a]
3       [a]
4       NaN
5        []
dtype: object

In [72]:
# str.swapcase: turn upper-case letters into lower case and vice versa.
case_swapped = fi.str.swapcase()
case_swapped

0     pRAJWAL
1      nIKHIL
2    sAMIKSH1
3      nAVEEN
4         NaN
5         356
dtype: object

In [73]:
# islower / isupper: per-element case checks, printed one after the other.
for case_check in (fi.str.islower, fi.str.isupper):
    print(case_check())

0    False
1    False
2    False
3    False
4      NaN
5    False
dtype: object
0    False
1    False
2    False
3    False
4      NaN
5    False
dtype: object


In [77]:
# Show the Series, then flag which elements consist solely of numeric characters.
print(fi)
numeric_flags = fi.str.isnumeric()
numeric_flags

0     Prajwal
1      Nikhil
2    Samiksh1
3      Naveen
4         200
5         356
dtype: object


0    False
1    False
2    False
3    False
4      NaN
5     True
dtype: object

#### Indexing and Selecting data:

In [79]:
# 6x4 frame of uniform random floats with string labels on both axes, used to
# demonstrate label-based selection with .loc.
# A seeded RandomState makes the values identical on every run and does not
# touch NumPy's global random state.
rng = np.random.RandomState(0)
df = pd.DataFrame(rng.rand(6, 4), columns=['A', 'B', 'C', 'D'], index=['s', 't', 'u', 'v', 'w', 'x'])
df

Unnamed: 0,A,B,C,D
s,0.057142,0.548238,0.621237,0.416992
t,0.564578,0.684469,0.640134,0.919305
u,0.491288,0.968421,0.322718,0.026373
v,0.150831,0.629901,0.025755,0.661979
w,0.941487,0.167416,0.424329,0.583579
x,0.96634,0.916059,0.556427,0.273504


In [96]:
# Label slices on both axes: rows 's' through 'v' and columns 'A' through 'D'.
# With .loc, both the start and the stop label of a slice are included.
row_labels = slice('s', 'v')
col_labels = slice('A', 'D')
df.loc[row_labels, col_labels]

Unnamed: 0,A,B,C,D
s,0.057142,0.548238,0.621237,0.416992
t,0.564578,0.684469,0.640134,0.919305
u,0.491288,0.968421,0.322718,0.026373
v,0.150831,0.629901,0.025755,0.661979


In [97]:
# Small labelled frame (rebinding `df`) used for the .loc examples that follow.
df = pd.DataFrame(
    data=[[1, 2], [4, 5], [7, 8]],
    index=['cobra', 'viper', 'sidewinder'],
    columns=['max_speed', 'shield'],
)
df

Unnamed: 0,max_speed,shield
cobra,1,2
viper,4,5
sidewinder,7,8


In [102]:
# A single row label returns that row as a Series.
viper_row = df.loc['viper']
viper_row

max_speed    4
shield       5
Name: viper, dtype: int64

In [119]:
# Passing a list of row labels (note the double brackets) yields a DataFrame.
wanted_rows = ['viper', 'sidewinder']
df.loc[wanted_rows]

Unnamed: 0,max_speed,shield
viper,4,5
sidewinder,7,8


In [120]:
# One row label plus one column label picks out a single value.
cobra_speed = df.loc['cobra', 'max_speed']
cobra_speed

1

In [100]:
# Full row slice (`:`) with a single column label: returns that column as a Series.
shield_column = df.loc[:, 'shield']
shield_column

cobra         2
viper         5
sidewinder    8
Name: shield, dtype: int64

In [104]:
# One row label with a list of column labels; the result follows the order of the list.
column_order = ['shield', 'max_speed']
df.loc['cobra', column_order]

shield       2
max_speed    1
Name: cobra, dtype: int64

In [118]:
# Boolean list with one entry per row: rows marked True are kept.
row_mask = [True, False, True]
df.loc[row_mask]

Unnamed: 0,max_speed
cobra,1
sidewinder,7


In [114]:
# Filter rows with the boolean Series produced by a comparison on the 'shield' column.
shield_above_4 = df['shield'] > 4
df.loc[shield_above_4]

Unnamed: 0,max_speed,shield
viper,4,5
sidewinder,7,8


In [117]:
# Boolean-Series row filter combined with an explicit list of column labels to return.
strong_shield = df['shield'] > 4
df.loc[strong_shield, ['max_speed']]

Unnamed: 0,max_speed
viper,4
sidewinder,7


In [125]:
# Assign one value to every cell addressed by the row-label list and the column-label list.
target_rows = ['viper', 'sidewinder']
target_cols = ['shield']
df.loc[target_rows, target_cols] = 50
df

Unnamed: 0,max_speed,shield
cobra,1,2
viper,4,50
sidewinder,7,50


In [126]:
# Assign a scalar to a whole row: every column of 'cobra' becomes 10.
row_label = 'cobra'
df.loc[row_label] = 10
df

Unnamed: 0,max_speed,shield
cobra,10,10
viper,4,50
sidewinder,7,50


In [151]:
# Assign a scalar to a whole column: every row of 'shield' becomes 20.
column_label = 'shield'
df.loc[:, column_label] = 20
df

Unnamed: 0,max_speed,shield
cobra,21,20
viper,21,20
sidewinder,21,20


#### Getting values with a MultiIndex

In [152]:
# Build a two-level row index from tuples and a small frame on top of it.
tuples = [
    ('cobra', 'mark i'),
    ('cobra', 'mark ii'),
    ('sidewinder', 'mark i'),
    ('sidewinder', 'mark ii'),
    ('viper', 'mark ii'),
    ('viper', 'mark iii'),
]
values = [
    [12, 2],
    [0, 4],
    [10, 20],
    [1, 4],
    [7, 1],
    [16, 36],
]
index = pd.MultiIndex.from_tuples(tuples)

dfi = pd.DataFrame(values, index=index, columns=['speed', 'shield'])
print(dfi)

                     speed  shield
cobra      mark i       12       2
           mark ii       0       4
sidewinder mark i       10      20
           mark ii       1       4
viper      mark ii       7       1
           mark iii     16      36


In [158]:
# A single first-level label returns the matching rows, indexed by the remaining level.
cobra_rows = dfi.loc['cobra']
cobra_rows

Unnamed: 0,speed,shield
mark i,12,2
mark ii,0,4


In [162]:
# A complete index tuple identifies exactly one row, which comes back as a Series.
row_key = ('cobra', 'mark ii')
dfi.loc[row_key]

speed     0
shield    4
Name: (cobra, mark ii), dtype: int64

In [163]:
# Without parentheses the two labels are still passed as the tuple ('cobra', 'mark ii'),
# so this selects that single row and returns it as a Series.
cobra_mark_ii = dfi.loc['cobra', 'mark ii']
cobra_mark_ii

speed     0
shield    4
Name: (cobra, mark ii), dtype: int64

In [168]:
# A list of first-level labels (note the ``[[]]``) returns a DataFrame that keeps
# the full MultiIndex. Every entry of the list must be a first-level label:
# 'mark ii' belongs to the second level, so it has been removed from the list
# (depending on the pandas version it is either silently ignored or raises KeyError).
dfi.loc[['cobra']]

Unnamed: 0,Unnamed: 1,speed,shield
cobra,mark i,12,2
cobra,mark ii,0,4


In [169]:
# Wrapping one complete index tuple in a list returns a one-row DataFrame.
single_key = [('cobra', 'mark ii')]
dfi.loc[single_key]

Unnamed: 0,Unnamed: 1,speed,shield
cobra,mark ii,0,4


In [170]:
# Display the complete MultiIndex frame again as a reference for the selections below.
dfi

Unnamed: 0,Unnamed: 1,speed,shield
cobra,mark i,12,2
cobra,mark ii,0,4
sidewinder,mark i,10,20
sidewinder,mark ii,1,4
viper,mark ii,7,1
viper,mark iii,16,36


In [171]:
# A complete row tuple plus a column label returns a single value.
row_key = ('cobra', 'mark i')
dfi.loc[row_key, 'shield']

2

In [188]:
# Row slice that starts at a complete index tuple and ends at a first-level label;
# every row under the end label is included.
tuple_to_label = slice(('cobra', 'mark i'), 'viper')
dfi.loc[tuple_to_label]

Unnamed: 0,Unnamed: 1,speed,shield
cobra,mark i,12,2
cobra,mark ii,0,4
sidewinder,mark i,10,20
sidewinder,mark ii,1,4
viper,mark ii,7,1
viper,mark iii,16,36


In [187]:
# A list of complete index tuples selects exactly those rows.
selected_keys = [('cobra', 'mark i'), ('viper', 'mark ii')]
dfi.loc[selected_keys]

Unnamed: 0,Unnamed: 1,speed,shield
cobra,mark i,12,2
viper,mark ii,7,1


In [184]:
# Row slice between two complete index tuples; both endpoints are included.
tuple_to_tuple = slice(('cobra', 'mark i'), ('viper', 'mark ii'))
dfi.loc[tuple_to_tuple]

Unnamed: 0,Unnamed: 1,speed,shield
cobra,mark i,12,2
cobra,mark ii,0,4
sidewinder,mark i,10,20
sidewinder,mark ii,1,4
viper,mark ii,7,1


#### Statistical functions:

#### Covariance:
- Covariance is applied on series data. 
- The Series object has a method cov to compute covariance between series objects. 
- NA will be excluded automatically.

In [189]:
# Two Series that move in opposite directions, so their covariance is negative.
s1 = pd.Series([1, 2, 3, 4, 5])
s2 = pd.Series([9, 8, 7, 6, 5])

covariance_s1_s2 = s1.cov(s2)
covariance_s1_s2

-2.5

In [190]:
# Covariance is symmetric: swapping the operands gives the same value.
covariance_s2_s1 = s2.cov(s1)
covariance_s2_s1

-2.5

- When applied to a DataFrame, the `cov` method computes the pairwise covariance between all columns.

In [191]:
# 6x4 frame of uniform random floats used for the covariance examples.
# A seeded RandomState makes the values identical on every run and does not
# touch NumPy's global random state.
rng = np.random.RandomState(1)
dfc = pd.DataFrame(rng.rand(6, 4), columns=['a', 'b', 'c', 'd'])
print(dfc)

          a         b         c         d
0  0.937095  0.161951  0.676514  0.700197
1  0.388484  0.001992  0.024697  0.004857
2  0.452530  0.245915  0.266046  0.105397
3  0.792728  0.457318  0.628988  0.360220
4  0.846500  0.764930  0.404781  0.165060
5  0.056613  0.740780  0.731530  0.955025


In [196]:
# DataFrame.cov returns the pairwise covariance matrix of the columns.
cov_matrix = dfc.cov()
cov_matrix

Unnamed: 0,a,b,c,d
a,0.114129,-0.011616,0.012899,-0.02825
b,-0.011616,0.098257,0.046708,0.046367
c,0.012899,0.046708,0.075839,0.09012
d,-0.02825,0.046367,0.09012,0.139178


In [195]:
# Covariance of two individual columns via Series.cov.
cov_a_b = dfc['a'].cov(dfc['b'])
cov_a_b

-0.01161598040541674

- Note: the covariance between columns `a` and `b` computed with `Series.cov` equals the corresponding `(a, b)` entry of the matrix returned by `DataFrame.cov`.

#### Correlation:
- Correlation measures the linear relationship between any two arrays of values (Series).
- There are multiple methods to compute the correlation like pearson(default), spearman and kendall.

In [197]:
# 8x4 frame of uniform random floats used for the correlation examples.
# A seeded RandomState makes the values identical on every run and does not
# touch NumPy's global random state.
rng = np.random.RandomState(2)
dfcor = pd.DataFrame(rng.rand(8, 4), columns=['a', 'b', 'c', 'd'])
print(dfcor)

          a         b         c         d
0  0.949878  0.639988  0.283401  0.445739
1  0.551817  0.716791  0.569518  0.723031
2  0.566394  0.696154  0.423161  0.065198
3  0.977555  0.164934  0.217843  0.469011
4  0.688896  0.928307  0.259435  0.247632
5  0.764776  0.001382  0.849351  0.101302
6  0.332606  0.075167  0.236328  0.358994
7  0.887119  0.203340  0.533336  0.406299


In [198]:
# DataFrame.corr returns the pairwise correlation matrix of the columns.
corr_matrix = dfcor.corr()
corr_matrix

Unnamed: 0,a,b,c,d
a,1.0,-0.079754,0.015289,0.094696
b,-0.079754,1.0,-0.286164,0.112301
c,0.015289,-0.286164,1.0,-0.206568
d,0.094696,0.112301,-0.206568,1.0


In [199]:
# Correlation of two individual columns via Series.corr.
corr_a_b = dfcor['a'].corr(dfcor['b'])
corr_a_b

-0.079754482487515

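- In pandas versions before 2.0, any non-numeric column in the DataFrame is excluded automatically. From pandas 2.0 onward, `numeric_only` defaults to `False`, so pass `numeric_only=True` (e.g. `dfcor.corr(numeric_only=True)`) to exclude non-numeric columns.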

#### Data Ranking:
- Data Ranking produces ranking for each element in the array of elements. In case of ties, assigns the mean rank.

In [217]:
# Small frame (rebinding `df`) with a tie (two 4s) and a missing value,
# so the different ranking options below give visibly different results.
animal_legs = {
    'Animal': ['cat', 'penguin', 'dog', 'spider', 'snake'],
    'Number_legs': [4, 2, 4, 8, np.nan],
}
df = pd.DataFrame(data=animal_legs)
print(df)

    Animal  Number_legs
0      cat          4.0
1  penguin          2.0
2      dog          4.0
3   spider          8.0
4    snake          NaN


- `rank` optionally takes an `ascending` parameter, which defaults to `True`; when it is `False`, the data is reverse-ranked, with larger values assigned a smaller rank.

In [237]:
# Rank 'Number_legs' in four different ways and store each result as a new column.
legs = df['Number_legs']
df['default_rank'] = legs.rank()                   # ties share the mean rank
df['max_rank'] = legs.rank(method='max')           # ties get the highest rank of their group
df['NA_bottom'] = legs.rank(na_option='bottom')    # the missing value is ranked last
df['pct_rank'] = legs.rank(pct=True)               # ranks expressed as a fraction
print(df)

    Animal  Number_legs  default_rank  max_rank  NA_bottom  pct_rank
0      cat          4.0           2.5       3.0        2.5     0.625
1  penguin          2.0           1.0       1.0        1.0     0.250
2      dog          4.0           2.5       3.0        2.5     0.625
3   spider          8.0           4.0       4.0        4.0     1.000
4    snake          NaN           NaN       NaN        5.0       NaN


In [243]:
# Compare the 'min' and 'dense' tie-handling methods on the same column.
for tie_method in ('min', 'dense'):
    print(df['Number_legs'].rank(method=tie_method))

0    2.0
1    1.0
2    2.0
3    4.0
4    NaN
Name: Number_legs, dtype: float64
0    2.0
1    1.0
2    2.0
3    3.0
4    NaN
Name: Number_legs, dtype: float64
