In [1]:
import numpy as np
import pandas as pd

### Dealing with empty spaces

In [49]:
# Read the user CSV with pandas' default parser settings and display the result.
user_csv = r'./data/user_data.csv'
df = pd.read_csv(user_csv)
df

Unnamed: 0,Unnamed: 1,Name,Age,Doj,Address
""" Rama Nigam""",23,"""01-07-2020""",""" A1",Janpath,"New Delhi-01"""
""" Alkesh Kumar""",27,"""01-07-2020""",""" K10",Shivar Chowk,"Pune-27 """
""" Atul Tiwari """,30,"""03-Jul-2020""",""" L20",Kalkaji,"Lucknow -21"""
Vijay Kumar,29,"""04-Jul-2020""",""" """,,


In [50]:
# Re-read the same file, this time skipping the spaces that follow each delimiter.
user_csv = r'./data/user_data.csv'
df = pd.read_csv(user_csv, skipinitialspace=True)
df

Unnamed: 0,Name,Age,Doj,Address
0,Rama Nigam,23,01-07-2020,"A1, Janpath, New Delhi-01"
1,Alkesh Kumar,27,01-07-2020,"K10, Shivar Chowk, Pune-27"
2,Atul Tiwari,30,03-Jul-2020,"L20, Kalkaji, Lucknow -21"
3,Vijay Kumar,29,04-Jul-2020,


In [51]:
# Compare the first row's name with the spelling that has no surrounding spaces.
df.loc[0, 'Name'] == "Rama Nigam"

False

In [52]:
# Compare the first row's name with the spelling padded by one space on each side.
df.loc[0, 'Name'] == " Rama Nigam "

True

In [53]:
# Character count of every name, using the vectorised string accessor.
# Unlike a Python-level `len` call per element, `.str.len()` leaves missing
# values as NaN instead of raising a TypeError.
df['Name'].str.len()

0    12
1    13
2    13
3    12
Name: Name, dtype: int64

In [54]:
# Remove leading and trailing whitespace from every string cell.
# The pattern is a raw string so that `\s` reaches the regex engine untouched
# instead of being read as an (invalid) Python string escape.
df = df.replace({r"^\s+|\s+$": ""}, regex=True)

# Empty strings should be represented as NaN.
df = df.replace({"": np.nan})

In [55]:
# Character count of every name after the clean-up.
# The previous cell turns empty strings into NaN, so use the vectorised
# accessor: it yields NaN for missing values where `len` would raise.
df['Name'].str.len()

0    10
1    12
2    11
3    11
Name: Name, dtype: int64

In [56]:
# Display the frame again after the whitespace clean-up.
df

Unnamed: 0,Name,Age,Doj,Address
0,Rama Nigam,23,01-07-2020,"A1, Janpath, New Delhi-01"
1,Alkesh Kumar,27,01-07-2020,"K10, Shivar Chowk, Pune-27"
2,Atul Tiwari,30,03-Jul-2020,"L20, Kalkaji, Lucknow -21"
3,Vijay Kumar,29,04-Jul-2020,


In [57]:
# The first name should now equal the spelling without surrounding spaces.
df.loc[0, "Name"] == "Rama Nigam"

True

### Strip

In [61]:
# Reload the file, additionally asking pandas to parse the Doj column as dates.
# NOTE(review): in the output shown for this cell '01-07-2020' appears as
# 2020-01-07 (month first) while '03-Jul-2020' appears as 2020-07-03. If Doj is
# meant to be day-first, `dayfirst=True` is probably needed - confirm against
# the data source.
user_csv = r'./data/user_data.csv'
df = pd.read_csv(user_csv, skipinitialspace=True, parse_dates=['Doj'])
df

Unnamed: 0,Name,Age,Doj,Address
0,Rama Nigam,23,2020-01-07,"A1, Janpath, New Delhi-01"
1,Alkesh Kumar,27,2020-01-07,"K10, Shivar Chowk, Pune-27"
2,Atul Tiwari,30,2020-07-03,"L20, Kalkaji, Lucknow -21"
3,Vijay Kumar,29,2020-07-04,


In [62]:
# Inspect the dtype pandas assigned to each column.
df.dtypes

Name               object
Age                 int64
Doj        datetime64[ns]
Address            object
dtype: object

In [66]:
# Strip surrounding whitespace from every string-typed column, announcing each one.
string_columns = [name for name in df.columns if pd.api.types.is_string_dtype(df[name])]
for name in string_columns:
    print(f"Working on column: {name}")
    df[name] = df[name].str.strip()

# Cells that are empty strings are recorded as missing values.
df = df.replace({"": np.nan})

Working on column: Name
Working on column: Address


In [67]:
# Display the frame after stripping the string columns.
df

Unnamed: 0,Name,Age,Doj,Address
0,Rama Nigam,23,2020-01-07,"A1, Janpath, New Delhi-01"
1,Alkesh Kumar,27,2020-01-07,"K10, Shivar Chowk, Pune-27"
2,Atul Tiwari,30,2020-07-03,"L20, Kalkaji, Lucknow -21"
3,Vijay Kumar,29,2020-07-04,


In [69]:
# Check that the first name equals the spelling without surrounding spaces.
df.loc[0, 'Name'] == "Rama Nigam"

True

### Dealing with ids of specific format

In [130]:
# Load the ID file into `data`.
user_ids_csv = r'./data/user_ids.csv'
data = pd.read_csv(user_ids_csv)

In [131]:
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,PassportNumber
0,B5649076
1,0022730
2,O528485
3,3150943
4,L1114025


In [132]:
# Number of duplicates per column: total rows minus the count of distinct values.
row_count = len(data)
row_count - data.nunique()

PassportNumber    7
dtype: int64

In [133]:
# Drop repeated rows, keeping the first occurrence of each; `data` is modified in place.
data.drop_duplicates(keep="first", inplace=True)

In [134]:
# With duplicates removed, each column's distinct-value count should equal the row count.
data.nunique() == len(data)

PassportNumber    True
dtype: bool

In [135]:
# Shortest and longest passport-number length.
# `.str.len()` is vectorised and returns NaN for missing values, whereas
# applying `len` element by element would raise on them.
data["PassportNumber"].str.len().agg(["min", "max"])

min     3
max    17
Name: PassportNumber, dtype: int64

In [136]:
# Store each passport number's character count; the vectorised accessor keeps
# missing values as NaN instead of raising like a per-element `len` call.
data["Passport_len"] = data["PassportNumber"].str.len()

In [142]:
# Number of passport numbers whose length is 9.
# Use the Series reduction rather than the builtin `sum`, which would iterate
# the Series element by element in Python; `int()` keeps the displayed value a plain int.
int((data['Passport_len'] == 9).sum())

2552

In [140]:
# Show the rows whose passport number has the minimum length.
shortest_len = data["Passport_len"].min()
data[data["Passport_len"] == shortest_len]

Unnamed: 0,PassportNumber,Passport_len
893,179,3
10418,917,3
10557,237,3


In [141]:
# Show the rows whose passport number has the maximum length.
longest_len = data["Passport_len"].max()
data[data["Passport_len"] == longest_len]

Unnamed: 0,PassportNumber,Passport_len
5688,65361100000000000,17
5910,39598000000000000,17


In [143]:
# For each passport length, count the non-null entries in the remaining columns.
by_length = data.groupby('Passport_len')
by_length.count()

Unnamed: 0_level_0,PassportNumber
Passport_len,Unnamed: 1_level_1
3,3
4,21
5,80
6,305
7,4519
8,2709
9,2552
10,144
11,66
12,486


In [144]:
# Leading-zero count: full length minus the length once leading "0"s are stripped.
# Vectorised `.str` operations replace the per-element lambda and propagate
# missing values as NaN instead of raising.
passport_numbers = data["PassportNumber"]
data["leading_zeroes"] = passport_numbers.str.len() - passport_numbers.str.lstrip("0").str.len()

In [145]:
# Preview the first five rows with the new columns.
data.head(n=5)

Unnamed: 0,PassportNumber,Passport_len,leading_zeroes
0,B5649076,8,0
1,0022730,7,2
2,O528485,7,0
3,3150943,7,0
4,L1114025,8,0


In [146]:
# Number of passport numbers of length 9 that have no leading zero.
# The Series reduction avoids the builtin `sum`, which iterates in Python;
# `int()` keeps the displayed value a plain int.
is_len_9 = data["Passport_len"] == 9
has_no_leading_zero = data["leading_zeroes"] == 0
int((is_len_9 & has_no_leading_zero).sum())

2351

In [147]:
import re

In [148]:
# Flag passport numbers whose first character is an ASCII letter.
# `Series.str.match` anchors at the start of each string (like `re.match`) and
# already yields booleans, so no per-element lambda or `True if ... else False`
# is needed; `na=False` marks missing values as not matching.
data["starts_with_letter"] = data["PassportNumber"].str.match(r"[a-zA-Z]", na=False)

In [149]:
# Preview the first five rows including the starts_with_letter flag.
data.head(n=5)

Unnamed: 0,PassportNumber,Passport_len,leading_zeroes,starts_with_letter
0,B5649076,8,0,True
1,0022730,7,2,False
2,O528485,7,0,True
3,3150943,7,0,False
4,L1114025,8,0,True


In [150]:
# Index labels of the rows whose number starts with a letter and has length 9
# (the rows this notebook names "valid"), followed by those rows themselves.
meets_format = data["starts_with_letter"] & (data["Passport_len"] == 9)
valid_indx = data.loc[meets_format].index

data.loc[valid_indx]

Unnamed: 0,PassportNumber,Passport_len,leading_zeroes,starts_with_letter
17,G67C34857,9,0,True
28,MU2943007,9,0,True
29,O18073717,9,0,True
30,SE1801226,9,0,True
31,MM8260002,9,0,True
...,...,...,...,...
10880,J35890323,9,0,True
10881,X81731896,9,0,True
10889,IWZR25390,9,0,True
10894,W03M99141,9,0,True
