# The `re` library is used for regular expressions
---------------
* A formal way of specifying a pattern
* It is the backbone of NLP work
* For more examples and theory, refer to [w3schools](https://www.w3schools.com/python/python_regex.asp)
* For more details, refer to the [Python `re` documentation](https://docs.python.org/3/library/re.html)

## 1. Exercise 1: Regular expressions

In [1]:
# 01. Import the regular expression module
import re
# 02. Example 1: match a phone number
# 02 - 1. Define the required pattern: three digits, three digits and four
#         digits separated by hyphens (e.g. 123-456-7890).
# A raw string is used so that `\d` reaches the regex engine untouched; in a
# plain string literal `\d` is an invalid escape sequence, which newer Python
# versions warn about.
# Note: .match() anchors only at the start of the string, so extra characters
# after the number are not rejected by this pattern.
pattern = re.compile(r'\d{3}-\d{3}-\d{4}')

In [2]:
pattern

re.compile(r'\d{3}-\d{3}-\d{4}', re.UNICODE)

In [3]:
# 02 - 2. Observe the result for a well-formed number.
# match() returns a match object on success and None on failure, so the
# comparison against None yields the same True/False that bool() would.
rs = pattern.match('123-456-7890')
rs is not None

True

In [4]:
# Try a malformed number with four leading digits: the pattern has to match
# from the very start of the string, so no match object comes back.
rs = pattern.match('1234-456-7890')
rs is not None

False

In [2]:
# 03. Example 2: verify dollar amount format
# 03 - 1. Define the required pattern: a literal "$", any number of digits,
#         a literal decimal point, then exactly two digits.
# The dot is escaped because a bare "." matches any character, which would
# accept strings such as "$13x25" or "$1325". A raw string keeps the
# backslashes intact for the regex engine.
pattern = re.compile(r'\$\d*\.\d{2}')
# 03 - 2. Observe result: match() returns a match object on success, else None
rs = pattern.match('$13.25')

In [3]:
rs

<_sre.SRE_Match object; span=(0, 6), match='$13.25'>

In [4]:
bool(rs)

True

In [5]:
# The same amount without the leading "$" must not match the pattern.
rs = pattern.match('13.25')
rs is not None

False

## 2. Exercise 2: Create a list by reading numbers from a string

In [4]:
import re
# 02. Find the numeric values and store them as a list : rs
# A raw string keeps `\d` intact for the regex engine (in a plain string
# literal it is an invalid escape sequence).
rs = re.findall(r'\d+', 'The recipe calls for 10 strawberries and 1 banana')
# 03. Check the result type, then show the matches.
# Only the last expression of a cell is displayed, so the type is verified
# with an assertion instead of a bare expression that would be discarded.
assert isinstance(rs, list)
rs

['10', '1']

In [11]:
# 04. Convert each matched digit string into an integer
rs = list(map(int, rs))
rs

[10, 1]

## 3. Dropping duplicate data
- The `.drop_duplicates()` method removes duplicate rows from a DataFrame


In [1]:
# 01. Import required module
import pandas as pd
# 02. Load the CSV file as a dataframe
# NOTE(review): this is an absolute, machine-specific path; the file has to
# exist at exactly this location for the cell to run.
csv_path = 'C:\\Users\\ramreddymyla\\Google Drive\\01 DS ML DL NLP and AI With Python Lab Copy\\02 Lab Data\\python\\emp_duplicate.csv'
df = pd.read_csv(csv_path)
# Summarise the columns, their non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 8 columns):
EMPNO       17 non-null int64
ENAME       17 non-null object
JOB         17 non-null object
MGR         16 non-null float64
HIREDATE    17 non-null object
SAL         17 non-null int64
COMM        4 non-null float64
DEPTNO      17 non-null int64
dtypes: float64(2), int64(3), object(3)
memory usage: 1.2+ KB


In [2]:
# Show only the last six rows of the frame so that repeated records at the
# end of the data can be inspected by eye.
df.tail(6)

Unnamed: 0,EMPNO,ENAME,JOB,MGR,HIREDATE,SAL,COMM,DEPTNO
11,7900,JAMES,CLERK,7698.0,03-DEC-81,950,,30
12,7902,FORD,ANALYST,7566.0,03-DEC-81,3000,,20
13,7934,MILLER,CLERK,7782.0,23-JAN-82,1300,,10
14,7900,JAMES,CLERK,7698.0,03-DEC-81,950,,30
15,7902,FORD,ANALYST,7566.0,03-DEC-81,3000,,20
16,7934,MILLER,CLERK,7782.0,23-JAN-82,1300,,10


In [3]:
# 03. Remove fully duplicated rows, keeping the first occurrence of each
#     (keep='first' is the pandas default) : df_no_duplicates
df_no_duplicates = df.drop_duplicates(keep='first')
# The entry count reported by info() shows how many rows remain
df_no_duplicates.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 14 entries, 0 to 13
Data columns (total 8 columns):
EMPNO       14 non-null int64
ENAME       14 non-null object
JOB         14 non-null object
MGR         13 non-null float64
HIREDATE    14 non-null object
SAL         14 non-null int64
COMM        4 non-null float64
DEPTNO      14 non-null int64
dtypes: float64(2), int64(3), object(3)
memory usage: 1008.0+ bytes


In [4]:
df_no_duplicates

Unnamed: 0,EMPNO,ENAME,JOB,MGR,HIREDATE,SAL,COMM,DEPTNO
0,7693,SMITH,CLERK,7902.0,17-DEC-80,960,,20
1,7499,ALLEN,SALESMAN,7698.0,20-FEB-81,1600,300.0,30
2,7521,WARD,SALESMAN,7698.0,22-FEB-81,1250,500.0,30
3,7566,JONES,MANAGER,7839.0,02-APR-81,2975,,20
4,7654,MARTIN,SALESMAN,7698.0,28-SEP-81,1250,1400.0,30
5,7698,BLAKE,MANAGER,7839.0,01-MAY-81,2850,,30
6,7782,CLARK,MANAGER,7839.0,09-JUN-81,2450,,10
7,7788,SCOTT,ANALYST,7566.0,19-APR-87,3000,,20
8,7839,KING,PRESIDENT,,17-NOV-81,5000,,10
9,7844,TURNER,SALESMAN,7698.0,08-SEP-81,1500,0.0,30


## 4. Filling missing data
- `.fillna()` method --> fill missing values with a business-specified value.
- `.dropna()` method --> drop the rows (or columns) that contain missing values.


In [16]:
# 01. import required module 
import pandas as pd
# 02. Load the CSV as a dataframe
# NOTE(review): this is an absolute, machine-specific path; the file has to
# exist at exactly this location for the cell to run.
csv_path = 'C:\\Users\\Hi\\Google Drive\\01 DS ML DL NLP and AI With Python Lab Copy\\02 Lab Data\\Python\\emp_missing.csv'
df = pd.read_csv(csv_path)
# Summarise the columns; the non-null counts reveal where data is missing
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 8 columns):
EMPNO       14 non-null int64
ENAME       14 non-null object
JOB         13 non-null object
MGR         13 non-null float64
HIREDATE    13 non-null object
SAL         14 non-null int64
COMM        4 non-null float64
DEPTNO      14 non-null int64
dtypes: float64(2), int64(3), object(3)
memory usage: 976.0+ bytes


In [17]:
df

Unnamed: 0,EMPNO,ENAME,JOB,MGR,HIREDATE,SAL,COMM,DEPTNO
0,7693,SMITH,CLERK,7902.0,17-DEC-80,960,,20
1,7499,ALLEN,,7698.0,20-FEB-81,1600,300.0,30
2,7521,WARD,SALESMAN,7698.0,22-FEB-81,1250,500.0,30
3,7566,JONES,MANAGER,7839.0,02-APR-81,2975,,20
4,7654,MARTIN,SALESMAN,7698.0,28-SEP-81,1250,1400.0,30
5,7698,BLAKE,MANAGER,7839.0,01-MAY-81,2850,,30
6,7782,CLARK,MANAGER,7839.0,09-JUN-81,2450,,10
7,7788,SCOTT,ANALYST,7566.0,,3000,,20
8,7839,KING,PRESIDENT,,17-NOV-81,5000,,10
9,7844,TURNER,SALESMAN,7698.0,08-SEP-81,1500,0.0,30


In [18]:
# 03. Calculate the mean of the COMM column, rounded to one decimal : COMM_mean
#     (Series.mean() skips missing values by default)
COMM_mean = round(df['COMM'].mean(), 1)

In [19]:
COMM_mean

550.0

In [20]:
# 04. Replace every missing value in the COMM column with the mean.
# The column of `df` itself is overwritten, so the original gaps can no
# longer be recovered from `df` once this cell has run.
df['COMM'] = df['COMM'].fillna(COMM_mean)

In [21]:
df.COMM

0      550.0
1      300.0
2      500.0
3      550.0
4     1400.0
5      550.0
6      550.0
7      550.0
8      550.0
9        0.0
10     550.0
11     550.0
12     550.0
13     550.0
Name: COMM, dtype: float64

In [22]:
# 05. Observe result: print the frame summary again so the non-null count of
#     each column can be compared with the summary printed right after loading.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 8 columns):
EMPNO       14 non-null int64
ENAME       14 non-null object
JOB         13 non-null object
MGR         13 non-null float64
HIREDATE    13 non-null object
SAL         14 non-null int64
COMM        14 non-null float64
DEPTNO      14 non-null int64
dtypes: float64(2), int64(3), object(3)
memory usage: 976.0+ bytes


In [23]:
df

Unnamed: 0,EMPNO,ENAME,JOB,MGR,HIREDATE,SAL,COMM,DEPTNO
0,7693,SMITH,CLERK,7902.0,17-DEC-80,960,550.0,20
1,7499,ALLEN,,7698.0,20-FEB-81,1600,300.0,30
2,7521,WARD,SALESMAN,7698.0,22-FEB-81,1250,500.0,30
3,7566,JONES,MANAGER,7839.0,02-APR-81,2975,550.0,20
4,7654,MARTIN,SALESMAN,7698.0,28-SEP-81,1250,1400.0,30
5,7698,BLAKE,MANAGER,7839.0,01-MAY-81,2850,550.0,30
6,7782,CLARK,MANAGER,7839.0,09-JUN-81,2450,550.0,10
7,7788,SCOTT,ANALYST,7566.0,,3000,550.0,20
8,7839,KING,PRESIDENT,,17-NOV-81,5000,550.0,10
9,7844,TURNER,SALESMAN,7698.0,08-SEP-81,1500,0.0,30


In [24]:
# 06. Drop every row that contains a missing value
#     (axis='index' removes rows; axis='columns' would remove columns instead)
df_dropped = df.dropna(axis='index')
df_dropped.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 11 entries, 0 to 13
Data columns (total 8 columns):
EMPNO       11 non-null int64
ENAME       11 non-null object
JOB         11 non-null object
MGR         11 non-null float64
HIREDATE    11 non-null object
SAL         11 non-null int64
COMM        11 non-null float64
DEPTNO      11 non-null int64
dtypes: float64(2), int64(3), object(3)
memory usage: 792.0+ bytes


In [25]:
df_dropped

Unnamed: 0,EMPNO,ENAME,JOB,MGR,HIREDATE,SAL,COMM,DEPTNO
0,7693,SMITH,CLERK,7902.0,17-DEC-80,960,550.0,20
2,7521,WARD,SALESMAN,7698.0,22-FEB-81,1250,500.0,30
3,7566,JONES,MANAGER,7839.0,02-APR-81,2975,550.0,20
4,7654,MARTIN,SALESMAN,7698.0,28-SEP-81,1250,1400.0,30
5,7698,BLAKE,MANAGER,7839.0,01-MAY-81,2850,550.0,30
6,7782,CLARK,MANAGER,7839.0,09-JUN-81,2450,550.0,10
9,7844,TURNER,SALESMAN,7698.0,08-SEP-81,1500,0.0,30
10,7876,ADAMS,CLERK,7788.0,23-MAY-87,1100,550.0,20
11,7900,JAMES,CLERK,7698.0,03-DEC-81,950,550.0,30
12,7902,FORD,ANALYST,7566.0,03-DEC-81,3000,550.0,20


In [27]:
# 01. import required module 
import pandas as pd
# 02. Load the CSV as a dataframe; this rebinds `df` to a frame read fresh
#     from disk.
# NOTE(review): this is an absolute, machine-specific path; the file has to
# exist at exactly this location for the cell to run.
csv_path = 'C:\\Users\\Hi\\Google Drive\\01 DS ML DL NLP and AI With Python Lab Copy\\02 Lab Data\\Python\\emp_missing.csv'
df = pd.read_csv(csv_path)


In [28]:
df

Unnamed: 0,EMPNO,ENAME,JOB,MGR,HIREDATE,SAL,COMM,DEPTNO
0,7693,SMITH,CLERK,7902.0,17-DEC-80,960,,20
1,7499,ALLEN,,7698.0,20-FEB-81,1600,300.0,30
2,7521,WARD,SALESMAN,7698.0,22-FEB-81,1250,500.0,30
3,7566,JONES,MANAGER,7839.0,02-APR-81,2975,,20
4,7654,MARTIN,SALESMAN,7698.0,28-SEP-81,1250,1400.0,30
5,7698,BLAKE,MANAGER,7839.0,01-MAY-81,2850,,30
6,7782,CLARK,MANAGER,7839.0,09-JUN-81,2450,,10
7,7788,SCOTT,ANALYST,7566.0,,3000,,20
8,7839,KING,PRESIDENT,,17-NOV-81,5000,,10
9,7844,TURNER,SALESMAN,7698.0,08-SEP-81,1500,0.0,30


In [29]:
# 03. Drop every row that has at least one missing value
#     (axis='index' removes rows; axis='columns' would remove columns instead)
df_dropped = df.dropna(axis='index')
df_dropped.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3 entries, 2 to 9
Data columns (total 8 columns):
EMPNO       3 non-null int64
ENAME       3 non-null object
JOB         3 non-null object
MGR         3 non-null float64
HIREDATE    3 non-null object
SAL         3 non-null int64
COMM        3 non-null float64
DEPTNO      3 non-null int64
dtypes: float64(2), int64(3), object(3)
memory usage: 216.0+ bytes


In [30]:
df_dropped

Unnamed: 0,EMPNO,ENAME,JOB,MGR,HIREDATE,SAL,COMM,DEPTNO
2,7521,WARD,SALESMAN,7698.0,22-FEB-81,1250,500.0,30
4,7654,MARTIN,SALESMAN,7698.0,28-SEP-81,1250,1400.0,30
9,7844,TURNER,SALESMAN,7698.0,08-SEP-81,1500,0.0,30


In [31]:
# 04. Assert that no cell of df_dropped holds a missing value
assert not df_dropped.isnull().values.any()


In [2]:
# 05. Assert that all COMM values are >= 0
# Series.all() already reduces the element-wise comparison to a single
# boolean, so no second .all() is chained onto it.
# This cell needs `df_dropped` to exist in the running kernel; run the cell
# that creates it first.
assert (df_dropped.COMM >= 0).all()


NameError: name 'df_dropped' is not defined