Cleaning a list of customers with phone numbers for a call centre

In [1]:
import pandas as pd
import numpy as np

In [2]:
# reading the sample file

df = pd.read_excel('Customer Call List.xlsx')

In [3]:
df

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No,True
1,1002,Abed,Nadir,123/643/9775,93 West Main Street,No,Yes,False
2,1003,Walter,/White,7066950392,298 Drugs Driveway,N,,True
3,1004,Dwight,Schrute,123-543-2345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Y,True
4,1005,Jon,Snow,876|678|3469,123 Dragons Road,Y,No,True
5,1006,Ron,Swanson,304-762-2467,768 City Parkway,Yes,Yes,True
6,1007,Jeff,Winger,,1209 South Street,No,No,False
7,1008,Sherlock,Holmes,876|678|3469,98 Clue Drive,N,No,False
8,1009,Gandalf,,N/a,123 Middle Earth,Yes,,False
9,1010,Peter,Parker,123-545-5421,"25th Main Street, New York",Yes,No,True


In [4]:
# Checking for null values
df.isnull().sum()

CustomerID           0
First_Name           0
Last_Name            1
Phone_Number         2
Address              0
Paying Customer      0
Do_Not_Contact       4
Not_Useful_Column    0
dtype: int64

In [5]:
# Checking for duplicate rows
df[df.duplicated()]

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column
20,1020,Anakin,Skywalker,876|678|3469,"910 Tatooine Road, Tatooine",Yes,N,True


In [6]:
# Removing duplicate rows
df = df.drop_duplicates()

In [7]:
df[df.duplicated()]

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column


While trying to apply .strip() method to column 'Last_Name', the following warning was encountered. To avoid such a warning in the future,
run the code in the next cell

In [31]:

df['Last_Name'] = df['Last_Name'].str.strip('...')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Last_Name'] = df['Last_Name'].str.strip('...')


In [8]:
pd.options.mode.chained_assignment = None

In [9]:
# Applying .strip() method to remove all special characters from the column 'Last_Name'
df['Last_Name'] = df['Last_Name'].str.strip('.../_')

In [10]:
df

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No,True
1,1002,Abed,Nadir,123/643/9775,93 West Main Street,No,Yes,False
2,1003,Walter,White,7066950392,298 Drugs Driveway,N,,True
3,1004,Dwight,Schrute,123-543-2345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Y,True
4,1005,Jon,Snow,876|678|3469,123 Dragons Road,Y,No,True
5,1006,Ron,Swanson,304-762-2467,768 City Parkway,Yes,Yes,True
6,1007,Jeff,Winger,,1209 South Street,No,No,False
7,1008,Sherlock,Holmes,876|678|3469,98 Clue Drive,N,No,False
8,1009,Gandalf,,N/a,123 Middle Earth,Yes,,False
9,1010,Peter,Parker,123-545-5421,"25th Main Street, New York",Yes,No,True


In [11]:
# Removing unwanted Columns

df = df.drop('Not_Useful_Column',axis=1)

In [12]:
df.head(5)

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No
1,1002,Abed,Nadir,123/643/9775,93 West Main Street,No,Yes
2,1003,Walter,White,7066950392,298 Drugs Driveway,N,
3,1004,Dwight,Schrute,123-543-2345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Y
4,1005,Jon,Snow,876|678|3469,123 Dragons Road,Y,No


In [None]:
Phone Numbers are the most important data, It will be cleaned as follows:
-- By removing rows with missing phone number
-- By removing special characters from phone numbers
-- By changing the format of the phone numbers

In [70]:
# Deleted rows with no phone numbers

df= df.dropna(subset = 'Phone_Number')


In [13]:
# removing special characters from phone numbers

df["Phone_Number"]=df["Phone_Number"].str.replace('[^a-zA-Z0-9]' , '' , regex=True)
# df["Phone_Number"]

In [14]:
df

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact
0,1001,Frodo,Baggins,1235455421,"123 Shire Lane, Shire",Yes,No
1,1002,Abed,Nadir,1236439775,93 West Main Street,No,Yes
2,1003,Walter,White,,298 Drugs Driveway,N,
3,1004,Dwight,Schrute,1235432345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Y
4,1005,Jon,Snow,8766783469,123 Dragons Road,Y,No
5,1006,Ron,Swanson,3047622467,768 City Parkway,Yes,Yes
6,1007,Jeff,Winger,,1209 South Street,No,No
7,1008,Sherlock,Holmes,8766783469,98 Clue Drive,N,No
8,1009,Gandalf,,Na,123 Middle Earth,Yes,
9,1010,Peter,Parker,1235455421,"25th Main Street, New York",Yes,No


In [15]:
df['Phone_Number']=df['Phone_Number'].replace('Na','')

In [16]:
# Replacing empty cells with NaN and removing them
df['Phone_Number']=df['Phone_Number'].replace('', np.nan)
df = df.dropna(subset = 'Phone_Number')

In [17]:
# changing the format of the phone numbers

def new_num(phone):
    str_num = str(phone)
    str_num = str_num[0:3]+'-'+str_num[3:6]+'-'+str_num[6:10]
    return(str_num)
    

In [18]:
df['Phone_Number']=df['Phone_Number'].apply(new_num)

In [19]:
df

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No
1,1002,Abed,Nadir,123-643-9775,93 West Main Street,No,Yes
3,1004,Dwight,Schrute,123-543-2345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Y
4,1005,Jon,Snow,876-678-3469,123 Dragons Road,Y,No
5,1006,Ron,Swanson,304-762-2467,768 City Parkway,Yes,Yes
7,1008,Sherlock,Holmes,876-678-3469,98 Clue Drive,N,No
9,1010,Peter,Parker,123-545-5421,"25th Main Street, New York",Yes,No
12,1013,Don,Draper,123-543-2345,2039 Main Street,Yes,N
13,1014,Leslie,Knope,876-678-3469,343 City Parkway,Yes,No
14,1015,Toby,Flenderson,304-762-2467,214 HR Avenue,N,No


In [20]:
# Cleaning the rest of the columns

df['Paying Customer']=df['Paying Customer'].replace(['Y','N'],['Yes','No'])
df['Do_Not_Contact']=df['Do_Not_Contact'].replace(['Y','N'],['Yes','No'])
# df['Paying Customer']


In [21]:
# Splitting the address column

df[['Street','State','Pin']]= df['Address'].str.split(',', n=2, expand = True)

In [22]:
df = df.fillna('')
df

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Street,State,Pin
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No,123 Shire Lane,Shire,
1,1002,Abed,Nadir,123-643-9775,93 West Main Street,No,Yes,93 West Main Street,,
3,1004,Dwight,Schrute,123-543-2345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Yes,980 Paper Avenue,Pennsylvania,18503.0
4,1005,Jon,Snow,876-678-3469,123 Dragons Road,Yes,No,123 Dragons Road,,
5,1006,Ron,Swanson,304-762-2467,768 City Parkway,Yes,Yes,768 City Parkway,,
7,1008,Sherlock,Holmes,876-678-3469,98 Clue Drive,No,No,98 Clue Drive,,
9,1010,Peter,Parker,123-545-5421,"25th Main Street, New York",Yes,No,25th Main Street,New York,
12,1013,Don,Draper,123-543-2345,2039 Main Street,Yes,No,2039 Main Street,,
13,1014,Leslie,Knope,876-678-3469,343 City Parkway,Yes,No,343 City Parkway,,
14,1015,Toby,Flenderson,304-762-2467,214 HR Avenue,No,No,214 HR Avenue,,


In [23]:
# Removing contacts that dont want to be contacted
for x in df.index:
    if df.loc[x , 'Do_Not_Contact']== 'Yes':
        df = df.drop(x)

df

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Street,State,Pin
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No,123 Shire Lane,Shire,
4,1005,Jon,Snow,876-678-3469,123 Dragons Road,Yes,No,123 Dragons Road,,
7,1008,Sherlock,Holmes,876-678-3469,98 Clue Drive,No,No,98 Clue Drive,,
9,1010,Peter,Parker,123-545-5421,"25th Main Street, New York",Yes,No,25th Main Street,New York,
12,1013,Don,Draper,123-543-2345,2039 Main Street,Yes,No,2039 Main Street,,
13,1014,Leslie,Knope,876-678-3469,343 City Parkway,Yes,No,343 City Parkway,,
14,1015,Toby,Flenderson,304-762-2467,214 HR Avenue,No,No,214 HR Avenue,,
15,1016,Ron,Weasley,123-545-5421,2395 Hogwarts Avenue,No,No,2395 Hogwarts Avenue,,
16,1017,Michael,Scott,123-643-9775,"121 Paper Avenue, Pennsylvania",Yes,No,121 Paper Avenue,Pennsylvania,
19,1020,Anakin,Skywalker,876-678-3469,"910 Tatooine Road, Tatooine",Yes,No,910 Tatooine Road,Tatooine,


In [24]:
df=df.reset_index(drop=True)

In [25]:
df.head(5)

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Street,State,Pin
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No,123 Shire Lane,Shire,
1,1005,Jon,Snow,876-678-3469,123 Dragons Road,Yes,No,123 Dragons Road,,
2,1008,Sherlock,Holmes,876-678-3469,98 Clue Drive,No,No,98 Clue Drive,,
3,1010,Peter,Parker,123-545-5421,"25th Main Street, New York",Yes,No,25th Main Street,New York,
4,1013,Don,Draper,123-543-2345,2039 Main Street,Yes,No,2039 Main Street,,
