In [31]:
import pandas as pd
import re

In [84]:
data_loc = './data/'
clean_loc = './data/clean/'
d_sets = ['Indian-Male-Names.csv','Indian-Female-Names.csv']
d_sets

['Indian-Male-Names.csv', 'Indian-Female-Names.csv']

## Mapping Info

In [91]:
map_info = {'gender': {'m':0,'f':1}, 'race' : {'b':0,'black':0,'hispanic':1,'white':2,'indian':3}}
map_info

{'gender': {'m': 0, 'f': 1},
 'race': {'b': 0, 'black': 0, 'hispanic': 1, 'white': 2, 'indian': 3}}

In [21]:
def disp_d(name,d_loc):
    d2 = pd.read_csv(d_loc+name)
    print(name,'\n',d2.head())
    print('size :',len(d2))
    print('-----------------')

In [23]:
for i in d_sets:
    disp_d(i,data_loc)

Indian-Male-Names.csv 
               name gender    race
0          barjraj      m  indian
1     ramdin verma      m  indian
2  sharat chandran      m  indian
3  birender mandal      m  indian
4             amit      m  indian
size : 14845
-----------------
Indian-Female-Names.csv 
               name gender    race
0          shivani      f  indian
1             isha      f  indian
2  smt shyani devi      f  indian
3            divya      f  indian
4            mansi      f  indian
size : 15382
-----------------


In [15]:
ind_m = pd.read_csv(data_loc+d_sets[0])
ind_f = pd.read_csv(data_loc+d_sets[1])

### 1) Check for NULL Values

In [13]:
ind_m.isna().sum()

name      24
gender     0
race       0
dtype: int64

In [16]:
ind_f.isna().sum()

name      31
gender     0
race       0
dtype: int64

### 2) Check for Duplicates

In [19]:
ind_m.duplicated().sum()

6325

In [20]:
ind_f.duplicated().sum()

8608

## Drop duplicates

In [21]:
ind_f.drop_duplicates(inplace=True)

In [22]:
len(ind_f)

6774

In [23]:
ind_m.drop_duplicates(inplace=True)
len(ind_m)

8520

In [27]:
ind_m.dropna(inplace=True)
len(ind_m)

8519

In [29]:
ind_f.dropna(inplace=True)
len(ind_m)

8519

### Prepocess

In [24]:
def name_prep(word):
    return re.sub(r'[^a-z.\s]+','',word)

#### Creates preprocessed,cleaned dataset csv files

In [101]:
def preprocess_ds(name,sp=True):
    d2 = pd.read_csv(data_loc+name)
    d3 = pd.DataFrame([])
    if(sp):
        d3['name'] = d2.iloc[:,1]
    else:
        d3['name'] = d2.name
    d3.name = d3.name.apply(lambda x:name_prep(str(x)))
    d3['gender'] = d2.gender.apply(lambda x:map_info['gender'][x])
    d3['race'] = d2.race.apply(lambda x:map_info['race'][x])
    
    print(name,'\n')
    print('Before :',len(d3))
    print('Duplicates :', d3.duplicated().sum())
    print('Null :', d3.isna().sum())
    d3.drop_duplicates(inplace=True)
    d3.dropna(inplace=True)
    print('After :',len(d3))
    #d3.head()
    d3.to_csv(clean_loc+name, index=False)
    print('-----------------')

In [102]:
sp_sets = ['Black-Male-Names.csv','Black-Female-Names.csv','Hispanic-Male-Names.csv','Hispanic-Female-Names.csv','White-Male-Names.csv','White-Female-Names.csv']
sp_sets

['Black-Male-Names.csv',
 'Black-Female-Names.csv',
 'Hispanic-Male-Names.csv',
 'Hispanic-Female-Names.csv',
 'White-Male-Names.csv',
 'White-Female-Names.csv']

In [106]:
for i in sp_sets:
    preprocess_ds(i)

Black-Male-Names.csv 

Before : 35081
Duplicates : 19862
Null : name      0
gender    0
race      0
dtype: int64
After : 15219
-----------------
Black-Female-Names.csv 

Before : 2438
Duplicates : 243
Null : name      0
gender    0
race      0
dtype: int64
After : 2195
-----------------
Hispanic-Male-Names.csv 

Before : 4166
Duplicates : 1982
Null : name      0
gender    0
race      0
dtype: int64
After : 2184
-----------------
Hispanic-Female-Names.csv 

Before : 217
Duplicates : 15
Null : name      0
gender    0
race      0
dtype: int64
After : 202
-----------------
White-Male-Names.csv 

Before : 44048
Duplicates : 31022
Null : name      0
gender    0
race      0
dtype: int64
After : 13026
-----------------
White-Female-Names.csv 

Before : 4600
Duplicates : 1857
Null : name      0
gender    0
race      0
dtype: int64
After : 2743
-----------------


In [107]:
for i in d_sets:
    preprocess_ds(i,False)

Indian-Male-Names.csv 

Before : 14845
Duplicates : 6366
Null : name      0
gender    0
race      0
dtype: int64
After : 8479
-----------------
Indian-Female-Names.csv 

Before : 15382
Duplicates : 8679
Null : name      0
gender    0
race      0
dtype: int64
After : 6703
-----------------


### FInal results

In [108]:
for i in sp_sets+d_sets:
    disp_d(i,clean_loc)

Black-Male-Names.csv 
             name  gender  race
0       aaric a        0     0
1   dominique j        0     0
2    fredrick r        0     0
3       jarvis         0     0
4      lorenzo         0     0
size : 15219
-----------------
Black-Female-Names.csv 
             name  gender  race
0   tashanika l        1     0
1     denetra c        1     0
2     tomesha d        1     0
3    trellany c        1     0
4     cynthia s        1     0
size : 2195
-----------------
Hispanic-Male-Names.csv 
            name  gender  race
0   jonathan r        0     1
1   jonathan a        0     1
2       pablo         0     1
3    basilio l        0     1
4      german         0     1
size : 2184
-----------------
Hispanic-Female-Names.csv 
           name  gender  race
0   shirley m        1     1
1       ana d        1     1
2     miriam         1     1
3     ivette         1     1
4      saray         1     1
size : 202
-----------------
White-Male-Names.csv 
            name  gender  race