In [3]:
import pandas as pd
import re

In [35]:
data_loc = './data/'
clean_loc = './data/clean/'
d_sets = ['Indian-Male-Names.csv','Indian-Female-Names.csv']
d_sets

['Indian-Male-Names.csv', 'Indian-Female-Names.csv']

## Mapping Info

In [36]:
map_info = {'gender': {'m':0,'f':1}, 'race' : {'b':0,'black':0,'hispanic':1,'white':2,'indian':3}}
map_info

{'gender': {'m': 0, 'f': 1},
 'race': {'b': 0, 'black': 0, 'hispanic': 1, 'white': 2, 'indian': 3}}

In [37]:
def disp_d(name,d_loc):
    d2 = pd.read_csv(d_loc+name)
    print(name,'\n',d2.head())
    print('size :',len(d2))
    print('-----------------')

In [38]:
for i in d_sets:
    disp_d(i,data_loc)

Indian-Male-Names.csv 
               name gender    race
0          barjraj      m  indian
1     ramdin verma      m  indian
2  sharat chandran      m  indian
3  birender mandal      m  indian
4             amit      m  indian
size : 14845
-----------------
Indian-Female-Names.csv 
               name gender    race
0          shivani      f  indian
1             isha      f  indian
2  smt shyani devi      f  indian
3            divya      f  indian
4            mansi      f  indian
size : 15382
-----------------


In [39]:
ind_m = pd.read_csv(data_loc+d_sets[0])
ind_f = pd.read_csv(data_loc+d_sets[1])

### 1) Check for NULL Values

In [40]:
ind_m.isna().sum()

name      24
gender     0
race       0
dtype: int64

In [41]:
ind_f.isna().sum()

name      31
gender     0
race       0
dtype: int64

### 2) Check for Duplicates

In [42]:
ind_m.duplicated().sum()

6325

In [43]:
ind_f.duplicated().sum()

8608

## Drop duplicates

In [44]:
ind_f.drop_duplicates(inplace=True)

In [45]:
len(ind_f)

6774

In [46]:
ind_m.drop_duplicates(inplace=True)
len(ind_m)

8520

In [47]:
ind_m.dropna(inplace=True)
len(ind_m)

8519

In [48]:
ind_f.dropna(inplace=True)
len(ind_m)

8519

### Prepocess

#### Removes
* numbers and other non alphabets characters
* initials
* sentence start with along, with
* removes address from name

In [62]:
def name_prep(word):
    #te = re.sub(r'/.*|@.*','',word).strip()
    te = str(word).lower()
    te = re.sub(r'[^a-z.\s].*','',te).strip()
    te = re.sub(r'[^a-z ]+','',te).strip()
    te = re.sub(r' +',' ',te).strip()
    te = re.sub(r'along.*|with.*|and.*',' ',te).strip()
    #te = re.sub(r'\b\w$|\b\w\w$','',te).strip()
    te = re.sub(r'\b\w\b|\b\w\w\b','',te).strip()
    te = re.sub(r'na|NaN','',te).strip()
    if(len(te) < 3):
        return ''
    else:
        return te

In [63]:
name_prep('priyanka bagcdani nn d/o gulab bagcdani r/o 11/95 c.h')

'priyanka bagcdani'

In [66]:
name_prep('A manoJ  kumar , jawalaprasad, r/o j.j. colony sawada,')

'manoj kumar'

#### Creates preprocessed,cleaned dataset csv files

In [67]:
def preprocess_ds(name,sp=True):
    d2 = pd.read_csv(data_loc+name)
    d3 = pd.DataFrame([])
    if(sp):
        d3['name'] = d2.iloc[:,1]
    else:
        d3['name'] = d2.name
    d3.name = d3.name.apply(lambda x:name_prep(str(x)))
    d3['gender'] = d2.gender.apply(lambda x:map_info['gender'][x])
    d3['race'] = d2.race.apply(lambda x:map_info['race'][x])
    
    print(name,'\n')
    print('Before :',len(d3))
    print('Duplicates :', d3.duplicated().sum())
    print('Null :', d3.isna().sum())
    d3.drop_duplicates(inplace=True)
    d3.dropna(inplace=True)
    print('After :',len(d3))
    #d3.head()
    d3.to_csv(clean_loc+name, index=False)
    print('-----------------')

In [68]:
sp_sets = ['Black-Male-Names.csv','Black-Female-Names.csv','Hispanic-Male-Names.csv','Hispanic-Female-Names.csv','White-Male-Names.csv','White-Female-Names.csv']
sp_sets

['Black-Male-Names.csv',
 'Black-Female-Names.csv',
 'Hispanic-Male-Names.csv',
 'Hispanic-Female-Names.csv',
 'White-Male-Names.csv',
 'White-Female-Names.csv']

In [None]:
for i in sp_sets:
    preprocess_ds(i)

In [55]:
for i in d_sets:
    preprocess_ds(i,False)

Indian-Male-Names.csv 

Before : 14845
Duplicates : 7439
Null : name      0
gender    0
race      0
dtype: int64
After : 7406
-----------------
Indian-Female-Names.csv 

Before : 15382
Duplicates : 9665
Null : name      0
gender    0
race      0
dtype: int64
After : 5717
-----------------


### FInal results

In [56]:
for i in sp_sets+d_sets:
    disp_d(i,clean_loc)

Black-Male-Names.csv 
         name  gender  race
0      aaric       0     0
1  dominique       0     0
2   fredrick       0     0
3     jarvis       0     0
4    lorenzo       0     0
size : 6981
-----------------
Black-Female-Names.csv 
         name  gender  race
0  tashanika       1     0
1    denetra       1     0
2    tomesha       1     0
3   trellany       1     0
4    cynthia       1     0
size : 1436
-----------------
Hispanic-Male-Names.csv 
       name  gender  race
0   jothan       0     1
1    pablo       0     1
2  basilio       0     1
3   german       0     1
4   javier       0     1
size : 872
-----------------
Hispanic-Female-Names.csv 
       name  gender  race
0  shirley       1     1
1      NaN       1     1
2   miriam       1     1
3   ivette       1     1
4    saray       1     1
size : 168
-----------------
White-Male-Names.csv 
        name  gender  race
0     billy       0     2
1   charles       0     2
2  clarence       0     2
3    justin       0     2
4  