# Gender Identification.

### &emsp; &emsp; 1. Import Libraries.

In [1]:
import nltk
import random

### &emsp; &emsp; 2. Importing Names.

In [2]:
from nltk.corpus import names

# Load both word lists that ship with the NLTK `names` corpus.
Male_Names, Female_Names = (names.words(fileid) for fileid in ('male.txt', 'female.txt'))

# Preview the first five entries of each list.
print('Sample Male Names:-   ',Male_Names[:5])
print('Sample Female Names:- ',Female_Names[:5])

Sample Male Names:-    ['Aamir', 'Aaron', 'Abbey', 'Abbie', 'Abbot']
Sample Female Names:-  ['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi']


### &emsp; &emsp; 3. Tagging Male, Female & Shuffling The Names.

In [3]:
# Pair every name with its gender tag; all male names come first, then all female names.
Name_List = [(Name , 'male') for Name in Male_Names] + [(Name , 'female') for Name in Female_Names]

# Print sample names from both ends of the (still ordered) list.
print(Name_List[:5],'\n')
print(Name_List[-5:])

# Shuffle with a fixed seed so that every Restart & Run All produces the same
# ordering (and therefore the same train/test split and accuracy). A dedicated
# Random instance is used so the global `random` state is left untouched.
random.Random(42).shuffle(Name_List)

# Print sample names after shuffling.
print('\n #### Suffled names:-')
Name_List[-5:]

[('Aamir', 'male'), ('Aaron', 'male'), ('Abbey', 'male'), ('Abbie', 'male'), ('Abbot', 'male')] 

[('Zorine', 'female'), ('Zsa Zsa', 'female'), ('Zsazsa', 'female'), ('Zulema', 'female'), ('Zuzana', 'female')]

 #### Suffled names:-


[('Doug', 'male'),
 ('Maryellen', 'female'),
 ('Dwayne', 'male'),
 ('Liam', 'male'),
 ('Gunter', 'male')]

### &emsp; &emsp; 4. Creating Features.

In [4]:
def Name_Feature (name):
    """Build the character-level feature dictionary for one name.

    The features are the first and last one, two and three characters of
    ``name``. No normalisation is applied, so the features are case-sensitive.

    Parameters
    ----------
    name : str
        The name to describe. May be empty or shorter than three characters.

    Returns
    -------
    dict
        Keys ``last_char``, ``last_two_char``, ``last_three_char``,
        ``first_char``, ``first_two_char`` and ``first_three_char``. For a
        name shorter than the requested width the value is simply the
        available characters (an empty string for an empty name).
    """
    return {
        # Slices instead of name[-1] / name[0]: identical for non-empty names,
        # but they yield '' rather than raising IndexError on an empty string.
        'last_char': name[-1:],
        'last_two_char': name[-2:],
        'last_three_char': name[-3:],
        'first_char': name[:1],
        'first_two_char': name[:2],
        'first_three_char': name[:3]
    }

# Turn every (name, gender) pair into a (feature-dict, gender) example.
features = [(Name_Feature(name), label) for name, label in Name_List]
features[0:1]

[({'last_char': 'e',
   'last_two_char': 'fe',
   'last_three_char': 'ffe',
   'first_char': 'R',
   'first_two_char': 'Ra',
   'first_three_char': 'Rad'},
  'male')]

### &emsp; &emsp; 5. Split The Data Into Training and Testing Set.

In [5]:
# The first 70 % of the examples go to training, the remainder to testing.
Split_Index = round(len(features) * .7)
Training_Set, Testing_Set = features[:Split_Index], features[Split_Index:]

# Report the size of each subset, one per line.
print(len(Training_Set), len(Testing_Set), sep='\n')

5561
2383


### &emsp; &emsp; 6. Training The Model.

In [6]:
# Fit NLTK's Naive Bayes classifier on the training examples only.
classifier = nltk.NaiveBayesClassifier.train(Training_Set)

### &emsp; &emsp; 7. Testing The Accuracy.

In [7]:
# Accuracy of the classifier on the held-out testing set, shown as a percentage.
Test_Accuracy = nltk.classify.accuracy(classifier, Testing_Set)
round(Test_Accuracy * 100, 2)

84.39

### &emsp; &emsp; 8. Prediction.

In [8]:
# The corpus names are capitalised and Name_Feature is case-sensitive, so the
# query uses the same casing the model was trained on (a lowercase 'tom' would
# produce first_* feature values that never occur in the training data).
classifier.classify(Name_Feature('Tom'))

'male'

### &emsp; &emsp; 9. Training The Model On Full Data.

In [9]:
# Refit on every labelled example (not just the training slice) to obtain the final model.
Final_Classifier_Model = nltk.NaiveBayesClassifier.train(features)

##  Save the model as a serialized file which can be stored anywhere:-

In [10]:
import pickle
import os

# Serialize the final model to a pickle file in the current working directory.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('Final_Classifier_Model.pkl', 'wb') as fileWriteStream:
    pickle.dump(Final_Classifier_Model, fileWriteStream)

print('pickle file of Predictive Model is saved at Location:',os.getcwd())

pickle file of Predictive Model is saved at Location: C:\Users\Pranab_Kumar_Paul\Desktop\Python_Script\My_Script\Python_Project(GitHub)\UnSupervised__(Text_Data)


## Create a Python function:-

In [11]:
def Identify_Person_Gender(Text):
    """Predict a gender for the name found in each sentence of ``Text``.

    The text is title-cased and split into sentences. Each sentence is
    tokenized and POS-tagged; the tokens tagged ``NNP`` or ``NN`` are taken as
    the name of that sentence (several such tokens are joined with ", ").
    The name is then classified with the model stored in
    ``Final_Classifier_Model.pkl``.

    Parameters
    ----------
    Text : str
        Free text containing zero or more sentences.

    Returns
    -------
    pandas.DataFrame
        Columns ``Name`` and ``Gender``, one row per sentence in which a name
        was found. Sentences without any usable name are skipped.

    Raises
    ------
    FileNotFoundError
        If ``Final_Classifier_Model.pkl`` cannot be found relative to the
        current working directory.
    """
    import re
    import pickle
    import pandas as pd
    import nltk
    from nltk.tokenize import word_tokenize, sent_tokenize

    ### Import The Final Model.
    # NOTE: unpickling can execute arbitrary code - only load a model file
    # that you created yourself. The `with` block closes the file on exit.
    with open('Final_Classifier_Model.pkl', 'rb') as fileReadStream:
        Prediction_Model = pickle.load(fileReadStream)

    Final_Name = []
    Final_Gender = []

    ### Tokenize To Sentence --
    for sent in sent_tokenize(Text.title()):
        ### Tokenize the Sentence and POS-tag the tokens --
        pos_tags = nltk.pos_tag(word_tokenize(sent))

        ### Extract The Name.
        noun_tokens = [token for token, tag in pos_tags if tag in ('NNP', 'NN')]
        # Join the tokens directly rather than via str(list): the list repr
        # adds brackets/quotes and escape sequences whose letters would
        # survive the clean-up below.
        Person_Name = re.sub(r'[^a-z A-Z . ,]', r'', ', '.join(noun_tokens))

        # No noun in this sentence (or nothing but separators left after the
        # clean-up): there is nothing to classify, and indexing an empty name
        # would raise IndexError, so skip the sentence.
        if not Person_Name.strip(' .,'):
            continue

        # Same feature layout the model was trained with.
        Person_Name_feature = {
            'last_char': Person_Name[-1],
            'last_two_char': Person_Name[-2:],
            'last_three_char': Person_Name[-3:],
            'first_char': Person_Name[0],
            'first_two_char': Person_Name[:2],
            'first_three_char': Person_Name[:3],
        }

        # Generating Predictions
        Final_Name.append(Person_Name)
        Final_Gender.append(Prediction_Model.classify(Person_Name_feature))

    prediction_result = pd.DataFrame({'Name': Final_Name, 'Gender': Final_Gender})

    return prediction_result

## Calling The Function.

In [12]:
# Example input: three short sentences, each mentioning one first name in mixed casing.
Identify_Person_Gender('pranab is now working. what about Rita. tom is sleeping')

Unnamed: 0,Name,Gender
0,Pranab,male
1,Rita,female
2,Tom,male
