# Text Classification

In [75]:
import nltk
import random
# Fetch the NLTK "names" corpus that the following cells read from; nltk logs
# a message when the local copy is already up-to-date.
nltk.download('names')

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\16504\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!


True

In [76]:
# Import the corpus reader under an alias so that the `names` list built below
# does not rebind (and hide) the imported corpus object.
from nltk.corpus import names as names_corpus

# Labelled examples: every entry of male.txt tagged 'male', followed by every
# entry of female.txt tagged 'female'.
names = (
    [(name, 'male') for name in names_corpus.words('male.txt')]
    + [(name, 'female') for name in names_corpus.words('female.txt')]
)
print(names[:10])

[('Aamir', 'male'), ('Aaron', 'male'), ('Abbey', 'male'), ('Abbie', 'male'), ('Abbot', 'male'), ('Abbott', 'male'), ('Abby', 'male'), ('Abdel', 'male'), ('Abdul', 'male'), ('Abdulkarim', 'male')]


In [77]:
# Shuffling Names
# Use a dedicated, seeded RNG so the train/test split -- and therefore every
# metric computed from it -- is reproducible after a kernel restart.
SHUFFLE_SEED = 42
# Sort into a canonical order first: the shuffle is in place, so without this
# re-running the cell would permute an already-shuffled list and give a
# different split each time.
names.sort()
random.Random(SHUFFLE_SEED).shuffle(names)

In [78]:
# Total number of labelled (name, gender) pairs, male and female lists combined
len(names)

7944

In [79]:
# Position of the 80/20 cut in the shuffled list: entries before it are used
# for training, entries from it onwards for testing.
TRAIN_FRACTION = 0.8
setIndex = int(TRAIN_FRACTION * len(names))
setIndex

6355

In [80]:
def gender_features(word):
    """Build the feature dict for a name from its last two characters.

    The suffix is lower-cased here so that a name yields the same feature
    whether or not the caller normalised its case first. The training
    examples in this notebook are built from lower-cased names, so a query
    such as 'AL' has to map to 'al' to match a feature the classifier has
    actually seen.

    Args:
        word: The name to extract the feature from. A name shorter than two
            characters contributes whatever characters it has.

    Returns:
        A dict with the single key 'last_letter' mapped to the lower-cased
        suffix. The key name is kept as-is for compatibility even though
        the value holds up to two characters.
    """
    return {'last_letter': word[-2:].lower()}


In [81]:
# Sanity check: the feature dict for a sample name holds its last two characters
gender_features('Shrek')

{'last_letter': 'ek'}

In [82]:
# Sanity check: the shape of one labelled example, a (feature dict, label) tuple
(gender_features('Shrek'), 'male')

({'last_letter': 'ek'}, 'male')

In [83]:
# Turn every (name, gender) pair into a (feature dict, gender) example;
# each name is lower-cased before its features are extracted.
featuresets = [
    (gender_features(name.lower()), label)
    for name, label in names
]
featuresets[:10]

[({'last_letter': 'ie'}, 'female'),
 ({'last_letter': 'ia'}, 'female'),
 ({'last_letter': 'in'}, 'male'),
 ({'last_letter': 'is'}, 'male'),
 ({'last_letter': 'na'}, 'female'),
 ({'last_letter': 'tt'}, 'male'),
 ({'last_letter': 'av'}, 'male'),
 ({'last_letter': 'ta'}, 'female'),
 ({'last_letter': 'dy'}, 'male'),
 ({'last_letter': 'ey'}, 'male')]

In [84]:
# Separate the examples into a training set (before setIndex) and a test set
# (from setIndex onwards)
train_set = featuresets[:setIndex]
test_set = featuresets[setIndex:]

In [85]:
# Fit NLTK's Naive Bayes classifier on the training portion of the feature sets
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [86]:
# Predict the label for the name 'Neo'.
# NOTE: the training features were built from lower-cased names; the last two
# letters of 'Neo' are already lower-case, so the feature is comparable.
classifier.classify(gender_features('Neo'))

'male'

In [87]:
# Predict the label for the name 'Trinity'.
# NOTE: the training features were built from lower-cased names; the last two
# letters of 'Trinity' are already lower-case, so the feature is comparable.
classifier.classify(gender_features('Trinity'))

'female'

In [88]:
# Accuracy of the trained classifier on the held-out test set
test_accuracy = nltk.classify.accuracy(classifier, test_set)
print(test_accuracy)

0.7721837633731907


In [89]:
# List the 10 most informative feature values together with the ratio between
# the two labels for each of them
classifier.show_most_informative_features(10)


Most Informative Features
             last_letter = 'na'           female : male   =     88.3 : 1.0
             last_letter = 'la'           female : male   =     63.3 : 1.0
             last_letter = 'us'             male : female =     57.2 : 1.0
             last_letter = 'ia'           female : male   =     44.8 : 1.0
             last_letter = 'rd'             male : female =     34.3 : 1.0
             last_letter = 'ra'           female : male   =     31.2 : 1.0
             last_letter = 'ta'           female : male   =     26.8 : 1.0
             last_letter = 'rt'             male : female =     24.1 : 1.0
             last_letter = 'ch'             male : female =     23.2 : 1.0
             last_letter = 'do'             male : female =     21.0 : 1.0


* In the run shown above, names in the training set that end in `na` are female about 88 times more often than they are male, while names that end in `rd` are male about 34 times more often than they are female. These ratios are known as likelihood ratios, and can be useful for comparing different feature-outcome relationships.