## Text Classification


Develop a classifier to distinguish between a male name and female name

In [1]:
def gender_features(word):
    return {'last_letter': word[-1]}

In [2]:
gender_features('Kirti')

{'last_letter': 'i'}

In [3]:
from nltk.corpus import names

In [7]:
names.words()

['Abagael',
 'Abagail',
 'Abbe',
 'Abbey',
 'Abbi',
 'Abbie',
 'Abby',
 'Abigael',
 'Abigail',
 'Abigale',
 'Abra',
 'Acacia',
 'Ada',
 'Adah',
 'Adaline',
 'Adara',
 'Addie',
 'Addis',
 'Adel',
 'Adela',
 'Adelaide',
 'Adele',
 'Adelice',
 'Adelina',
 'Adelind',
 'Adeline',
 'Adella',
 'Adelle',
 'Adena',
 'Adey',
 'Adi',
 'Adiana',
 'Adina',
 'Adora',
 'Adore',
 'Adoree',
 'Adorne',
 'Adrea',
 'Adria',
 'Adriaens',
 'Adrian',
 'Adriana',
 'Adriane',
 'Adrianna',
 'Adrianne',
 'Adrien',
 'Adriena',
 'Adrienne',
 'Aeriel',
 'Aeriela',
 'Aeriell',
 'Ag',
 'Agace',
 'Agata',
 'Agatha',
 'Agathe',
 'Aggi',
 'Aggie',
 'Aggy',
 'Agna',
 'Agnella',
 'Agnes',
 'Agnese',
 'Agnesse',
 'Agneta',
 'Agnola',
 'Agretha',
 'Aida',
 'Aidan',
 'Aigneis',
 'Aila',
 'Aile',
 'Ailee',
 'Aileen',
 'Ailene',
 'Ailey',
 'Aili',
 'Ailina',
 'Ailyn',
 'Aime',
 'Aimee',
 'Aimil',
 'Aina',
 'Aindrea',
 'Ainslee',
 'Ainsley',
 'Ainslie',
 'Ajay',
 'Alaine',
 'Alameda',
 'Alana',
 'Alanah',
 'Alane',
 'Alanna',
 

In [6]:
import nltk
nltk.download('names')

[nltk_data] Downloading package names to
[nltk_data]     /Users/harshitaramesh/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.


True

In [8]:
print(len(names.words()))

7944


In [11]:
# Establishing labels for this corpus
labeled_names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')])

In [13]:
import random
random.shuffle(labeled_names)

In [15]:
featuresets = [(gender_features(n), gender) for (n,gender) in labeled_names]

In [16]:
train_set, test_set = featuresets[5000:], featuresets[:2000]

In [17]:
import nltk
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [18]:
print(train_set)

[({'last_letter': 'y'}, 'female'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'y'}, 'male'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 't'}, 'male'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'n'}, 'male'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'o'}, 'male'), ({'last_letter': 'i'}, 'female'), ({'last_letter': 'b'}, 'female'), ({'last_letter': 'e'}, 'male'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'l'}, 'male'), ({'last_letter': 'i'}, 'male'), ({'last_letter': 'l'}, 'male'), ({'last_letter': 'y'}, 'male'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'h'}, 'female'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 't'}, 'male'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'd'}, 'male'), (

In [21]:
classifier.classify(gender_features('Trump'))

'male'

In [22]:
print(nltk.classify.accuracy(classifier, test_set))

0.773


## Vectorization

In [35]:
from sklearn.feature_extraction.text import CountVectorizer # TfidfVectorizor

In [38]:
vect = CountVectorizer(binary = True)
# A corpus is required for text comparison
corpus = ["This is natural language processing class", "Today we are learning about Text classification and vectorization"]
vect.fit(corpus)

vocab = vect.vocabulary_ # vocabulary_ will hold all the unique words in the corpus
print(vocab)
    
print(vect.fit_transform(corpus))

#from sklearn.metrics.pairwise import cosine_similarity
#similarity = cosine_similarity(vect.transform("","").)

{'this': 11, 'is': 5, 'natural': 8, 'language': 6, 'processing': 9, 'class': 3, 'today': 12, 'we': 14, 'are': 2, 'learning': 7, 'about': 0, 'text': 10, 'classification': 4, 'and': 1, 'vectorization': 13}
  (0, 11)	1
  (0, 5)	1
  (0, 8)	1
  (0, 6)	1
  (0, 9)	1
  (0, 3)	1
  (1, 12)	1
  (1, 14)	1
  (1, 2)	1
  (1, 7)	1
  (1, 0)	1
  (1, 10)	1
  (1, 4)	1
  (1, 1)	1
  (1, 13)	1


In [40]:
print(vect.transform(corpus))

  (0, 3)	1
  (0, 5)	1
  (0, 6)	1
  (0, 8)	1
  (0, 9)	1
  (0, 11)	1
  (1, 0)	1
  (1, 1)	1
  (1, 2)	1
  (1, 4)	1
  (1, 7)	1
  (1, 10)	1
  (1, 12)	1
  (1, 13)	1
  (1, 14)	1


In [43]:
from sklearn.metrics.pairwise import cosine_similarity

vect = CountVectorizer(binary = True)

corpus = ["I can see, feel and tell how one treats me","I appreciate all the beauty I can feel"]
vect.

similarity = cosine_similarity(vect.transform(corpus))

print(similarity.tolist())

NotFittedError: Vocabulary not fitted or provided