In [36]:
import nltk

In [37]:
from nltk.tokenize import sent_tokenize, word_tokenize, WordPunctTokenizer

### Tokenizing

In [38]:
input_text = "I am planning to buy a new branded sports car and doing lot of research on it. Possiblly I will buy next year. Do you know any sports cars"

In [39]:
print ("Sentense tokenizer")
print (sent_tokenize(input_text))

Sentense tokenizer
['I am planning to buy a new branded sports car and doing lot of research on it.', 'Possiblly I will buy next year.', 'Do you know any sports cars']


In [40]:
print ("Word tokenizer")
print (word_tokenize(input_text))

Word tokenizer
['I', 'am', 'planning', 'to', 'buy', 'a', 'new', 'branded', 'sports', 'car', 'and', 'doing', 'lot', 'of', 'research', 'on', 'it', '.', 'Possiblly', 'I', 'will', 'buy', 'next', 'year', '.', 'Do', 'you', 'know', 'any', 'sports', 'cars']


In [41]:
print ("Word punct tokenizer")
words = WordPunctTokenizer().tokenize(input_text)
print (words)

Word punct tokenizer
['I', 'am', 'planning', 'to', 'buy', 'a', 'new', 'branded', 'sports', 'car', 'and', 'doing', 'lot', 'of', 'research', 'on', 'it', '.', 'Possiblly', 'I', 'will', 'buy', 'next', 'year', '.', 'Do', 'you', 'know', 'any', 'sports', 'cars']


### Converting words to their Base forms
Stemming Algorithems</br>
Porter    - Least strict (slow)</br>
Lancaster - Strictest (Fast)</br>
Snowball  - Moderate (Fast)</br>

In [42]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer

In [43]:
porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer('english')


In [44]:
stemmer_names = ["PORTER", "LANCASTER", "SNOWBALL"]
formatted_text = '{:>16}' * (len(stemmer_names) + 1)
print ('\n', formatted_text.format('INPUT Word', *stemmer_names), '\n','='*68)
for word in words:
    output = [word, porter.stem(word), lancaster.stem(word), snowball.stem(word)]
    print ('\n', formatted_text.format(word, *output), '\n','='*68)



       INPUT Word          PORTER       LANCASTER        SNOWBALL 

                I               I               I               i 

               am              am              am              am 

         planning        planning            plan            plan 

               to              to              to              to 

              buy             buy             buy             buy 

                a               a               a               a 

              new             new             new             new 

          branded         branded           brand           brand 

           sports          sports           sport           sport 

              car             car             car             car 

              and             and             and             and 

            doing           doing              do           doing 

              lot             lot             lot             lot 

               of              of              

### Lemmatizer

In [45]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatizer_names = ["NOUN LEMMATIZER", "VERB LEMMATIZER"]
format_text = '{:24}' * (len(lemmatizer_names) + 1)
print ('\n', format_text.format('INPUT WORD', *lemmatizer_names),'\n','='*75)
for word in words:
    output = [word, lemmatizer.lemmatize(word, pos='n'), lemmatizer.lemmatize(word, pos='v')]
    print (format_text.format(*output))


 INPUT WORD              NOUN LEMMATIZER         VERB LEMMATIZER          
I                       I                       I                       
am                      am                      be                      
planning                planning                plan                    
to                      to                      to                      
buy                     buy                     buy                     
a                       a                       a                       
new                     new                     new                     
branded                 branded                 brand                   
sports                  sport                   sport                   
car                     car                     car                     
and                     and                     and                     
doing                   doing                   do                      
lot                     lot                     

### Dividing text data into chunks

In [46]:
from nltk.corpus import brown
import numpy as np
def chunker(input_data, N):
    input_words = input_data.split(' ')
    output = []
    cur_chunk = []
    count = 0 
    for word in input_words:
        cur_chunk.append(word)
        count += 1
        if count == N:
            output.append(' '.join(cur_chunk))
            count, cur_chunk = 0, []
    output.append(' '.join(cur_chunk))
    return output
input_data = ' '.join(brown.words()[:12000])
chunk_size = 700
chunks = chunker(input_data, chunk_size)
print ("\nNumber of text chunks =",len(chunks), "\n")
for i, chunk in enumerate(chunks):
    print ('Chunk', i+1, '==>', chunk[:50])


Number of text chunks = 18 

Chunk 1 ==> The Fulton County Grand Jury said Friday an invest
Chunk 2 ==> '' . ( 2 ) Fulton legislators `` work with city of
Chunk 3 ==> . Construction bonds Meanwhile , it was learned th
Chunk 4 ==> , anonymous midnight phone calls and veiled threat
Chunk 5 ==> Harris , Bexar , Tarrant and El Paso would be $451
Chunk 6 ==> set it for public hearing on Feb. 22 . The proposa
Chunk 7 ==> College . He has served as a border patrolman and 
Chunk 8 ==> of his staff were doing on the address involved co
Chunk 9 ==> plan alone would boost the base to $5,000 a year a
Chunk 10 ==> nursing homes In the area of `` community health s
Chunk 11 ==> of its Angola policy prove harsh , there has been 
Chunk 12 ==> system which will prevent Laos from being used as 
Chunk 13 ==> reform in recipient nations . In Laos , the admini
Chunk 14 ==> . He is not interested in being named a full-time 
Chunk 15 ==> said , `` to obtain the views of the general publi
Chunk 16 ==> '' . M

### Bag of words model - document-term metrix

In [47]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import brown
from text_chunker import chunker

input_data =' '.join(brown.words()[:5400])
chunk_size = 800
text_chunks = chunker(input_data, chunk_size)
chunks = []
for count, chunk in enumerate(text_chunks):
    d = {'index': count, 'text': chunk}
    chunks.append(d)
count_vectorizer = CountVectorizer(min_df=7, max_df=20)
document_term_matrix = count_vectorizer.fit_transform([chunk['text'] for chunk in chunks])
vocabulary = np.array(count_vectorizer.get_feature_names())
print ("\nVocabulary:\n", vocabulary)
chunk_names = []
for i in range(len(text_chunks)):
    chunk_names.append('Chunk-'+str(i))
    
print ("\nDocument Term matrix:")
formatted_text = '{:>12}'*(len(chunk_names)+1)
print ("\n", formatted_text.format('Word', *chunk_names), '\n')
for word,item in zip(vocabulary, document_term_matrix.T):
    output = [word] + [str(freq) for freq in item.data]
    print (formatted_text.format(*output))
    

ImportError: No module named 'text_chunker'

### Category predictor - bag of words model
### tf-idf (TermFrequency-InverseDcoumentFrequency)

In [49]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

category_map = {'talk.politics.misc': 'Politics', 'rec.autos': 'Autos',
               'rec.sport.hockey':'Hockey', 'sci.electronics': 'Electronics',
               'sci.med': 'Medicine'}

training_data = fetch_20newsgroups(subset='train', categories=category_map.keys(), shuffle=True, random_state=5)
count_vectorizer = CountVectorizer()
train_tc = count_vectorizer.fit_transform(training_data.data)
print("\nDimensions of training data :", train_tc.shape)
tfidf = TfidfTransformer()
train_tfidf = tfidf.fit_transform(train_tc)

# Define data sets
input_data = [
    "You need to be careful with cars when you are driving on slippery roads",
    "A lot of devices can be operated wirelessly",
    "Players need to be careful when they are close to goal posts",
    "Political debates help us undestand the prespectives of both sides",
    "When you are crossing junctions, be careful",
    "my chairs are very old and I want to change them"
]
classifier = MultinomialNB().fit(train_tfidf, training_data.target)
input_tc = count_vectorizer.transform(input_data)
input_tfidf = tfidf.transform(input_tc)
predictions = classifier.predict(input_tfidf)
for sent, category in zip(input_data,predictions):
    print ("\nInput :", sent, '\nPredicted category :', category_map[training_data.target_names[category]])


Dimensions of training data : (2844, 40321)

Input : You need to be careful with cars when you are driving on slippery roads 
Predicted category : Autos

Input : A lot of devices can be operated wirelessly 
Predicted category : Electronics

Input : Players need to be careful when they are close to goal posts 
Predicted category : Hockey

Input : Political debates help us undestand the prespectives of both sides 
Predicted category : Politics

Input : When you are crossing junctions, be careful 
Predicted category : Autos

Input : my chairs are very old and I want to change them 
Predicted category : Autos


### Constructing a Gender Identifier
#### Using heuristic to construct feature vector
#### Using it to train classifier
##### Name ends with ia - female name ( Amelia or Genelia)
##### Name ends with rk - make name (Mark or Clark)

In [None]:
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy as nltk_accuracy
from nltk.corpus import names
import random

In [None]:
def extract_features(word, N=2):
    last_n_letters = word[-N:]
    return {'feature': last_n_letters.lower()}

In [None]:
male_list = [(name, 'male') for name in names.words('male.txt')]
female_list = [(name, 'female') for name in names.words('female.txt')]
data = (male_list + female_list)

In [48]:
random.seed(5)
random.shuffle(data)
input_names = ['Alexander', 'Danielle', 'David','Cheryl', 'prabhakar', 'harsha']
num_train = int(0.8*len(data))
for i in range(1,9):
    print ("\nNumber of end letters :", i)
    features = [(extract_features(n,i), gender) for (n,gender) in data]
    train_data,test_data = features[:num_train], features[num_train:]
    classifier = NaiveBayesClassifier.train(train_data)
    accuracy = round(100*nltk_accuracy(classifier, test_data), 2)
    print ("Accuracy ="+str(accuracy)+'%')
    for name in input_names:
        print (name, "==>", classifier.classify(extract_features(name,i)))


Number of end letters : 1
Accuracy =75.9%
Alexander ==> male
Danielle ==> female
David ==> male
Cheryl ==> male
prabhakar ==> male
harsha ==> female

Number of end letters : 2
Accuracy =77.53%
Alexander ==> male
Danielle ==> female
David ==> male
Cheryl ==> female
prabhakar ==> male
harsha ==> female

Number of end letters : 3
Accuracy =76.53%
Alexander ==> male
Danielle ==> female
David ==> male
Cheryl ==> female
prabhakar ==> female
harsha ==> female

Number of end letters : 4
Accuracy =70.17%
Alexander ==> male
Danielle ==> female
David ==> male
Cheryl ==> female
prabhakar ==> female
harsha ==> female

Number of end letters : 5
Accuracy =64.44%
Alexander ==> male
Danielle ==> female
David ==> male
Cheryl ==> female
prabhakar ==> female
harsha ==> female

Number of end letters : 6
Accuracy =61.23%
Alexander ==> female
Danielle ==> female
David ==> male
Cheryl ==> female
prabhakar ==> female
harsha ==> female

Number of end letters : 7
Accuracy =59.97%
Alexander ==> female
Danielle =

#### Topic modeling

CRF model for predecting the correct words