### Word comparison functions 

In [None]:
s.startswith(t)
s.endswith(t)
t in s
s.isupper(); s.islower(); s.istitle()
s.isalpha(); s.isdigit(); s.isalnum()

### String operations

In [None]:
s.lower(); s.upper(); s.titlecase()
s.split(t)
s.splitlines(t)
s.join(t)
s.strip(); s.rstrip()
s.find(t); s.rfind(t)
s.replace(u,v)

### Handling larger texts

In [None]:
# Reading files line by line
f = open('***','r')
f.readline()

# Reading the full file
f.seek(0)
text1 = f.read()

### File operations

In [None]:
f = open(filename, mode)
f.readline(); f.read(); f.read(n)
for line in f: doSomething(line)
f.seek(n)
f.write(message)
f.close()
f.closed

### Basic NLP Tasks with NLTK

In [1]:
import nltk
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


### Frequency of words

In [5]:
dist = nltk.FreqDist(text7)
vocab1 = dist.keys()
freqwords = [w for w in vocab1 if len(w) > 5 and dist[w] > 100]

### Normalization and stemming

input1 = "List listed lists listing listings"
words1 = input1.lower().split(' ')
porter = nltk.PorterStemmer()
[porter.stem(t) for t in words1]

### Lemmatization

In [12]:
udhr = nltk.corpus.udhr.words('English-Latin1')
[porter.stem(t) for t in udhr[:20]] # Still Lemmatization
WNlemma = nltk.WordNetLemmatizer()
[WNlemma.lemmatize(t) for t in udhr[:20]]

[u'Universal',
 u'Declaration',
 u'of',
 u'Human',
 u'Rights',
 u'Preamble',
 u'Whereas',
 u'recognition',
 u'of',
 u'the',
 u'inherent',
 u'dignity',
 u'and',
 u'of',
 u'the',
 u'equal',
 u'and',
 u'inalienable',
 u'right',
 u'of']

### Tokenization

In [15]:
text11 = "Children shouldn't drink a sugary drink before bed."
nltk.word_tokenize(text11)
text12 = "This is the first sentence. A gallon of milk in the U.S. costs $2.99. Is this the third sentence? Yes, it is!"
sentences = nltk.sent_tokenize(text12)

### Part of Speech (POS) tagging

In [19]:
nltk.help.upenn_tagset('MD')
text13 = nltk.word_tokenize(text11)
nltk.pos_tag(text13)

# Parsing sentence structure
text15 = nltk.word_tokenize("Alice loves Bob")
grammar = nltk.CFG.fromstring("""
S -> NP VP
VP -> V NP
NP -> 'Alice' | 'Bob'
V -> 'loves'
""")

parser = nltk.ChartParser(grammar)
trees = parser.parse_all(text15)
for tree in trees:
    print(tree)

MD: modal auxiliary
    can cannot could couldn't dare may might must need ought shall should
    shouldn't will would
(S (NP Alice) (VP (V loves) (NP Bob)))


### Sort a dictionary's value in descending order

In [None]:
dist = nltk.FreqDist(text1)
sorted_dist = sorted(dist.items(), key = operator.itemgetter(1), reverse=True)[:20]

### jaccard Distance on trigram

In [23]:
from nltk.corpus import words

correct_spellings = words.words()

def jaccard_distance(entries=['cormulent', 'incendenece', 'validrate']):
    
    result = {}
    for entry in entries:
        spelling_check = [s for s in correct_spellings if s.startswith(entry[0])] 
        distance = ((nltk.jaccard_distance(set(nltk.ngrams(entry,3)),
                                         set(nltk.ngrams(check, 3))), check) for check in spelling_check)
        closest = min(distance)
        result[closest[1]] = closest[0]
    
    
    return list(result.keys())

jaccard_distance()

[u'validate', u'corpulent', u'indecence']

### Edit distance on the two words with transpositions

In [25]:
from nltk.corpus import words

correct_spellings = words.words()

def edit_distance(entries=['cormulent', 'incendenece', 'validrate']):
    
    result = {}
    for entry in entries:
        spelling_check = [s for s in correct_spellings if s.startswith(entry[0])] 
        distance = ((nltk.edit_distance(entry, check), check) for check in spelling_check)
        closest = min(distance)
        result[closest[1]] = closest[0]
        
    return list(result.keys())

edit_distance()

[u'validate', u'corpulent', u'intendence']

### Use 2-4 columns to overwrite the nan values in 1st column

In [None]:
date_df = pd.to_datetime(case1.fillna(case2).fillna(case3).fillna(case4).replace('Decemeber','December',regex=True).replace('Janaury','January',regex=True))

### Case Study: Sentiment Analysis

In [None]:
import pandas as pd
import numpy as np

# Read in the data
df = pd.read_csv('Amazon_Unlocked_Mobile.csv')

# Sample the data to speed up computation
# Comment out this line to match with lecture
df = df.sample(frac=0.1, random_state=10)

# Drop missing values
df.dropna(inplace=True)

# Remove any 'neutral' ratings equal to 3
df = df[df['Rating'] != 3]

# Encode 4s and 5s as 1 (rated positively)
# Encode 1s and 2s as 0 (rated poorly)
df['Positively Rated'] = np.where(df['Rating'] > 3, 1, 0)

# Most ratings are positive
df['Positively Rated'].mean()

from sklearn.model_selection import train_test_split

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['Reviews'], 
                                                    df['Positively Rated'], 
                                                    random_state=0)

### CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the CountVectorizer to the training data
vect = CountVectorizer().fit(X_train)

# transform the documents in the training data to a document-term matrix
X_train_vectorized = vect.transform(X_train)

from sklearn.linear_model import LogisticRegression

# Train the model
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

from sklearn.metrics import roc_auc_score

# Predict the transformed test documents
predictions = model.predict(vect.transform(X_test))

# get the feature names as numpy array
feature_names = np.array(vect.get_feature_names())

# Sort the coefficients from the model
sorted_coef_index = model.coef_[0].argsort()

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1] 
# so the list returned is in order of largest to smallest
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

### Tfidf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TfidfVectorizer to the training data specifiying a minimum document frequency of 5
vect = TfidfVectorizer(min_df=5).fit(X_train)

X_train_vectorized = vect.transform(X_train)

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))

feature_names = np.array(vect.get_feature_names())

sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

print('Smallest tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest tfidf: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

# These reviews are treated the same by our current model
print(model.predict(vect.transform(['not an issue, phone is working',
                                    'an issue, phone is not working'])))

### n-grams

In [None]:
# Fit the CountVectorizer to the training data specifiying a minimum 
# document frequency of 5 and extracting 1-grams and 2-grams
vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)

X_train_vectorized = vect.transform(X_train)

len(vect.get_feature_names())

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))

feature_names = np.array(vect.get_feature_names())

sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

# These reviews are now correctly identified
print(model.predict(vect.transform(['not an issue, phone is working',
                                    'an issue, phone is not working'])))

### Add a feature

In [None]:
def add_feature(X, feature_to_add):
    """
    Returns sparse feature matrix with added feature.
    feature_to_add can also be a list of features.
    """
    from scipy.sparse import csr_matrix, hstack
    return hstack([X, csr_matrix(feature_to_add).T], 'csr')

### WordNet through NLTK

In [1]:
import nltk
from nltk.corpus import wordnet as wn

#### Find appropriate sense of the words

In [4]:
deer = wn.synset('deer.n.01')
elk = wn.synset('elk.n.01')
horse = wn.synset('horse.n.01')

#### Find path similarity

In [5]:
deer.path_similarity(elk)
deer.path_similarity(horse)

0.14285714285714285

#### Use an information criteria to find Lin similarity

In [8]:
from nltk.corpus import wordnet_ic
brown_ic = wordnet_ic.ic('ic-brown.dat')

deer.lin_similarity(elk, brown_ic)
#deer.lin_similarity(horse, brown_ic)

0.8623778273893673

### NLTK Collocations and Association measures

In [None]:
import nltk
from nltk.collocations import *

bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(text)
find.nbest(bigram_measures.pmi, 10)

# finder also has other userful functions, such as frequency finder
finder.apply_freq_filter(10)

### Latent Dirichlet Allocation (LDA)

* Geerative model for a document d

  * Choose length of document d
  * Choose a mixture of topics for document d
  * Use a topic's multinomial distribution to output words to fill that topic's quota

In [None]:
# doc_set: set of pre-processed text documents
import gensim
from gensim import corpora, models

dictionary = corpora.Dictionary(doc_set)
corpus = [dictionary.doc2bow(doc) for doc in doc_set]
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 4, id2word = dictionary, passes = 50)
print(ldamodel.print_topics(num_topics = 4, num_words = 5))

# ldamodel can also be used to find topic distribution of document
topic_dis = ldamodel[new_doc]