# Natural Language Learning
Supervised learning with sklearn


## Introduction

### Review: data loading and exploration

In [1]:
##load the iris dataset bundled with sklearn; the returned object holds NumPy data rather than pandas objects
from sklearn.datasets import load_iris
iris_np = load_iris()

In [2]:
##X = feature values, Y = target values of the loaded dataset
X = iris_np.data
Y = iris_np.target

In [17]:
##extract the feature_names section
iris_np['feature_names']

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [None]:
##or get the same list through the feature_names attribute
iris_np.feature_names

In [19]:
import pandas as pd
##build a pandas DataFrame from the feature array X, using feature_names as the column headings
irisX = pd.DataFrame(X, columns=iris_np.feature_names)

In [19]:
irisX.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [27]:
irisX['sepal length (cm)'][:5]

0    5.1
1    4.9
2    4.7
3    4.6
4    5.0
Name: sepal length (cm), dtype: float64

### Text Learning

Documents are the rows (observations). Each document consists of words, and these words are the features. 
Vectorization translates a document into a bag of words (with or without weights). If weights are needed, TF-IDF is one of the most common weighting schemes; sklearn provides it as TfidfVectorizer() in sklearn.feature_extraction.text. 

High level steps:
1. Split the original dataset into training and test subsets
2. Count vectorization of X datasets by fitting (learning the vocab) and transforming (generating word count) the datasets
3. Fit a machine learning model using the vectorized X train dataset and Y train
4. Predict with the transformed X test dataset

## A Simple Example

In [6]:
##a toy corpus of 3 'documents', with one 0/1 label per document
simple_train = ['call me tonight', 'call me a cab', 'Please call me now.']
is_desperate = [0, 0, 1]

### Vectorization (bag of words)

In [7]:
##count vectorizer: turns text into per-document word counts
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [8]:
##learn the vocab
vect.fit(simple_train)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [9]:
##show the learned vocabulary: lowercased, punctuation ignored, no duplicates, sorted alphabetically, unicode strings
##stop words are NOT removed (stop_words=None); 'a' is missing only because the default token_pattern keeps tokens of 2+ characters
##these are the features or terms, total 6
##NOTE(review): newer sklearn releases replace get_feature_names() with get_feature_names_out() -- confirm against the installed version
featnames = vect.get_feature_names()
print(featnames)

[u'cab', u'call', u'me', u'now', u'please', u'tonight']


In [10]:
##convert to document-term matrix, rows=documents, cols=terms
##3 documents, 6 terms (tokens or features)
X_train_tr = vect.transform(simple_train)

In [11]:
##each printed line is "(row, col)  count": row = document index, col = term index in the vocabulary, count = occurrences of that term in that document
##e.g. "(0, 1)  1" means doc 0 contains term 1 (the second term, 'call') once
##the sparse representation stores only the non-zero counts to save space
print(X_train_tr)

  (0, 1)	1
  (0, 2)	1
  (0, 5)	1
  (1, 0)	1
  (1, 1)	1
  (1, 2)	1
  (2, 1)	1
  (2, 2)	1
  (2, 3)	1
  (2, 4)	1


In [12]:
##sparse matrix type
type(X_train_tr)

scipy.sparse.csr.csr_matrix

In [14]:
##convert sparse matrix to dense matrix
X_train_num = X_train_tr.toarray()
type(X_train_num)

numpy.ndarray

In [15]:
##show the dense document-term matrix; the print() call form runs on both Python 2 and Python 3
print(X_train_num)

[[0 1 1 0 0 1]
 [1 1 1 0 0 0]
 [0 1 1 1 1 0]]


In [56]:
##to interpret the matrix, use pd
pd.DataFrame(X_train_num, columns=featnames)

Unnamed: 0,cab,call,me,now,please,tonight
0,0,1,1,0,0,1
1,1,1,1,0,0,0
2,0,1,1,1,1,0


Reading only the columns that contain a 1: the first row is 'call me tonight', and the second row becomes 'cab call me' (word order is lost, and 'a' is not in the vocabulary).

### Supervised Learning and Prediction

In [16]:
from sklearn.neighbors import KNeighborsClassifier
##1-nearest-neighbour classifier trained on the sparse document-term matrix and the 0/1 labels
KNN = KNeighborsClassifier(n_neighbors=1)
KNN.fit(X_train_tr, is_desperate)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

In [21]:
##vectorize unseen text with the vocabulary learned above; 'text' is not in that vocabulary, so it gets no column
simple_test = ["Please text me"]
simple_test_tr = vect.transform(simple_test)
simple_test_tr.toarray()

array([[0, 0, 1, 0, 1, 0]], dtype=int64)

In [22]:
pd.DataFrame(simple_test_tr.toarray(), columns=featnames)

Unnamed: 0,cab,call,me,now,please,tonight
0,0,0,1,0,1,0


In [23]:
KNN.predict(simple_test_tr)

array([1])

Now change the test text to "Please don't call me". Notice that the word "don't" does not appear among the features: it was never seen when the vocabulary was learned, so the vectorizer ignores it and the model has no way to interpret it. Thus the prediction is incorrect.

## SMS Spam Detection

In [210]:
import pandas as pd
##read a local tab-delimited file; it has no header row, so the column names are supplied here
##NOTE(review): with pandas' default quote handling, messages containing unbalanced double quotes may be merged into one row --
##consider quoting=csv.QUOTE_NONE and verify the row count against the dataset's documentation
sms = pd.read_table('SMSSpamCollection', header=None, names=['label', 'message'])

In [211]:
sms.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [212]:
sms.shape

(5572, 2)

In [213]:
sms.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [214]:
##encode the text label as a number (ham -> 0, spam -> 1); newer versions of sklearn also accept string labels
sms['label_num'] = sms.label.map({'ham':0, 'spam':1})

In [215]:
sms.head()

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [216]:
##features = raw message text, target = numeric label
X = sms.message
Y = sms.label_num

In [217]:
##spam rate: the mean of the 0/1 labels is the fraction of messages that are spam
##(always predicting 'ham' would therefore be right for 1 minus this fraction of the messages)
Y.mean()

0.13406317300789664

### Split dataset

In [218]:
from sklearn import model_selection
##hold out 20% of the messages for testing; random_state is fixed so the split is repeatable
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=0.2, random_state=7)

In [219]:
type(X_train)

pandas.core.series.Series

### Vectorization

In [89]:
##count vectorizer: learn the vocabulary from the training messages and build their document-term matrix in one step
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
X_train_tr = vect.fit_transform(X_train)

In [90]:
##transform only (no fit): the test messages are encoded with the vocabulary learned from the training messages
X_test_tr = vect.transform(X_test)

### Naive Bayes

In [97]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [113]:
##fit the model using the transformed data and time it
% time nb.fit(X_train_tr, Y_train)

Wall time: 3 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [99]:
##predict using the transformed data
nb_pred = nb.predict(X_test_tr)

In [100]:
from sklearn import metrics
metrics.accuracy_score(Y_test, nb_pred)

0.98475336322869955

In [101]:
metrics.confusion_matrix(Y_test, nb_pred)

array([[965,   3],
       [ 14, 133]])

### Logistic Regression

In [114]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
% time LR.fit(X_train_tr, Y_train);

Wall time: 49 ms


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [110]:
LR_pred = LR.predict(X_test_tr)

In [116]:
##predicted probability of the second class (column 1), i.e. label 1 = spam
proba = LR.predict_proba(X_test_tr)[:, 1]

In [117]:
from sklearn import metrics
metrics.accuracy_score(Y_test, LR_pred)

0.98116591928251118

In [118]:
##calculate AUC
metrics.roc_auc_score(Y_test, proba)

0.99600129307921514

## Sentiment Analysis

### Reading files

In [201]:
import os
##root folder of the review corpus; it holds one sub-folder per class
data_dir ='txt_sentoken'
classes = ['pos', 'neg']
test_data = []
train_data = []
test_labels = []
train_labels = []

##read every review file; files whose name starts with 'cv9' form the test set, all others the training set
for curr_class in classes:
  dirname = os.path.join(data_dir, curr_class)
  ##os.listdir() returns names in arbitrary order, so sort them: the document order (and any
  ##seeded shuffle/split applied to it later) is then the same on every run and platform
  for fname in sorted(os.listdir(dirname)):
    with open(os.path.join(dirname, fname), 'r') as f:
      content = f.read()
    ##the sub-folder name doubles as the class label
    if fname.startswith('cv9'):
      test_data.append(content)
      test_labels.append(curr_class)
    else:
      train_data.append(content)
      train_labels.append(curr_class)


In [143]:
from sklearn.feature_extraction.text import TfidfVectorizer
##vectorize using TF-IDF: fit on the training reviews only, then reuse the fitted vectorizer on the test reviews
vect = TfidfVectorizer(min_df=5,                   ##discard words that appear in fewer than 5 docs
                       max_df = 0.8,               ##discard words appearing in more than 80% of the documents
                       sublinear_tf=True,          ##use sublinear term-frequency weighting
                       use_idf=True)               ##enable IDF
train_vect = vect.fit_transform(train_data)
test_vect = vect.transform(test_data)

### Classification with SVM

In [144]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [149]:
##linear-kernel SVM trained on the TF-IDF vectors, then used to label the held-out reviews
SVM = SVC(kernel='linear')
SVM.fit(train_vect, train_labels)
SVM_pred = SVM.predict(test_vect)

In [160]:
##classification_report returns one multi-line string; print it so it renders as a table instead of a repr with literal \n
print(classification_report(test_labels, SVM_pred.tolist()))

'             precision    recall  f1-score   support\n\n        neg       0.91      0.92      0.92       100\n        pos       0.92      0.91      0.91       100\n\navg / total       0.92      0.92      0.91       200\n'

### Naive Bayes (using pandas)

In [221]:
##convert the text and label lists into pandas DataFrames (each gets a single column named 0)
import pandas as pd
X_train = pd.DataFrame(train_data)
Y_train = pd.DataFrame(train_labels)
X_test = pd.DataFrame(test_data)
Y_test = pd.DataFrame(test_labels)

In [222]:
##stack the train and test parts back into one dataset so it can be re-split below
X = pd.concat([X_train, X_test], axis=0)
Y = pd.concat([Y_train, Y_test], axis=0)


In [223]:
from sklearn import model_selection
##new 80/20 train/test split of the recombined data; random_state is fixed so the split is repeatable
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=0.2, random_state=7)

In [240]:
##count vectorizer
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
##the vectorizer needs the documents themselves, so pass column 0 of the DataFrame (a pandas Series of texts) rather than the DataFrame, otherwise error
vect.fit(X_train[0])
X_train_tr = vect.transform(X_train[0])
X_test_tr = vect.transform(X_test[0])

In [241]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [243]:
##fit the model using the transformed data and time it
% time nb.fit(X_train_tr, Y_train[0])

Wall time: 40 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [244]:
##predict using the transformed data
nb_pred = nb.predict(X_test_tr)

In [245]:
from sklearn import metrics
##score against the label column (a 1-D Series), matching how Y_train[0] was passed to fit
metrics.accuracy_score(Y_test[0], nb_pred)

0.79500000000000004

In [246]:
metrics.confusion_matrix(Y_test, nb_pred)

array([[158,  29],
       [ 53, 160]])