<a href="https://colab.research.google.com/github/slee987/LIS640tmp/blob/main/hw3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1 Install Packages if Needed

Run the following cells to install/upgrade the required packages and check if the installed versions meet the requirements.

In [None]:
# Upgrade packages in Google Colab.

# Upgrade pip itself first.
!pip3 install pip --upgrade

# Install/upgrade spaCy, constrained to the 3.0.x series.
!pip3 install 'spacy>=3.0.0,<3.1.0' --upgrade

# Install/upgrade scikit-learn, constrained to the 0.24.x series.
!pip3 install 'scikit-learn>=0.24.0,<0.25.0' --upgrade

# Download spaCy's en_core_web_sm model (loaded later via spacy.load).
!python3 -m spacy download en_core_web_sm

Collecting pip
[?25l  Downloading https://files.pythonhosted.org/packages/fe/ef/60d7ba03b5c442309ef42e7d69959f73aacccd0d86008362a681c4698e83/pip-21.0.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 12.7MB/s 
[?25hInstalling collected packages: pip
  Found existing installation: pip 19.3.1
    Uninstalling pip-19.3.1:
      Successfully uninstalled pip-19.3.1
Successfully installed pip-21.0.1
Collecting spacy<3.1.0,>=3.0.0
  Downloading spacy-3.0.5-cp37-cp37m-manylinux2014_x86_64.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 308 kB/s 
Collecting spacy-legacy<3.1.0,>=3.0.0
  Downloading spacy_legacy-3.0.2-py2.py3-none-any.whl (7.8 kB)
Collecting thinc<8.1.0,>=8.0.2
  Downloading thinc-8.0.2-cp37-cp37m-manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 63.4 MB/s 
Collecting pathy>=0.3.5
  Downloading pathy-0.4.0-py3-none-any.whl (36 kB)
Collecting pydantic<1.8.0,>=1.7.1
  Downloading pydantic-1.7.3-cp37-cp37

In [None]:
# Check the installed versions.
# We require: spacy == 3.0.x ; scikit-learn == 0.24.x ; spacy's en-core-web-sm == 3.0.x .

import re

pkgs = !pip3 list
versions = { pkg.split()[0]:pkg.split()[1] for pkg in pkgs if re.match( '(spacy\s.+)|(scikit-learn\s.+)|(en-core-web-sm\s.+)', pkg ) }

assert versions['spacy'][0:3] == '3.0'
assert versions['scikit-learn'][0:4] == '0.24'
assert versions['en-core-web-sm'][0:3] == '3.0'

## 2 Feature Extraction for Unigrams and Bigrams

In [None]:
import spacy
import sklearn
import sklearn.metrics
from collections import Counter

# Load spaCy's en_core_web_sm pipeline with the parser and NER components disabled;
# text2unigrams below only reads each token's lemma and its stop-word/punctuation flags.
nlp = spacy.load( "en_core_web_sm", disable=["parser", "ner"] )

def text2unigrams( rawtext, nlp ):
    """Turn a raw text into a list of unigrams, one entry per token produced by `nlp`.

    rawtext: the text to process.
    nlp: a callable (here a spaCy pipeline) that yields tokens exposing
         `is_stop`, `is_punct` and `lemma_`.

    Stop words and punctuation are replaced by the placeholder '[OOV]';
    every other token contributes its lower-cased lemma.
    """
    unigrams = []
    for tok in nlp(rawtext):
        if tok.is_stop or tok.is_punct:
            unigrams.append( '[OOV]' )
        else:
            unigrams.append( tok.lemma_.lower() )
    return unigrams

def unigrams2bigrams( unigrams ):
    """Build the list of bigrams ('left_right') from a list of unigrams.

    Only adjacent pairs in which neither member is the '[OOV]' placeholder
    are kept, so a bigram never spans a stop word or punctuation mark.
    """
    bigrams = []
    for left, right in zip( unigrams, unigrams[1:] ):
        if left == '[OOV]' or right == '[OOV]':
            continue
        bigrams.append( left + '_' + right )
    return bigrams

def count_freq( features ):
    """Count how many times each feature occurs in a list of features.

    The '[OOV]' placeholder is never counted. Returns a Counter
    mapping feature -> frequency.
    """
    return Counter( feat for feat in features if feat != '[OOV]' )

def count_occur( features ):
    """Record the presence of each feature in a list of features.

    Every distinct feature gets a count of 1 regardless of how often it
    appears; the '[OOV]' placeholder is excluded. Returns a Counter.
    """
    present = set( features )
    present.discard( '[OOV]' )
    return Counter( present )

def readable_eval( Y_test, Y_pred, labels ):
    """Generate readable evaluation results.

    Y_test: the true labels.
    Y_pred: the predicted labels.
    labels: the class labels to report, in the order they should be scored.

    Returns a dict with 'Precision (<label>)', 'Recall (<label>)' and
    'F1 (<label>)' entries for each label, plus an overall 'Accuracy'.
    """
    # The fourth returned element (per-label support) is not reported.
    precision, recall, f1, _ = sklearn.metrics.precision_recall_fscore_support( Y_test, Y_pred, labels=labels )

    metrics = {}
    for label, value in zip( labels, precision ):
        metrics['Precision (%s)'%label] = value
    for label, value in zip( labels, recall ):
        metrics['Recall (%s)'%label] = value
    for label, value in zip( labels, f1 ):
        metrics['F1 (%s)'%label] = value
    metrics['Accuracy'] = sklearn.metrics.accuracy_score( Y_test, Y_pred )
    return metrics

## 3 Load and Preprocess Pang et al. (2002)'s dataset.

Note that the following step will take 1-2 minutes. You are encouraged to reuse data['unigrams'] and data['bigrams'] instead of preprocessing the texts multiple times.

In [None]:
import pandas as pd

# Download the dataset; the first CSV column becomes the row index.
data = pd.read_csv( 'https://jiepujiang.github.io/data/pang2002.csv', index_col=0 )

# Run the spaCy pipeline once per text, then derive the bigrams from the stored unigrams.
data['unigrams'] = data['text'].apply( lambda text: text2unigrams(text, nlp) )
data['bigrams'] = data['unigrams'].apply( unigrams2bigrams )

data

Unnamed: 0,fold,label,text,unigrams,bigrams
cv004_tok-29856.txt,1,pos,"all great things come to an end , and the dot-...","[[OOV], great, thing, come, [OOV], [OOV], end,...","[great_thing, thing_come, com_era, era_embodie..."
cv409_tok-11193.txt,2,pos,i'm not quite sure how best to go about writin...,"[[OOV], [OOV], [OOV], [OOV], sure, [OOV], good...","[little_disappointed, barry_levinson, politica..."
cv045_tok-29121.txt,1,pos,"the others ( 2001 ) nicole kidman , christophe...","[[OOV], [OOV], [OOV], 2001, [OOV], nicole, kid...","[nicole_kidman, christopher_eccleston, fionnul..."
cv279_tok-15969.txt,2,pos,director : tony scott writer : david marconi s...,"[director, [OOV], tony, scott, writer, [OOV], ...","[tony_scott, scott_writer, david_marconi, marc..."
cv387_tok-4672.txt,2,pos,one of the most entertaining james bond films ...,"[[OOV], [OOV], [OOV], [OOV], entertaining, jam...","[entertaining_jame, jame_bond, bond_film, roge..."
...,...,...,...,...,...
cv562_tok-26379.txt,3,neg,directed by : jan de bont written by : david s...,"[direct, [OOV], [OOV], jan, de, bont, write, [...","[jan_de, de_bont, bont_write, david_shelf, shi..."
cv000_tok-9611.txt,1,neg,"tristar / 1 : 30 / 1997 / r ( language , viole...","[tristar, [OOV], 1, [OOV], 30, [OOV], 1997, [O...","[dennis_rodman, claude_van, van_damme, mickey_..."
cv571_tok-11568.txt,3,neg,director : michael caton-jones writer : chuck ...,"[director, [OOV], michael, caton, [OOV], jones...","[michael_caton, jones_writer, chuck_pfarrer, k..."
cv210_tok-15092.txt,1,neg,wrongfully accused reviewed by jamie peck<hr>r...,"[wrongfully, accuse, review, [OOV], jamie, pec...","[wrongfully_accuse, accuse_review, jamie_peck,..."


## 4 Prepare Features for Scikit-learn

In [None]:
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn import feature_selection
from collections import Counter

def train_select_feat( count_func, data, col_feats, top_feats ):
    """Extract and select unigram or bigram features on a specific set of data.

    count_func: either count_freq or count_occur, indicating whether feature
                frequency or feature presence is used.
    data: the set of data to extract and select features from (a pandas dataframe).
    col_feats: the name of the column to extract features from ('unigrams' or 'bigrams').
    top_feats: number of top features to select using Chi-squared.

    Returns (X_selected, Y, dictvec_selected): the feature matrix reduced to the
    selected columns, the array of labels, and the DictVectorizer restricted to
    the selected features (for use on unseen data).
    """
    dictvec = DictVectorizer()
    X = dictvec.fit_transform( count_func(tokens) for tokens in data[col_feats] )
    Y = np.array(data['label'])

    # SelectKBest computes the chi2 scores itself during fit, so no separate
    # feature_selection.chi2 call is needed. k is capped at the number of
    # extracted features so that a large top_feats simply keeps everything.
    top = min( top_feats, X.shape[1] )
    fsel = feature_selection.SelectKBest( score_func = feature_selection.chi2, k = top )
    X_selected = fsel.fit_transform( X, Y )

    # restrict() narrows the fitted vectorizer to the selected columns and returns it.
    dictvec_selected = dictvec.restrict( fsel.get_support() )

    return X_selected, Y, dictvec_selected

# Extract unigram or bigram features on a specific set of data using a provided set of features.
# count_func: either count_freq or count_occur, used for indicating considering feature freq. or pres.
# data: the set of data to extract features (a pandas dataframe)
# col_feats: the name of the column in the data to extract features ('unigrams' or 'bigrams')
# dictvec: a set of features (a DictVectorizer)
def test_feat( count_func, data, col_feats, dictvec ):
    X = dictvec.transform( count_func(tokens) for tokens in data[col_feats] )
    Y = np.array(data['label'])
    return X, Y

In [None]:
from scipy.sparse import hstack

def train_select_feat_unibi( count_func, data, top_feats ):
    """Extract and select both unigram and bigram features on a specific set of data.

    count_func: either count_freq or count_occur, indicating whether feature
                frequency or feature presence is used.
    data: the set of data to extract and select features from (a pandas dataframe).
    top_feats: number of top features to select using Chi-squared; applied to
               the unigrams and to the bigrams separately.

    Returns (X, Y, dictvec_uni, dictvec_bi): the unigram and bigram matrices
    stacked side by side, the labels, and the two restricted vectorizers.
    """
    selections = [ train_select_feat( count_func, data, column, top_feats ) for column in ('unigrams', 'bigrams') ]
    (X_uni, _, dictvec_uni), (X_bi, Y, dictvec_bi) = selections
    X_combined = hstack( (X_uni, X_bi) )
    return X_combined, Y, dictvec_uni, dictvec_bi

def test_feat_unibi( count_func, data, dictvec_uni, dictvec_bi ):
    """Extract both unigram and bigram features on a set of data using provided sets of features.

    count_func: either count_freq or count_occur, indicating whether feature
                frequency or feature presence is used.
    data: the set of data to extract features from (a pandas dataframe).
    dictvec_uni: a set of unigram features (a DictVectorizer).
    dictvec_bi: a set of bigram features (a DictVectorizer).

    Returns (X, Y): the unigram and bigram matrices stacked side by side, and the labels.
    """
    extracted = [
        test_feat( count_func, data, column, vectorizer )
        for column, vectorizer in ( ('unigrams', dictvec_uni), ('bigrams', dictvec_bi) )
    ]
    (X_uni, _), (X_bi, Y) = extracted
    return hstack( (X_uni, X_bi) ), Y

## Example 1: Train and Evaluate a Unigram Model (considering frequency)

Training Set: fold 2 & fold 3; Test Set: fold 1.

In [None]:
import sklearn
import sklearn.metrics
from sklearn.naive_bayes import MultinomialNB

# Hold out fold 1 for testing; every other fold is used for training.
in_test_fold = data['fold'] == 1
data_train = data[~in_test_fold]
data_test = data[in_test_fold]

# Features are extracted and selected on the training split only, then reused on the test split.
# NOTE(review): 16165 is the cap on selected features; presumably chosen to match the
# unigram count in Pang et al. (2002) -- confirm.
X_train, Y_train, dictvec = train_select_feat( count_freq, data_train, 'unigrams', 16165 )
X_test, Y_test = test_feat( count_freq, data_test, 'unigrams', dictvec )

# fit() returns the fitted estimator, so construction and training can be chained.
classifier = MultinomialNB().fit( X_train, Y_train )
Y_pred = classifier.predict( X_test )
readable_eval( Y_test, Y_pred, ['pos', 'neg'] )

{'Accuracy': 0.7532188841201717,
 'F1 (neg)': 0.7547974413646056,
 'F1 (pos)': 0.751619870410367,
 'Precision (neg)': 0.75,
 'Precision (pos)': 0.7565217391304347,
 'Recall (neg)': 0.759656652360515,
 'Recall (pos)': 0.7467811158798283}

## Example 2: Train and Evaluate a Unigram+Bigram Model (considering presence)

Training Set: fold 2 & fold 3; Test Set: fold 1.

In [None]:
import sklearn
import sklearn.metrics
from sklearn.naive_bayes import MultinomialNB

# Hold out fold 1 for testing; every other fold is used for training.
in_test_fold = data['fold'] == 1
data_train = data[~in_test_fold]
data_test = data[in_test_fold]

# Unigram and bigram features are selected on the training split only (up to 16165 of each),
# then the same two feature sets are applied to the test split.
X_train, Y_train, dictvec_uni, dictvec_bi = train_select_feat_unibi( count_occur, data_train, 16165 )
X_test, Y_test = test_feat_unibi( count_occur, data_test, dictvec_uni, dictvec_bi )

# fit() returns the fitted estimator, so construction and training can be chained.
classifier = MultinomialNB().fit( X_train, Y_train )
Y_pred = classifier.predict( X_test )
readable_eval( Y_test, Y_pred, ['pos', 'neg'] )

{'Accuracy': 0.778969957081545,
 'F1 (neg)': 0.7621247113163971,
 'F1 (pos)': 0.7935871743486974,
 'Precision (neg)': 0.825,
 'Precision (pos)': 0.7443609022556391,
 'Recall (neg)': 0.7081545064377682,
 'Recall (pos)': 0.8497854077253219}