<a href="https://colab.research.google.com/github/slee987/LIS640tmp/blob/main/lis640_week09.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1 Set up Environment in Google Colab

Run the following cells to install/upgrade the required packages and check if the installed versions meet the requirements.

In [None]:
# Upgrade packages in Google Colab.
# The version specifiers are quoted so the shell does not interpret `<` and `>` as redirections.

# upgrade pip itself first
!pip3 install pip --upgrade

# install/upgrade spacy, constrained to the 3.0.x series
!pip3 install 'spacy>=3.0.0,<3.1.0' --upgrade

# install/upgrade nltk, constrained to the 3.5.x series
!pip3 install 'nltk>=3.5,<3.6' --upgrade

# install/upgrade scikit-learn, constrained to the 0.24.x series
!pip3 install 'scikit-learn>=0.24.0,<0.25.0' --upgrade

# download spacy's small English model (en_core_web_sm); the next cell asserts that it is a 3.0.x release
!python3 -m spacy download en_core_web_sm

In [None]:
# Check the installed versions.
# We require: spacy == 3.0.x ; nltk == 3.5.x ; scikit-learn == 0.24.x ; spacy's en-core-web-sm == 3.0.x .

import re

pkgs = !pip3 list
versions = { pkg.split()[0]:pkg.split()[1] for pkg in pkgs if re.match( '(spacy\s.+)|(nltk\s.+)|(scikit-learn\s.+)|(en-core-web-sm\s.+)', pkg ) }

assert versions['spacy'][0:3] == '3.0'
assert versions['nltk'][0:3] == '3.5'
assert versions['scikit-learn'][0:4] == '0.24'
assert versions['en-core-web-sm'][0:3] == '3.0'

## 2 Explore the Dataset by Pang et al. (2002)

Download the raw dataset at:
http://www.cs.cornell.edu/people/pabo/movie-review-data/mix20_rand700_tokens.zip

In [None]:
import spacy

# Load the small English pipeline with the "parser" and "ner" components disabled.
# The feature extractors in this notebook only read token.lemma_, token.is_stop and token.is_punct.
nlp = spacy.load( "en_core_web_sm", disable=["parser", "ner"] )

In [None]:
import pandas as pd

# Load the review table; the first CSV column becomes the index.
# Later cells read its 'text', 'label' and 'fold' columns.
data = pd.read_csv( 'https://jiepujiang.github.io/data/pang2002.csv', index_col=0 )
# Optional: uncomment to keep only rows 650-749 (presumably a small subset for quick experiments -- TODO confirm).
# data = data[650:750]
# Display the DataFrame as the cell output.
data

## 3 Implement Pang et al. (2002)'s methods

https://www.cs.cornell.edu/home/llee/papers/sentiment.pdf

In [None]:
# Extract unigram features (considering word frequencies): row (1)

import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn import feature_selection
from collections import Counter, OrderedDict

def count_unigram_freq( rawtext, nlp ):
    """Return the lower-cased lemmas of rawtext, one entry per kept token.

    Tokens flagged as stop words or punctuation are dropped. Repeated words are
    kept, so the result can be counted to obtain word frequencies.

    Args:
        rawtext: the text to tokenize.
        nlp: callable that turns a text into an iterable of tokens exposing
            `lemma_`, `is_stop` and `is_punct`.

    Returns:
        A list of strings in token order.
    """
    lemmas = []
    for tok in nlp(rawtext):
        # Skip tokens that carry no content.
        if tok.is_stop or tok.is_punct:
            continue
        lemmas.append( tok.lemma_.lower() )
    return lemmas

def train_feat_unigram_freq( data, nlp, top_feats ):
    """Build the training matrix of unigram-frequency features and keep the best ones.

    Args:
        data: table with a 'text' column (raw documents) and a 'label' column.
        nlp: tokenizer callable forwarded to count_unigram_freq.
        top_feats: maximum number of features to keep, ranked by chi-squared score.

    Returns:
        (X_selected, Y, dict_selected): the reduced feature matrix, the label array,
        and the DictVectorizer restricted to the selected features (pass it to
        test_feat_unigram_freq to vectorize held-out data the same way).
    """
    dict_unigram = DictVectorizer()
    X = dict_unigram.fit_transform( Counter(count_unigram_freq(text, nlp)) for text in data['text'] )
    Y = np.array(data['label'])

    # SelectKBest computes the chi2 scores itself, so no separate chi2() call is needed.
    top = min( top_feats, X.shape[1] )  # k must not exceed the number of available features
    fsel = feature_selection.SelectKBest( score_func = feature_selection.chi2, k = top )
    X_selected = fsel.fit_transform( X, Y )
    # restrict() narrows the vectorizer's vocabulary to the selected columns.
    dict_selected = dict_unigram.restrict( fsel.get_support() )

    return X_selected, Y, dict_selected

def test_feat_unigram_freq( data, nlp, dict_unigram ):
    """Vectorize data with the given vectorizer, using unigram frequencies.

    Args:
        data: table with a 'text' column and a 'label' column.
        nlp: tokenizer callable forwarded to count_unigram_freq.
        dict_unigram: vectorizer whose transform() maps token counts to feature rows
            (e.g. the one returned by train_feat_unigram_freq).

    Returns:
        (X, Y): the feature matrix and the label array.
    """
    # One bag of word counts per document, produced lazily.
    bags = ( Counter(count_unigram_freq(doc, nlp)) for doc in data['text'] )
    features = dict_unigram.transform( bags )
    labels = np.array( data['label'] )
    return features, labels

In [None]:
# Extract unigram features (considering word occurrences only): row (2)

import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn import feature_selection
from collections import Counter, OrderedDict

def count_unigram_occur( rawtext, nlp ):
    """Return the set of distinct lower-cased lemmas found in rawtext.

    Tokens flagged as stop words or punctuation are ignored. Because the result is
    a set, each word is recorded once regardless of how often it appears.

    Args:
        rawtext: the text to tokenize.
        nlp: callable that turns a text into an iterable of tokens exposing
            `lemma_`, `is_stop` and `is_punct`.

    Returns:
        A set of strings.
    """
    seen = set()
    for tok in nlp(rawtext):
        if tok.is_stop or tok.is_punct:
            continue
        seen.add( tok.lemma_.lower() )
    return seen

def train_feat_unigram_occur( data, nlp, top_feats ):
    """Build the training matrix of unigram-occurrence (0/1) features and keep the best ones.

    Args:
        data: table with a 'text' column (raw documents) and a 'label' column.
        nlp: tokenizer callable forwarded to count_unigram_occur.
        top_feats: maximum number of features to keep, ranked by chi-squared score.

    Returns:
        (X_selected, Y, dict_selected): the reduced feature matrix, the label array,
        and the DictVectorizer restricted to the selected features (pass it to
        test_feat_unigram_occur to vectorize held-out data the same way).
    """
    dict_unigram = DictVectorizer()
    # Counter over a set gives every present word the value 1.
    X = dict_unigram.fit_transform( Counter(count_unigram_occur(text, nlp)) for text in data['text'] )
    Y = np.array(data['label'])

    # SelectKBest computes the chi2 scores itself, so no separate chi2() call is needed.
    top = min( top_feats, X.shape[1] )  # k must not exceed the number of available features
    fsel = feature_selection.SelectKBest( score_func = feature_selection.chi2, k = top )
    X_selected = fsel.fit_transform( X, Y )
    # restrict() narrows the vectorizer's vocabulary to the selected columns.
    dict_selected = dict_unigram.restrict( fsel.get_support() )

    return X_selected, Y, dict_selected

def test_feat_unigram_occur( data, nlp, dict_unigram ):
    """Vectorize data with the given vectorizer, using unigram occurrences.

    Args:
        data: table with a 'text' column and a 'label' column.
        nlp: tokenizer callable forwarded to count_unigram_occur.
        dict_unigram: vectorizer whose transform() maps token counts to feature rows
            (e.g. the one returned by train_feat_unigram_occur).

    Returns:
        (X, Y): the feature matrix and the label array.
    """
    # One bag of distinct words per document, produced lazily.
    bags = ( Counter(count_unigram_occur(doc, nlp)) for doc in data['text'] )
    features = dict_unigram.transform( bags )
    labels = np.array( data['label'] )
    return features, labels

In [None]:
# Extract unigram+bigram features (considering word occurrences only): row (3)

import numpy as np
from scipy.sparse import hstack
from sklearn.feature_extraction import DictVectorizer
from sklearn import feature_selection
from collections import Counter, OrderedDict

def count_bigram_occur( rawtext, nlp ):
    """Return the set of distinct bigrams ("left_right") of adjacent content words.

    Stop words and punctuation are first replaced by a placeholder, so a bigram is
    only produced when two kept tokens are directly next to each other in the text.

    Args:
        rawtext: the text to tokenize.
        nlp: callable that turns a text into an iterable of tokens exposing
            `lemma_`, `is_stop` and `is_punct`.

    Returns:
        A set of strings, each made of two lower-cased lemmas joined by '_'.
    """
    placeholder = '[OOV]'
    lemmas = []
    for tok in nlp(rawtext):
        if tok.is_stop or tok.is_punct:
            lemmas.append( placeholder )
        else:
            lemmas.append( tok.lemma_.lower() )

    bigrams = set()
    # Walk over every pair of neighbouring positions.
    for left, right in zip( lemmas, lemmas[1:] ):
        if left != placeholder and right != placeholder:
            bigrams.add( left + '_' + right )
    return bigrams

def _select_top_chi2( X, Y, vectorizer, top_feats ):
    """Keep at most top_feats columns of X (best chi-squared scores) and restrict vectorizer to them."""
    top = min( top_feats, X.shape[1] )  # k must not exceed the number of available features
    # SelectKBest computes the chi2 scores itself, so no separate chi2() call is needed.
    fsel = feature_selection.SelectKBest( score_func = feature_selection.chi2, k = top )
    X_selected = fsel.fit_transform( X, Y )
    return X_selected, vectorizer.restrict( fsel.get_support() )

def train_feat_unibi_occur( data, nlp, top_feats ):
    """Build the training matrix of unigram + bigram occurrence features.

    Unigram and bigram features are vectorized and selected independently, so up to
    top_feats unigrams AND up to top_feats bigrams are kept. The two selected
    matrices are then concatenated column-wise (unigram columns first).

    Args:
        data: table with a 'text' column (raw documents) and a 'label' column.
        nlp: tokenizer callable forwarded to count_unigram_occur / count_bigram_occur.
        top_feats: maximum number of features to keep per feature family.

    Returns:
        (X, Y, dict_unigram_selected, dict_bigram_selected): the stacked feature
        matrix, the label array, and the two restricted DictVectorizers (pass them
        to test_feat_unibi_occur to vectorize held-out data the same way).
    """
    dict_unigram = DictVectorizer()
    dict_bigram = DictVectorizer()
    X_unigram = dict_unigram.fit_transform( Counter(count_unigram_occur(text, nlp)) for text in data['text'] )
    X_bigram = dict_bigram.fit_transform( Counter(count_bigram_occur(text, nlp)) for text in data['text'] )
    Y = np.array(data['label'])

    X_unigram_selected, dict_unigram_selected = _select_top_chi2( X_unigram, Y, dict_unigram, top_feats )
    X_bigram_selected, dict_bigram_selected = _select_top_chi2( X_bigram, Y, dict_bigram, top_feats )

    return hstack((X_unigram_selected, X_bigram_selected)), Y, dict_unigram_selected, dict_bigram_selected

def test_feat_unibi_occur( data, nlp, dict_unigram, dict_bigram ):
    """Vectorize data with the given unigram and bigram vectorizers (occurrences only).

    Args:
        data: table with a 'text' column and a 'label' column.
        nlp: tokenizer callable forwarded to count_unigram_occur / count_bigram_occur.
        dict_unigram: vectorizer applied to the unigram bags.
        dict_bigram: vectorizer applied to the bigram bags.

    Returns:
        (X, Y): the unigram and bigram matrices stacked column-wise (unigrams first)
        and the label array.
    """
    unigram_bags = ( Counter(count_unigram_occur(doc, nlp)) for doc in data['text'] )
    unigram_part = dict_unigram.transform( unigram_bags )

    bigram_bags = ( Counter(count_bigram_occur(doc, nlp)) for doc in data['text'] )
    bigram_part = dict_bigram.transform( bigram_bags )

    labels = np.array( data['label'] )
    stacked = hstack( (unigram_part, bigram_part) )
    return stacked, labels

## 4 Set Up Experiment

In [None]:
import sklearn
import sklearn.metrics
from sklearn.naive_bayes import MultinomialNB

def eval_nb_unigram_freq( data_train, data_test, top_feats ):
    """Train and evaluate a multinomial Naive Bayes classifier on unigram-frequency features.

    Uses the notebook-level `nlp` pipeline for tokenization.

    Args:
        data_train: table with 'text' and 'label' columns used to fit the features and the model.
        data_test: table with 'text' and 'label' columns used for evaluation.
        top_feats: maximum number of unigram features to keep.

    Returns:
        The output of sklearn.metrics.precision_recall_fscore_support computed with
        labels=['pos', 'neg'].
    """
    # Feature extraction: fit on the training split, then reuse the fitted vocabulary on the test split.
    train_matrix, train_labels, vocab = train_feat_unigram_freq( data_train, nlp, top_feats )
    test_matrix, test_labels = test_feat_unigram_freq( data_test, nlp, vocab )

    # fit() returns the estimator itself, so prediction can be chained.
    predicted = MultinomialNB().fit( train_matrix, train_labels ).predict( test_matrix )

    return sklearn.metrics.precision_recall_fscore_support(
        test_labels,
        predicted,
        labels=['pos', 'neg'],
    )


In [None]:

# Hold out fold 1 as the test set, train on all other folds, and keep at most 1000 unigram features.
metrics = eval_nb_unigram_freq( data[data['fold']!=1], data[data['fold']==1], 1000 )
# Display the value returned by precision_recall_fscore_support (called with labels=['pos', 'neg']).
metrics

## 5 In-class Exercise: Cross-Validation

train[1,2]->test[3]

train[1,3]->test[2]

train[2,3]->test[1]

In [None]:
# your solution here

