In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import joblib
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from scipy.sparse import hstack
import warnings

import xml.etree.ElementTree as ET
import numpy as np
import re

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

from nltk.corpus import stopwords
from string import punctuation

from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
# path = 'drive/MyDrive/CS4248 group project/ABSA_data/Restaurants_Train.xml'
# path_test = 'drive/MyDrive/CS4248 group project/ABSA_data/Restaurants_Test.xml'
path = 'drive/MyDrive/CS4248 group project/ABSA_data/Laptops_Train.xml'
path_test = 'drive/MyDrive/CS4248 group project/ABSA_data/Laptops_Test.xml'
def load_data(path):
  data = []
  with open(path, 'r') as xml_file:
    tree = ET.parse(xml_file)
    sentences = tree.getroot()

    for sent in sentences:
      record = dict()
      record["id"] = sent.attrib['id']
      record["text"] = sent.findall(".//text")[0].text
      record["aspectTerms"] = []
      record["aspectCats"] = []

      aspectTerms = sent.findall(".//aspectTerms")
      aspectCats = sent.findall(".//aspectCategories")
    
      if aspectTerms:
        record["aspectTerms"] = [term.attrib for term in sent.findall(".//aspectTerms")[0]]
      if aspectCats:
        record["aspectCats"] = [cat.attrib for cat in sent.findall(".//aspectCategories")[0]]

      data.append(record)
    return pd.DataFrame(data)

In [4]:
train_dataframe = load_data(path)
test_dataframe =load_data(path_test)

In [5]:
train_text_list = train_dataframe['text']
train_aspects_list = list(train_dataframe['aspectTerms'])
print(train_text_list.head())
print(train_aspects_list[:5])

0    I charge it at night and skip taking the cord ...
1    I bought a HP Pavilion DV4-1222nr laptop and h...
2    The tech guy then said the service center does...
3    I investigated netbooks and saw the Toshiba NB...
4    The other day I had a presentation to do for a...
Name: text, dtype: object
[[{'term': 'cord', 'polarity': 'neutral', 'from': '41', 'to': '45'}, {'term': 'battery life', 'polarity': 'positive', 'from': '74', 'to': '86'}], [], [{'term': 'service center', 'polarity': 'negative', 'from': '27', 'to': '41'}, {'term': '"sales" team', 'polarity': 'negative', 'from': '109', 'to': '121'}, {'term': 'tech guy', 'polarity': 'neutral', 'from': '4', 'to': '12'}], [], []]


In [6]:
def pos_tag(review):
    tagged_text_list = []
    # pos_tagger = POSTagger()
    for text in review:
        tagged_text_list.append(pos_tagger(text))
    return tagged_text_list

def filter_tag(tagged_reviews):
    filtered_list = []
    # pos_tagger = POSTagger()
    for tagged_review in tagged_reviews:
        filtered_list.append(filter_pos_tag(tagged_review))
    return filtered_list

In [7]:
def pos_tagger(text):
      text = nltk.word_tokenize(text)
      stopwords_en = stopwords.words('english')
      stopwords_en_withpunct = set(stopwords_en).union(set(punctuation))
      text = [word for word in text if word not in stopwords_en_withpunct]
      tagged_pos_list = nltk.pos_tag(text)
      return tagged_pos_list

def filter_pos_tag(tagged_text):
      final_text_list = []
      matching_tag = ['NN','NNS','NNP','NNPS','RB','RBR','RBS','JJ','JJR','JJS','VB','VBD','VBG','VBN','VBP','VBZ']

      for word, tag in tagged_text:
          final_text = []
          if tag in matching_tag:
              final_text.append(word)
              final_text_list.append((' '.join(final_text)))
      return ' '.join(final_text_list)


In [8]:
tagged_text_list_train = pos_tag(train_text_list)

In [9]:
final_train_text_list = filter_tag(tagged_text_list_train)

print(final_train_text_list[:5])

['charge night skip taking cord good battery life', 'bought HP Pavilion DV4-1222nr laptop many problems computer', 'tech guy said service center 1-to-1 exchange direct concern sales team retail shop bought netbook', 'investigated netbooks saw Toshiba NB305-N410BL', 'day presentation seminar large conference town- lots people little time prep set computer projector etc']


In [10]:
#Selecting only 20 most common aspect.
def get_most_common_aspect(aspect_list):
    import nltk
    aspect_terms = []

    aspect_list = list(aspect_list.aspectTerms)

    for inner_list in aspect_list:
        if inner_list is not None:
            for _dict in inner_list:
                # for key in _dict:
                aspect_terms.append(_dict.get('term'))

    most_common_aspect = [k for k, v in nltk.FreqDist(aspect_terms).most_common(1000)]
    return most_common_aspect

In [11]:
# generate data frame
def get_data_frame(text_list, train_aspects_list, most_common_aspect):
    # data = {'Text': text_list}
    df = pd.DataFrame(columns=most_common_aspect)
    df["Text"] = text_list
    for inner_list in train_aspects_list:
        if inner_list is not None:
            for _dict in inner_list:
                # for key in _dict:
                if _dict.get('term') in most_common_aspect:
                    df.loc[train_aspects_list.index(inner_list), _dict.get('term')] = 1.0
    return df

In [12]:
# generate data frame for aspect extraction task
def get_aspect_data_frame(df, most_common_aspect):
  
    for common_aspect in most_common_aspect:
        df[common_aspect] = df[common_aspect].replace(['positive','negative','neutral','conflict'],[1,1,1,1])
    df = df.fillna(0)
    return df

In [13]:
most_common_aspect = get_most_common_aspect(train_dataframe)

In [14]:
df_train = get_data_frame(final_train_text_list,train_aspects_list, most_common_aspect)
df_train_aspect = get_aspect_data_frame(df_train, most_common_aspect)
df_train_aspect = df_train_aspect.reindex(sorted(df_train_aspect.columns), axis=1)

In [15]:
test_text_list = test_dataframe['text']
test_aspects_list = list(test_dataframe['aspectTerms'])

In [16]:
tagged_text_list_test = pos_tag(test_text_list)

In [17]:
final_test_text_list = filter_tag(tagged_text_list_test)

In [18]:
df_test= get_data_frame(final_test_text_list,test_aspects_list, most_common_aspect)
df_test_aspect= get_aspect_data_frame(df_test, most_common_aspect)
df_test_aspect = df_test_aspect.reindex(sorted(df_test_aspect.columns), axis=1)

In [19]:
df_train_aspect = df_train_aspect.sample(frac=1).reset_index(drop=True) # For randomization
X_train= df_train_aspect.Text
y_train = df_train_aspect.drop('Text', 1)

df_test_aspect = df_test_aspect.sample(frac=1).reset_index(drop=True) # For randomization
X_test = df_test_aspect.Text
y_test = df_test_aspect.drop('Text', 1)
final_most_common_aspect = list(y_train)

  This is separate from the ipykernel package so we can avoid doing imports until
  import sys


In [20]:
y_train = np.asarray(y_train, dtype=np.int64)
y_test = np.asarray(y_test, dtype=np.int64)
print(y_train[:5])

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [21]:
# Generate word vecotors using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# from nltk import word_tokenize          
# from nltk.stem import WordNetLemmatizer 
vect = CountVectorizer(max_df=1.0, stop_words='english') 
# vect = TfidfVectorizer(max_df=1.0, stop_words='english')   
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

In [22]:
%%time
nb_classif = OneVsRestClassifier(MultinomialNB()).fit(X_train_dtm, y_train)

CPU times: user 2.08 s, sys: 44 ms, total: 2.12 s
Wall time: 2.12 s


In [23]:
%%time
C = 1.0 
svc = OneVsRestClassifier(svm.SVC(kernel='linear', C=C)).fit(X_train_dtm, y_train)

CPU times: user 13.5 s, sys: 151 ms, total: 13.6 s
Wall time: 13.7 s


In [24]:
%%time
lin_svc = OneVsRestClassifier(svm.LinearSVC(C=C)).fit(X_train_dtm, y_train)

CPU times: user 3.74 s, sys: 160 ms, total: 3.9 s
Wall time: 3.89 s


In [25]:
%%time
sgd = OneVsRestClassifier(SGDClassifier(max_iter=1000)).fit(X_train_dtm,y_train)

CPU times: user 3.26 s, sys: 23.1 ms, total: 3.28 s
Wall time: 3.29 s


In [26]:
# Predict the test data using classifiers
y_pred_class = nb_classif.predict(X_test_dtm)
y_pred_class_svc = svc.predict(X_test_dtm)
y_pred_class_lin_svc = lin_svc.predict(X_test_dtm)
y_pred_class_sgd = sgd.predict(X_test_dtm)

In [27]:
from sklearn import metrics

In [28]:
print(metrics.accuracy_score(y_test,y_pred_class))
print(metrics.accuracy_score(y_test,y_pred_class_svc))
print(metrics.accuracy_score(y_test,y_pred_class_lin_svc))
print(metrics.accuracy_score(y_test,y_pred_class_sgd))

0.685
0.73875
0.73875
0.69


In [29]:
print(metrics.precision_score(y_test,y_pred_class,average='micro'))
print(metrics.precision_score(y_test,y_pred_class_svc,average='micro'))
print(metrics.precision_score(y_test,y_pred_class_lin_svc,average='micro'))
print(metrics.precision_score(y_test,y_pred_class_sgd,average='micro'))

0.8333333333333334
0.6735751295336787
0.6572769953051644
0.5016949152542373


In [30]:
print(metrics.recall_score(y_test,y_pred_class,average='micro'))
print(metrics.recall_score(y_test,y_pred_class_svc,average='micro'))
print(metrics.recall_score(y_test,y_pred_class_lin_svc,average='micro'))
print(metrics.recall_score(y_test,y_pred_class_sgd,average='micro'))

0.014534883720930232
0.37790697674418605
0.4069767441860465
0.43023255813953487


In [31]:
print(metrics.f1_score(y_test,y_pred_class,average='micro'))
print(metrics.f1_score(y_test,y_pred_class_svc,average='micro'))
print(metrics.f1_score(y_test,y_pred_class_lin_svc,average='micro'))
print(metrics.f1_score(y_test,y_pred_class_sgd,average='micro'))

0.02857142857142857
0.48417132216014896
0.5026929982046678
0.4632237871674491
