## Pattern Matching

In [1]:
# Imports
import pandas as pd
import itertools

from nltk import pos_tag, ngrams
from nltk.tokenize import word_tokenize
from collections import Counter

In [2]:
# Load Data
# Both files are read with explicit column names. Because `names` is passed
# without `header`, the first line of each CSV is treated as a data row.
stars_data = pd.read_csv("data/train_stars.csv", names=["Processed Review Text", "Review Text", "Stars"])
pol_data = pd.read_csv("data/train_polarity.csv", names=["Processed Review Text", "Review Text", "Polarity"])

### Pattern Recognition using POS Tagging

Tokenize reviews for POS Tagging

In [3]:
# Tokenize all reviews for pre-processing purposes
# NOTE: this overwrites the "Review Text" column in place with lists of tokens,
# so the raw strings are no longer available in pol_data after this cell.
# NOTE(review): presumably word_tokenize expects a string, which would make
# re-running this cell on the already tokenized column fail — confirm.
pol_data["Review Text"] = pol_data["Review Text"].apply(word_tokenize)
pol_data["Review Text"].head()

0    [i, was, not, able, to, complete, the, peer, g...
1    [language, was, understandable, enough, but, a...
2    [a, thorough, yet, concise, introduction, to, ...
3    [poor, labs, labs, do, not, teach, match, lect...
4                         [where, is, my, certificate]
Name: Review Text, dtype: object

Tag Reviews with POS

In [4]:
# Run the POS tagger over each tokenized review; every review becomes a list
# of (token, tag) pairs stored in a new "Tagged Text" column.
pol_data["Tagged Text"] = pol_data["Review Text"].apply(pos_tag)
pol_data["Tagged Text"].head()

0    [(i, NN), (was, VBD), (not, RB), (able, JJ), (...
1    [(language, NN), (was, VBD), (understandable, ...
2    [(a, DT), (thorough, JJ), (yet, RB), (concise,...
3    [(poor, JJ), (labs, NNS), (labs, NNS), (do, VB...
4    [(where, WRB), (is, VBZ), (my, PRP$), (certifi...
Name: Tagged Text, dtype: object

In [5]:
# Keep only the tag from every (token, tag) pair, producing one list of tags
# per review. Iterating over the column's values (instead of looking rows up
# with pol_data["Tagged Text"][i] for i in range(len(...))) avoids label-based
# lookups, which only work when the frame has a default 0..n-1 index.
reviews_tags = [
    [tag for _token, tag in tagged_review]
    for tagged_review in pol_data["Tagged Text"]
]

# Assigning a plain list fills the column positionally, row by row.
pol_data["POS Tags"] = reviews_tags
pol_data["POS Tags"].head()

0    [NN, VBD, RB, JJ, TO, VB, DT, NN, VBD, NN, DT,...
1    [NN, VBD, JJ, RB, CC, VBG, NNS, MD, VB, JJ, IN...
2    [DT, JJ, RB, JJ, NN, TO, DT, NN, CC, TO, DT, I...
3    [JJ, NNS, NNS, VBP, RB, VB, NN, NNS, JJ, NN, T...
4                                 [WRB, VBZ, PRP$, NN]
Name: POS Tags, dtype: object

Find n-gram patterns: collect the 5-, 6- and 7-grams of POS tags from the reviews of a given class

In [6]:
def getGrams(data, condition, value, pos_tags, sizes=(5, 6, 7)):
    """Collect the POS-tag n-grams of every review whose label equals `value`.

    Parameters
    ----------
    data : mapping of column name -> sequence of labels (e.g. a DataFrame)
        Only ``data[condition]`` is read.
    condition : str
        Name of the label column, e.g. "Polarity" or "Stars".
    value : object
        Label to select; a review is kept when its label ``== value``.
    pos_tags : sequence of sequences of str
        One tag sequence per review, in the same order as ``data[condition]``.
    sizes : iterable of int, optional
        N-gram lengths to extract. Defaults to (5, 6, 7), the lengths this
        function has always produced.

    Returns
    -------
    list of tuple
        All n-grams with duplicates kept (callers count them). They are
        grouped by size in the order given by `sizes`; within one size they
        follow review order and then position inside the review. Reviews
        shorter than a given size contribute nothing for that size.
    """
    # Pair labels and tag sequences by position rather than by index label:
    # looking rows up with data[condition][i] for i in range(len(...)) only
    # works when the index happens to be the default 0..n-1.
    selected_reviews = [
        tags for label, tags in zip(data[condition], pos_tags) if label == value
    ]

    # Outer loop over sizes keeps the historical ordering (all 5-grams, then
    # all 6-grams, then all 7-grams), which downstream tie-breaking in
    # Counter.most_common depends on.
    return [
        gram
        for size in sizes
        for tags in selected_reviews
        for gram in ngrams(tags, size)
    ]

#### Pattern Recognition for polarity

In [7]:
# All POS-tag n-grams from reviews labelled "neutral" (duplicates kept so they
# can be counted later), plus the set of distinct n-grams for membership tests.
unsorted_neutral_grams = getGrams(pol_data, "Polarity", "neutral", pol_data["POS Tags"])
neutral_grams = set(unsorted_neutral_grams)

In [8]:
# All POS-tag n-grams from reviews labelled "negative" (duplicates kept so they
# can be counted later), plus the set of distinct n-grams for membership tests.
unsorted_negative_grams = getGrams(pol_data, "Polarity", "negative", pol_data["POS Tags"])
negative_grams = set(unsorted_negative_grams)

In [9]:
# Keep every neutral n-gram occurrence whose pattern never shows up in a
# negative review.
unique_neutral = [x for x in unsorted_neutral_grams if x not in negative_grams]

# The 5000 most frequent neutral-only patterns as (pattern, count) pairs.
# most_common() already returns a new list ordered from the highest to the
# lowest count, so no extra list() copy or re-sort is needed.
mc_unique_neutral = Counter(unique_neutral).most_common(5000)
mc_unique_neutral[:5]

[(('VB', 'VBN', 'JJR', 'JJ', 'NN'), 23),
 (('MD', 'VB', 'VBN', 'JJR', 'JJ', 'NN'), 22),
 (('DT', 'JJ', 'NN', 'MD', 'VB', 'JJR'), 18),
 (('JJ', 'IN', 'IN', 'PRP', 'VBD'), 17),
 (('NN', 'VBD', 'IN', 'EX', 'VBD'), 17)]

In [10]:
# Keep every negative n-gram occurrence whose pattern never shows up in a
# neutral review.
unique_negative = [x for x in unsorted_negative_grams if x not in neutral_grams]

# The 5000 most frequent negative-only patterns as (pattern, count) pairs.
# most_common() already returns a new list ordered from the highest to the
# lowest count, so no extra list() copy or re-sort is needed.
mc_unique_negative = Counter(unique_negative).most_common(5000)
mc_unique_negative[:5]

[(('NN', 'CD', 'RB', 'CD', 'NNS'), 90),
 (('CD', 'RB', 'CD', 'NNS', 'TO'), 90),
 (('NN', 'CD', 'RB', 'CD', 'NNS', 'TO'), 90),
 (('CD', 'RB', 'CD', 'NNS', 'TO', 'VB'), 90),
 (('NN', 'CD', 'RB', 'CD', 'NNS', 'TO', 'VB'), 90)]

In [11]:
def unique_patterns(most_common):
    """Strip the counts from a sequence of (pattern, count) entries.

    Returns a list holding, for every entry in the given order, the entry's
    first element converted to a list.
    """
    return [list(entry[0]) for entry in most_common]

In [12]:
# Drop the counts: each result is a list of tag lists, in the same order as
# the (pattern, count) pairs it was built from.
neutral_patterns = unique_patterns(mc_unique_neutral)
negative_patterns = unique_patterns(mc_unique_negative)

In [13]:
# One column per polarity class. Each pattern list is wrapped in an
# object-dtype Series so that a class with fewer distinct patterns than the
# other is padded with NaN; passing the raw lists makes the DataFrame
# constructor raise as soon as their lengths differ.
patterns_df = pd.DataFrame(
    data = {
        "neutral_patterns": pd.Series(neutral_patterns, dtype=object),
        "negative_patterns": pd.Series(negative_patterns, dtype=object)
    }
)

# Written without index and without a header row: column order (neutral,
# negative) is the only thing identifying the columns in the file.
patterns_df.to_csv('data/polarity_patterns.csv', index=False, header=False)

#### Pattern Recognition for star rating

Tokenize reviews for POS Tagging

In [14]:
# Tokenize all reviews for pre-processing purposes
# NOTE: this overwrites the "Review Text" column in place with lists of tokens,
# so the raw strings are no longer available in stars_data after this cell.
# NOTE(review): presumably word_tokenize expects a string, which would make
# re-running this cell on the already tokenized column fail — confirm.
stars_data["Review Text"] = stars_data["Review Text"].apply(word_tokenize)
stars_data["Review Text"].head()

0                       [buddhism, conversion, course]
1    [much, respect, for, richard, shell, he, talks...
2    [nothing, practical, just, talking, and, lots,...
3    [i, enjoyed, the, videos, but, not, the, assig...
4    [the, course, is, well, structured, with, enou...
Name: Review Text, dtype: object

Tag Reviews with POS

In [15]:
# Run the POS tagger over each tokenized review; every review becomes a list
# of (token, tag) pairs stored in a new "Tagged Text" column.
stars_data["Tagged Text"] = stars_data["Review Text"].apply(pos_tag)
stars_data["Tagged Text"].head()

0     [(buddhism, NN), (conversion, NN), (course, NN)]
1    [(much, JJ), (respect, NN), (for, IN), (richar...
2    [(nothing, NN), (practical, JJ), (just, RB), (...
3    [(i, NN), (enjoyed, VBD), (the, DT), (videos, ...
4    [(the, DT), (course, NN), (is, VBZ), (well, RB...
Name: Tagged Text, dtype: object

In [16]:
# Keep only the tag from every (token, tag) pair, producing one list of tags
# per review. Iterating over the column's values (instead of looking rows up
# with stars_data["Tagged Text"][i] for i in range(len(...))) avoids
# label-based lookups, which only work when the frame has a default 0..n-1 index.
reviews_tags = [
    [tag for _token, tag in tagged_review]
    for tagged_review in stars_data["Tagged Text"]
]

# Assigning a plain list fills the column positionally, row by row.
stars_data["POS Tags"] = reviews_tags
stars_data["POS Tags"].head()

0                                         [NN, NN, NN]
1    [JJ, NN, IN, NN, NN, PRP, VBZ, CC, VBZ, PRP, P...
2                  [NN, JJ, RB, VBG, CC, NNS, IN, VBG]
3    [NN, VBD, DT, NNS, CC, RB, DT, NNS, RB, VBD, T...
4    [DT, NN, VBZ, RB, VBN, IN, JJ, NN, VBN, IN, DT...
Name: POS Tags, dtype: object

In [17]:
# All POS-tag n-grams from 1-star reviews (duplicates kept for counting),
# plus the set of distinct n-grams for membership tests.
unsorted_onestar_grams = getGrams(stars_data, "Stars", 1, stars_data["POS Tags"])
onestar_grams = set(unsorted_onestar_grams)

In [18]:
# All POS-tag n-grams from 2-star reviews (duplicates kept for counting),
# plus the set of distinct n-grams for membership tests.
unsorted_twostar_grams = getGrams(stars_data, "Stars", 2, stars_data["POS Tags"])
twostar_grams = set(unsorted_twostar_grams)

In [19]:
# All POS-tag n-grams from 3-star reviews (duplicates kept for counting),
# plus the set of distinct n-grams for membership tests.
unsorted_threestar_grams = getGrams(stars_data, "Stars", 3, stars_data["POS Tags"])
threestar_grams = set(unsorted_threestar_grams)

In [20]:
# All POS-tag n-grams from 4-star reviews (duplicates kept for counting),
# plus the set of distinct n-grams for membership tests.
unsorted_fourstar_grams = getGrams(stars_data, "Stars", 4, stars_data["POS Tags"])
fourstar_grams = set(unsorted_fourstar_grams)

In [21]:
# All POS-tag n-grams from 5-star reviews (duplicates kept for counting),
# plus the set of distinct n-grams for membership tests.
unsorted_fivestar_grams = getGrams(stars_data, "Stars", 5, stars_data["POS Tags"])
fivestar_grams = set(unsorted_fivestar_grams)

In [22]:
# Keep every 1-star n-gram occurrence whose pattern never shows up in a
# 2-star review (only the neighbouring rating is excluded).
unique_onestar = [x for x in unsorted_onestar_grams if x not in twostar_grams]

# The 2000 most frequent remaining patterns as (pattern, count) pairs.
# most_common() already returns a new list ordered from the highest to the
# lowest count, so no extra list() copy or re-sort is needed.
mc_unique_onestar = Counter(unique_onestar).most_common(2000)
mc_unique_onestar[:5]

[(('NN', 'CD', 'RB', 'CD', 'NNS'), 99),
 (('CD', 'RB', 'CD', 'NNS', 'TO'), 99),
 (('NN', 'CD', 'RB', 'CD', 'NNS', 'TO'), 99),
 (('CD', 'RB', 'CD', 'NNS', 'TO', 'VB'), 99),
 (('NN', 'CD', 'RB', 'CD', 'NNS', 'TO', 'VB'), 99)]

In [23]:
# Keep every 2-star n-gram occurrence whose pattern shows up in neither the
# 1-star nor the 3-star reviews (only the neighbouring ratings are excluded).
unique_twostar = [x for x in unsorted_twostar_grams if x not in onestar_grams and x not in threestar_grams]

# The 2000 most frequent remaining patterns as (pattern, count) pairs.
# most_common() already returns a new list ordered from the highest to the
# lowest count, so no extra list() copy or re-sort is needed.
mc_unique_twostar = Counter(unique_twostar).most_common(2000)
mc_unique_twostar[:5]

[(('CD', 'CD', 'CD', 'CD', 'CD'), 24),
 (('CD', 'CD', 'CD', 'CD', 'CD', 'CD'), 21),
 (('CD', 'CD', 'CD', 'CD', 'CD', 'CD', 'CD'), 18),
 (('VB', 'NN', 'RB', 'RB', 'IN'), 14),
 (('VBD', 'VBN', 'RB', 'IN', 'PRP'), 13)]

In [24]:
# Keep every 3-star n-gram occurrence whose pattern shows up in neither the
# 2-star nor the 4-star reviews (only the neighbouring ratings are excluded).
unique_threestar = [x for x in unsorted_threestar_grams if x not in twostar_grams and x not in fourstar_grams]

# The 2000 most frequent remaining patterns as (pattern, count) pairs.
# most_common() already returns a new list ordered from the highest to the
# lowest count, so no extra list() copy or re-sort is needed.
mc_unique_threestar = Counter(unique_threestar).most_common(2000)
mc_unique_threestar[:5]

[(('JJ', 'NN', 'IN', 'VBN', 'TO'), 9),
 (('NN', 'MD', 'VB', 'NNS', 'VB'), 8),
 (('VBP', 'NN', 'CC', 'DT', 'NN'), 8),
 (('NN', 'NNS', 'CC', 'VBP', 'TO'), 8),
 (('NN', 'RB', 'IN', 'PRP', 'PRP'), 8)]

In [25]:
# Keep every 4-star n-gram occurrence whose pattern shows up in neither the
# 3-star nor the 5-star reviews (only the neighbouring ratings are excluded).
unique_fourstar = [x for x in unsorted_fourstar_grams if x not in threestar_grams and x not in fivestar_grams]

# The 2000 most frequent remaining patterns as (pattern, count) pairs.
# most_common() already returns a new list ordered from the highest to the
# lowest count, so no extra list() copy or re-sort is needed.
mc_unique_fourstar = Counter(unique_fourstar).most_common(2000)
mc_unique_fourstar[:5]

[(('VB', 'DT', 'NN', 'DT', 'CD'), 9),
 (('TO', 'VB', 'DT', 'NNS', 'PRP', 'VBP'), 9),
 (('VB', 'JJ', 'TO', 'VB', 'JJR', 'NNS'), 9),
 (('NNS', 'RB', 'VBP', 'RB', 'VBN'), 7),
 (('PRP', 'CD', 'NNS', 'RB', 'IN'), 7)]

In [26]:
# Keep every 5-star n-gram occurrence whose pattern never shows up in a
# 4-star review (only the neighbouring rating is excluded).
unique_fivestar = [x for x in unsorted_fivestar_grams if x not in fourstar_grams]

# The 2000 most frequent remaining patterns as (pattern, count) pairs.
# most_common() already returns a new list ordered from the highest to the
# lowest count, so no extra list() copy or re-sort is needed.
mc_unique_fivestar = Counter(unique_fivestar).most_common(2000)
mc_unique_fivestar[:5]

[(('FW', 'FW', 'FW', 'FW', 'FW', 'FW', 'FW'), 24),
 (('NNP', 'NNP', 'NNP', 'NNP', 'NNP'), 19),
 (('VBD', 'CD', 'IN', 'DT', 'JJS'), 18),
 (('JJ', 'NN', 'NN', 'PRP', 'TO'), 17),
 (('NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP'), 16)]

In [27]:
# Drop the counts: each result is a list of tag lists, in the same order as
# the (pattern, count) pairs it was built from.
onestar_patterns = unique_patterns(mc_unique_onestar)
twostar_patterns = unique_patterns(mc_unique_twostar)
threestar_patterns = unique_patterns(mc_unique_threestar)
fourstar_patterns = unique_patterns(mc_unique_fourstar)
fivestar_patterns = unique_patterns(mc_unique_fivestar)

In [28]:
# One column per star rating. Each pattern list is wrapped in an object-dtype
# Series so that ratings with fewer distinct patterns are padded with NaN;
# passing the raw lists makes the DataFrame constructor raise as soon as
# their lengths differ.
star_patterns_df = pd.DataFrame(
    data = {
        "onestar_patterns": pd.Series(onestar_patterns, dtype=object),
        "twostar_patterns": pd.Series(twostar_patterns, dtype=object),
        "threestar_patterns": pd.Series(threestar_patterns, dtype=object),
        "fourstar_patterns": pd.Series(fourstar_patterns, dtype=object),
        "fivestar_patterns": pd.Series(fivestar_patterns, dtype=object),
    }
)

# Written without index and without a header row: column order (one to five
# stars) is the only thing identifying the columns in the file.
star_patterns_df.to_csv('data/star_patterns.csv', index=False, header=False)