In [1]:
import pandas as pd

DATA_PATH: str = '../data'

In [2]:
sample: pd.DataFrame = pd.read_csv(f'{DATA_PATH}/imdb.eval.csv')

print(sample.head(4))
print(sample.describe(include='all'))

                                              review sentiment
0  Leonard Rossiter and Frances de la Tour carry ...  negative
1  A fashion designer trips over a cat and falls ...  negative
2  Personally, I think the movie is pretty good. ...  positive
3  <br /><br />As usual, I was really looking for...  negative
                                                   review sentiment
count                                                5000      5000
unique                                               4998         2
top     this is the worst film I've seen in a long lon...  negative
freq                                                    2      2515


In [3]:
# remove duplicates
sample.drop_duplicates(inplace=True)
print(sample.describe(include='all'))

                                                   review sentiment
count                                                4998      4998
unique                                               4998         2
top     Leonard Rossiter and Frances de la Tour carry ...  negative
freq                                                    1      2513


## Tokenize Text

In [4]:
import string

# tokenizer text convert to lowercase
def tokenize(column: pd.Series) -> pd.Series:

    return column\
    .str.translate(str.maketrans('', '', string.punctuation))\
    .str.replace(r'<[^<>]*>', '', regex=True)\
    .str.lower()\
    .str.split()

In [5]:
sample['token.uni_gram'] = tokenize(sample['review'])
print(sample.describe(include='all'))

                                                   review sentiment  \
count                                                4998      4998   
unique                                               4998         2   
top     Leonard Rossiter and Frances de la Tour carry ...  negative   
freq                                                    1      2513   

                                           token.uni_gram  
count                                                4998  
unique                                               4998  
top     [leonard, rossiter, and, frances, de, la, tour...  
freq                                                    1  


In [6]:
from collections import Counter

def calculate_absolute_frequencies(data: pd.DataFrame) -> dict:

    sentiment_token: dict = {}

    # iterate over each sentiment
    for label, group in data.groupby('sentiment'):

        # count most common words (absolute frequencies)
        count: pd.DataFrame = pd.DataFrame.from_records(
            list(dict(Counter(list(group['token.uni_gram'].explode()))).items()),
            columns=['token','n']
        )

        # filter stop words
        # count = count[~count['token'].isin(stop_words)]

        sentiment_token[label] = count

    return sentiment_token

In [7]:
#
def log_frequencies(data: dict, n:int = 16) -> None:
    for sentiment, count in data.items():
        print(f'name({sentiment}) || len({len(count)}) || sum({sum(count["n"])}) \n {count.head(n)}')

In [8]:
#
def get_most_common(polarity_words: dict, l: int = 1024) -> dict:

    common_polarity_words: dict = {}

    for sentiment, count in polarity_words.items():
        common_polarity_words[sentiment] = count.sort_values(by=['n'], ascending=False).head(l)

    return common_polarity_words

In [9]:
#
def remove_shared(polarity_words: dict):

    filtered_polarity_words: dict = {}

    all_words: list = [list(count["token"]) for _, count in polarity_words.items()]

    words_intersection: set = set.intersection(*map(set, all_words))

    for sentiment, count in polarity_words.items():
        filtered_polarity_words[sentiment] = count[~count['token'].isin(words_intersection)]


    return filtered_polarity_words

In [10]:
def calculate_relative_frequencies(polarity_words: dict) -> None:
    for sentiment, count in polarity_words.items():
            count['p'] = count['n'] / sum(count["n"])

In [11]:
sample_uni_gram_frequencies: dict = calculate_absolute_frequencies(sample)
most_common_frequencies: dict = get_most_common(sample_uni_gram_frequencies, 2048)
filtered_most_common_frequencies: dict = remove_shared(most_common_frequencies)
selected_most_common_frequencies: dict = get_most_common(filtered_most_common_frequencies, 64)
calculate_relative_frequencies(selected_most_common_frequencies)

log_frequencies(selected_most_common_frequencies)

name(negative) || len(64) || sum(4492) 
           token    n         p
1695      awful  315  0.070125
684       waste  266  0.059216
1536     poorly  123  0.027382
1898       lame  122  0.027159
375       badly  115  0.025601
1588     wasted   96  0.021371
912       fails   91  0.020258
2083       joke   91  0.020258
632        dumb   91  0.020258
677        fake   90  0.020036
6459   pathetic   89  0.019813
2504       mess   87  0.019368
4345    garbage   82  0.018255
4039  laughable   78  0.017364
2201  pointless   77  0.017142
1865     bother   75  0.016696
name(positive) || len(64) || sum(3353) 
             token   n         p
2990       superb  96  0.028631
826       journey  93  0.027736
2747       subtle  75  0.022368
1419     stunning  71  0.021175
758   beautifully  71  0.021175
1016    portrayal  70  0.020877
2194  outstanding  67  0.019982
3741     terrific  65  0.019386
498      feelings  65  0.019386
772      marriage  62  0.018491
557      touching  62  0.018491
3074   

In [12]:
def write_frequencies(path: str, polarity_words: dict) -> None:
    writer = pd.ExcelWriter(path)

    for i, (label, df) in enumerate(polarity_words.items()):
        df.to_excel(writer, label)
    writer.save()

In [15]:
write_frequencies(f'{DATA_PATH}/linguistic_model/custom.classifier.polarity_words.uni_gram.xlsx', selected_most_common_frequencies)