In [11]:
import pandas as pd
import ssl

# NOTE(review): this swaps the ssl module's default HTTPS context factory for
# the unverified one, which turns off TLS certificate verification for HTTPS
# requests made through the standard library for the rest of the kernel
# session. Presumably a workaround so the dataset download below succeeds on a
# machine with a broken certificate store; prefer fixing the certificates and
# deleting this cell.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This ssl module does not expose _create_unverified_context; keep the default behaviour.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

In [12]:
# Load the review dataset directly from the PyCaret GitHub repository (needs network access).
# NOTE(review): the URL points at the `master` branch, so the file can change between runs.
df = pd.read_csv('https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/amazon.csv')

In [13]:
# Show the first and last three rows to check the columns and the row count.
print(df.head(3))
print(df.tail(3))

                                          reviewText  Positive
0  This is a one of the best apps acording to a b...         1
1  This is a pretty good version of the game for ...         1
2  this is a really cool game. there are a bunch ...         1
                                              reviewText  Positive
19997  love it!  this game. is awesome. wish it had m...         1
19998  I love love love this app on my side of fashio...         1
19999  This game is a rip off. Here is a list of thin...         0


In [14]:
# Display the review text column.
df['reviewText']

0        This is a one of the best apps acording to a b...
1        This is a pretty good version of the game for ...
2        this is a really cool game. there are a bunch ...
3        This is a silly game and can be frustrating, b...
4        This is a terrific game on any pad. Hrs of fun...
                               ...                        
19995    this app is fricken stupid.it froze on the kin...
19996    Please add me!!!!! I need neighbors! Ginger101...
19997    love it!  this game. is awesome. wish it had m...
19998    I love love love this app on my side of fashio...
19999    This game is a rip off. Here is a list of thin...
Name: reviewText, Length: 20000, dtype: object

In [15]:
# Display the label column.
df['Positive']

0        1
1        1
2        1
3        1
4        1
        ..
19995    0
19996    1
19997    1
19998    1
19999    0
Name: Positive, Length: 20000, dtype: int64

In [16]:
# Select the row whose index label is 0.
df.loc[0]

reviewText    This is a one of the best apps acording to a b...
Positive                                                      1
Name: 0, dtype: object

In [17]:
# Check what kind of object a single-row .loc selection returns.
type(df.loc[0])

pandas.core.series.Series

In [18]:
# Review text of row 0. The column is addressed by name: using an integer key
# on the string-labelled row Series (df.loc[0][0]) relies on positional
# fallback, which pandas has deprecated since 2.1.
df.loc[0, 'reviewText']

'This is a one of the best apps acording to a bunch of people and I agree it has bombs eggs pigs TNT king pigs and realustic stuff'

In [19]:
# Label of row 0. The column is addressed by name: using an integer key on the
# string-labelled row Series (df.loc[0][1]) relies on positional fallback,
# which pandas has deprecated since 2.1.
df.loc[0, 'Positive']

1

In [20]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams

In [21]:
def preprocess(text, n):
    """Turn a review string into the set of its word n-grams of length 1..n.

    The text is lower-cased and tokenized, tokens that are not purely
    alphabetic or that are English stop words are dropped, the remaining
    tokens are lemmatized, and every contiguous n-gram of length 1 to ``n``
    is joined with single spaces.

    Args:
        text: Review text as a string.
        n: Largest n-gram length to generate; a value below 1 yields an
            empty set.

    Returns:
        A set of strings, one per distinct n-gram.
    """
    # Build the stop-word lookup once per call. Evaluating
    # stopwords.words('english') inside the filter re-read the list for every
    # token and searched it linearly.
    stop_words = set(stopwords.words('english'))

    # To lower case and tokenization
    tokens = word_tokenize(text.lower())

    # Stop word and punctuation removal
    filtered_tokens = [token for token in tokens if token.isalpha() and token not in stop_words]

    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # NGram generation
    ngram_set = set()

    for size in range(1, n + 1):
        ngram_set.update(' '.join(grams) for grams in ngrams(lemmatized_tokens, size))

    return ngram_set

In [22]:
# Replace every review string with its set of 1- to 3-grams (args=(3,) is preprocess's n).
# NOTE(review): the column is overwritten in place, so this cell cannot be re-run
# on its own: a second run would hand sets to preprocess, which calls text.lower().
df['reviewText'] = df['reviewText'].apply(preprocess, args=(3,))

In [23]:
# Token set of row 0 after preprocessing. The column is addressed by name:
# an integer key on the string-labelled row Series (df.loc[0][0]) relies on
# positional fallback, which pandas has deprecated since 2.1.
print(df.loc[0, 'reviewText'])

{'king', 'one best apps', 'realustic stuff', 'king pig realustic', 'tnt', 'apps acording bunch', 'king pig', 'bunch', 'best', 'tnt king pig', 'one', 'bomb', 'people agree', 'agree bomb egg', 'best apps acording', 'acording bunch', 'bomb egg pig', 'bomb egg', 'one best', 'egg', 'apps acording', 'best apps', 'tnt king', 'pig realustic stuff', 'pig tnt king', 'stuff', 'bunch people agree', 'acording', 'bunch people', 'apps', 'realustic', 'egg pig tnt', 'people', 'pig tnt', 'agree bomb', 'people agree bomb', 'pig', 'egg pig', 'acording bunch people', 'pig realustic', 'agree'}


In [24]:
from collections import defaultdict

In [25]:
def build_dictionary(df, attribute):
    """Build an inverted index from each token to the rows that contain it.

    Args:
        df: DataFrame whose ``attribute`` column holds an iterable of tokens
            per row.
        attribute: Name of the column to index.

    Returns:
        A ``defaultdict(set)`` mapping every token to the set of index labels
        of the rows in which it occurs.
    """
    dictionary = defaultdict(set)

    # Key the postings by index label, not by enumeration position: the sets
    # are later fed to .loc, which looks rows up by label. With the default
    # 0..n-1 index the two coincide.
    for label, record in df[attribute].items():
        for token in record:
            dictionary[token].add(label)

    return dictionary

In [26]:
# Index the n-grams, then report a sample of tokens, the vocabulary size, the
# number of reviews, and the size a dense token-by-review matrix would have.
dictionary = build_dictionary(df, 'reviewText')

print(list(dictionary)[:10])
print()
print('Number of distinct tokens:', len(dictionary))
print('Number of records:', len(df))
print()
print('Number of data:', format(len(dictionary) * len(df), ','))

['king', 'one best apps', 'realustic stuff', 'king pig realustic', 'tnt', 'apps acording bunch', 'king pig', 'bunch', 'best', 'tnt king pig']

Number of distinct tokens: 455309
Number of records: 20000

Number of data: 9,106,180,000


In [27]:
# One row per token: the token, the set of reviews that contain it, and the
# size of that set. The count is derived with a vectorised map instead of a
# hand-written accumulation loop.
df_dict = pd.DataFrame(list(dictionary.items()), columns=['Tokens', 'Messages'])
df_dict['Count'] = df_dict['Messages'].map(len)

In [28]:
# Preview the token table.
df_dict.head()

Unnamed: 0,Tokens,Messages,Count
0,king,"{0, 8960, 8961, 7556, 15879, 2953, 5643, 7437,...",33
1,one best apps,"{0, 7939, 14084, 8197, 11654, 6923, 14, 3214, ...",29
2,realustic stuff,{0},1
3,king pig realustic,{0},1
4,tnt,{0},1


In [29]:
# Number of tokens that occur in exactly one review.
df_dict['Count'].eq(1).sum()

404224

In [30]:
# Rows of the token table whose token occurs in exactly one review.
singletons = df_dict[df_dict['Count'].eq(1)]

In [31]:
def remove_singletons(df, singletons, attribute='reviewText'):
    """Strip the listed tokens from the token sets of the rows that hold them.

    The sets stored in ``df[attribute]`` are modified in place; nothing is
    returned.

    Args:
        df: DataFrame whose ``attribute`` column holds one set of tokens per
            row.
        singletons: DataFrame whose first column is the token to remove and
            whose second column is the collection of ``df`` index labels of
            the rows containing it. Further columns are ignored.
        attribute: Name of the token-set column in ``df``.
    """
    token_sets = df[attribute]

    # Read the two needed columns by position so any extra columns are
    # tolerated, and skip the per-row Series that iterrows would build.
    for token, messages in zip(singletons.iloc[:, 0], singletons.iloc[:, 1]):
        for message in messages:
            # discard rather than remove: running this a second time must not
            # raise KeyError for tokens that are already gone.
            token_sets.loc[message].discard(token)

In [32]:
# Drop every single-occurrence token from the review token sets (modifies df's sets in place).
remove_singletons(df, singletons)

In [33]:
# Show the token set of row 0 again, then rebuild the inverted index so the
# vocabulary statistics reflect the current token sets.
# The column is addressed by name: an integer key on the string-labelled row
# Series (df.loc[0][0]) relies on positional fallback, which pandas has
# deprecated since 2.1.
print(df.loc[0, 'reviewText'])

dictionary = build_dictionary(df, 'reviewText')

print()
print('Number of distinct tokens:', len(dictionary))
print('Number of records:', len(df))
print()
print('Number of data:', "{:,}".format(len(dictionary) * len(df)))

{'king', 'one best apps', 'bunch', 'best', 'one', 'bomb', 'one best', 'egg', 'best apps', 'stuff', 'bunch people', 'apps', 'people', 'pig', 'agree'}

Number of distinct tokens: 51085
Number of records: 20000

Number of data: 1,021,700,000


In [34]:
import numpy as np

In [35]:
def entropy(df):
    """Return the entropy, in nats, of the two-class split given by ``Positive``.

    Args:
        df: DataFrame with a ``Positive`` column. The column sum is taken as
            the number of positive rows, so it is expected to hold 0/1 labels.

    Returns:
        ``-(p_pos * ln(p_pos) + p_neg * ln(p_neg))`` with ``0 * ln(0)`` taken
        as 0. An empty frame yields 0.0.
    """
    total = len(df)

    if total == 0:
        # Without this guard both proportions are 0/0 = NaN, and the NaN
        # spreads into anything computed from the result.
        return 0.0

    pos = df['Positive'].sum()
    neg = total - pos

    h = 0.0

    for p in (pos / total, neg / total):
        if p != 0:
            h -= p * np.log(p)

    return h

In [36]:
# Entropy of the label over the whole dataset; used as the baseline for information gain.
H = entropy(df)
print(H)

0.5491705321540394


In [37]:
# Information gain of every token with respect to the Positive label:
#   IG(token) = H - |with|/N * H(with) - |without|/N * H(without)
# where "with" / "without" are the reviews that do / do not contain the token.
#
# Only the label counts of the two groups are needed, so they are derived from
# the token's posting set. Building df.loc[...] twice plus df.drop(...) for
# every token touched all rows once per token.
labels = df['Positive']
n_total = len(labels)
pos_total = labels.sum()


def _entropy_from_counts(pos, total):
    """Entropy in nats of a group with ``pos`` positive rows out of ``total``."""
    if total == 0:
        # An empty group contributes nothing (its weight is zero as well).
        return 0.0

    h = 0.0

    for p in (pos / total, (total - pos) / total):
        if p != 0:
            h -= p * np.log(p)

    return h


ig = dict()

for token, messages in dictionary.items():
    with_token = labels.loc[list(messages)]
    n_with = len(with_token)
    pos_with = with_token.sum()
    n_without = n_total - n_with

    ig[token] = H - n_with / n_total * _entropy_from_counts(pos_with, n_with)
    ig[token] -= n_without / n_total * _entropy_from_counts(pos_total - pos_with, n_without)

In [38]:
# Rank the tokens by information gain, highest first.
sorted_features = sorted(ig.items(), key=lambda x:x[1], reverse=True)
# How many of the top-ranked tokens to look at here.
threshold = 300

print(sorted_features[:threshold])

[('love', 0.03423713068659612), ('great', 0.02749101823553607), ('waste', 0.019433481851488965), ('easy', 0.018721601510420016), ('waste time', 0.01175151052995782), ('stupid', 0.009776049267213538), ('deleted', 0.009342441292153891), ('uninstalled', 0.008834937633378614), ('suck', 0.008764031035517927), ('easy use', 0.008499977811319726), ('use', 0.007622975410377264), ('boring', 0.007437891001108254), ('fun', 0.007197578226357582), ('best', 0.007026242071018096), ('worst', 0.006905205111989221), ('awesome', 0.006384358984120819), ('even', 0.006133812595569221), ('love app', 0.00584812569778792), ('useless', 0.005666189457107507), ('fix', 0.005224734309171808), ('would', 0.0051424909352617565), ('horrible', 0.005061346636603292), ('work great', 0.004817497581510843), ('nothing', 0.004798996375906084), ('bad', 0.004672398085784124), ('lot', 0.004449429711658404), ('great app', 0.004198364801874521), ('dumb', 0.00408293414032912), ('alarm', 0.004010418731947851), ('terrible', 0.00396834

In [39]:
# Tokens of the `threshold` highest-ranked entries. Slicing (instead of
# indexing positions 0..threshold-1) also works when the ranking holds fewer
# than `threshold` entries.
best_features = {token for token, _ in sorted_features[:threshold]}

In [40]:
def remove_useless_features(df, best_features):
    """Restrict every review's token set to the selected features.

    The ``reviewText`` column of ``df`` is replaced by the intersection of
    each row's token set with ``best_features``.

    Args:
        df: DataFrame whose ``reviewText`` column holds one set of tokens per
            row; the column is overwritten.
        best_features: Tokens to keep.

    Returns:
        The number of rows left with an empty token set.
    """
    kept = [tokens.intersection(best_features) for tokens in df['reviewText']]
    df['reviewText'] = kept

    return sum(1 for remaining in kept if len(remaining) == 0)

In [41]:
# Keep only the selected features in each review and report how many reviews
# are left without any token.
empty_records = remove_useless_features(df, best_features)
print(empty_records)

365


In [42]:
# Row 0 after feature selection.
df.loc[0]

reviewText    {best, one best}
Positive                     1
Name: 0, dtype: object

In [43]:
def to_numeric(df: pd.DataFrame, best_features, threshold):
    """Encode the token sets as a binary feature matrix plus a label vector.

    Args:
        df: DataFrame whose first column holds one set of tokens per row and
            whose second column holds the numeric class label.
        best_features: Collection of tokens to use as features.
        threshold: Minimum number of feature columns in the returned matrix.

    Returns:
        ``(data, target)``. ``data`` is a float array of shape
        ``(len(df), max(threshold, number of distinct features))`` in which
        ``data[r, c] == 1`` when the r-th row contains the c-th feature
        (features in sorted order). ``target`` is a float array with the
        labels in row order. Tokens that are not in ``best_features`` are
        ignored.
    """
    # Sort so each feature lands in the same column on every run; iterating a
    # set of strings directly gives an order that can differ between kernels.
    index = {feature: col for col, feature in enumerate(sorted(set(best_features)))}

    # Never allocate fewer columns than there are features to place.
    width = max(threshold, len(index))
    data = np.zeros((len(df), width))

    # Rows are addressed by position and columns through iloc: the frame's
    # index labels need not be 0..n-1, and integer keys on a string-labelled
    # row Series are a deprecated positional lookup in pandas.
    for row, tokens in enumerate(df.iloc[:, 0]):
        for token in tokens:
            col = index.get(token)
            if col is not None:
                data[row, col] = 1

    target = df.iloc[:, 1].to_numpy(dtype=float)

    return data, target

In [44]:
# Build the feature matrix and label vector used by the classifiers below.
data, target = to_numeric(df, best_features, threshold)

In [45]:
# Sanity check: number of features set in the first row of the matrix.
data[0].sum()

2.0

In [46]:
# Sanity check: first entry of the label vector.
target[0]

1.0

In [47]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [48]:
# 10-fold cross-validation of a decision tree. random_state is fixed so the
# scores are the same on every run; tree construction is otherwise randomised.
clf = DecisionTreeClassifier(random_state=0)
res = cross_val_score(clf, data, target, cv=10)

In [49]:
# Per-fold scores of the decision tree, with their mean and standard deviation.
print('Accuracies:', res)
print('Average accuracy:', res.mean())
print('Standard deviation:', res.std())

Accuracies: [0.804  0.788  0.81   0.8095 0.775  0.803  0.8075 0.791  0.8    0.8175]
Average accuracy: 0.80055
Standard deviation: 0.011936184482488527


In [50]:
from sklearn.naive_bayes import MultinomialNB

In [51]:
# 10-fold cross-validation of a multinomial naive Bayes classifier on the same features.
clf = MultinomialNB()
res = cross_val_score(clf, data, target, cv=10)

In [52]:
# Per-fold scores of the naive Bayes classifier, with their mean and standard deviation.
print('Accuracies:', res)
print('Average accuracy:', res.mean())
print('Standard deviation:', res.std())

Accuracies: [0.8775 0.879  0.875  0.875  0.8445 0.8635 0.8865 0.857  0.8955 0.874 ]
Average accuracy: 0.8727500000000001
Standard deviation: 0.013826152754833843


In [53]:
from sklearn.ensemble import RandomForestClassifier

In [54]:
# 10-fold cross-validation of a random forest. random_state is fixed so the
# scores are the same on every run; the forest is otherwise randomised.
clf = RandomForestClassifier(random_state=0)
res = cross_val_score(clf, data, target, cv=10)

In [55]:
# Per-fold scores of the random forest, with their mean and standard deviation.
print('Accuracies:', res)
print('Average accuracy:', res.mean())
print('Standard deviation:', res.std())

Accuracies: [0.858  0.846  0.862  0.8555 0.8245 0.838  0.86   0.855  0.8685 0.8665]
Average accuracy: 0.8533999999999999
Standard deviation: 0.01293213052826178


In [56]:
from sklearn.linear_model import LogisticRegression

In [57]:
# 10-fold cross-validation of a logistic regression classifier on the same features.
clf = LogisticRegression()
res = cross_val_score(clf, data, target, cv=10)

In [58]:
# Per-fold scores of the logistic regression classifier, with their mean and standard deviation.
print('Accuracies:', res)
print('Average accuracy:', res.mean())
print('Standard deviation:', res.std())

Accuracies: [0.884  0.8895 0.8765 0.8735 0.855  0.864  0.8805 0.87   0.896  0.894 ]
Average accuracy: 0.8783
Standard deviation: 0.01251039567719583
