In [3]:
import numpy as np
import pandas as pd
import nltk
from nltk import word_tokenize
from nltk import bigrams
from nltk import FreqDist
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from textblob import TextBlob
from nltk.corpus import stopwords


In [4]:
# Do not truncate long strings in displayed frames, so whole reviews are readable.
pd.set_option('display.max_colwidth', None)

# Location of the labelled reviews CSV (columns: text, label).
# NOTE(review): '/content/...' looks like a Google Colab upload path - change this
# single constant when running the notebook elsewhere.
DATA_PATH = '/content/imdb_labelled.csv'

DataFrame = pd.read_csv(DATA_PATH)
DataFrame.head()

Unnamed: 0,text,label
0,"A very, very, very slow-moving, aimless movie about a distressed, drifting young man.",0
1,"Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.",0
2,"Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.",0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.,1


In [5]:
DataFrame['label'].value_counts()

1    386
0    362
Name: label, dtype: int64

In [6]:
DataFrame.columns.tolist()

['text', 'label']

In [7]:
DataFrame.text[0]

'A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  '

In [None]:
nltk.download('punkt')

In [10]:
word_tokenize(DataFrame.text[0])

['A',
 'very',
 ',',
 'very',
 ',',
 'very',
 'slow-moving',
 ',',
 'aimless',
 'movie',
 'about',
 'a',
 'distressed',
 ',',
 'drifting',
 'young',
 'man',
 '.']

In [11]:
Sample_tokens = word_tokenize(DataFrame.text[0])

In [12]:
list(bigrams(Sample_tokens))

[('A', 'very'),
 ('very', ','),
 (',', 'very'),
 ('very', ','),
 (',', 'very'),
 ('very', 'slow-moving'),
 ('slow-moving', ','),
 (',', 'aimless'),
 ('aimless', 'movie'),
 ('movie', 'about'),
 ('about', 'a'),
 ('a', 'distressed'),
 ('distressed', ','),
 (',', 'drifting'),
 ('drifting', 'young'),
 ('young', 'man'),
 ('man', '.')]

In [13]:
sample_bitokens = list(bigrams(Sample_tokens))
print(sample_bitokens)

[('A', 'very'), ('very', ','), (',', 'very'), ('very', ','), (',', 'very'), ('very', 'slow-moving'), ('slow-moving', ','), (',', 'aimless'), ('aimless', 'movie'), ('movie', 'about'), ('about', 'a'), ('a', 'distressed'), ('distressed', ','), (',', 'drifting'), ('drifting', 'young'), ('young', 'man'), ('man', '.')]


In [14]:
sample_bitokens

[('A', 'very'),
 ('very', ','),
 (',', 'very'),
 ('very', ','),
 (',', 'very'),
 ('very', 'slow-moving'),
 ('slow-moving', ','),
 (',', 'aimless'),
 ('aimless', 'movie'),
 ('movie', 'about'),
 ('about', 'a'),
 ('a', 'distressed'),
 ('distressed', ','),
 (',', 'drifting'),
 ('drifting', 'young'),
 ('young', 'man'),
 ('man', '.')]

In [16]:
FreqDist(Sample_tokens)

FreqDist({',': 4, 'very': 3, 'A': 1, 'slow-moving': 1, 'aimless': 1, 'movie': 1, 'about': 1, 'a': 1, 'distressed': 1, 'drifting': 1, ...})

In [17]:
sample_freqdist = FreqDist(Sample_tokens)
print(sample_freqdist)

<FreqDist with 13 samples and 18 outcomes>


In [18]:
sample_freqdist

FreqDist({',': 4, 'very': 3, 'A': 1, 'slow-moving': 1, 'aimless': 1, 'movie': 1, 'about': 1, 'a': 1, 'distressed': 1, 'drifting': 1, ...})

In [19]:
sample_freqdist.most_common(10)

[(',', 4),
 ('very', 3),
 ('A', 1),
 ('slow-moving', 1),
 ('aimless', 1),
 ('movie', 1),
 ('about', 1),
 ('a', 1),
 ('distressed', 1),
 ('drifting', 1)]

In [20]:
def top_n(text, n):
    """Return the ``n`` most frequent tokens of ``text``.

    The string is split with ``word_tokenize`` and counted with ``FreqDist``.
    Punctuation marks are tokens too, and counting is case-sensitive
    ('A' and 'a' are separate entries).

    Returns a list of ``(token, count)`` pairs from ``FreqDist.most_common``.
    """
    token_counts = FreqDist(word_tokenize(text))
    return token_counts.most_common(n)

top_n(DataFrame.text[2], 10)

[('and', 3),
 ('the', 3),
 ('-', 2),
 ('Attempting', 1),
 ('artiness', 1),
 ('with', 1),
 ('black', 1),
 ('&', 1),
 ('white', 1),
 ('clever', 1)]

In [21]:
top_n(DataFrame.text[1], 10)

[('the', 2),
 ('Not', 1),
 ('sure', 1),
 ('who', 1),
 ('was', 1),
 ('more', 1),
 ('lost', 1),
 ('-', 1),
 ('flat', 1),
 ('characters', 1)]

In [22]:
top_n(DataFrame.text[3], 10)

[('Very', 1),
 ('little', 1),
 ('music', 1),
 ('or', 1),
 ('anything', 1),
 ('to', 1),
 ('speak', 1),
 ('of', 1),
 ('.', 1)]

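A Document-Term Matrix (DTM) is a matrix that records how often each term occurs in each document of a collection: every row is a document, every column is a term, and each cell holds that term's count in that document.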

In [25]:
sentence_1 = 'He is walking down the street.'
sentence_2 = 'She walked up then walked down the street yesterday.'

In [26]:
def create_dtm(series):
    """Build a document-term matrix for a collection of text documents.

    Parameters
    ----------
    series : iterable of str
        The documents, e.g. a pandas Series of review texts.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per token found by
        ``CountVectorizer`` (default settings); each cell is the token count.
        Rows are labelled 0..n-1 in input order - the index of ``series`` is
        not carried over.
    """
    cv = CountVectorizer()
    dtm = cv.fit_transform(series)
    # toarray() yields a plain ndarray; todense() would yield numpy.matrix,
    # which NumPy no longer recommends.
    counts = dtm.toarray()
    features = cv.get_feature_names_out()
    return pd.DataFrame(counts, columns = features)

In [27]:
create_dtm(DataFrame.text.head())

Unnamed: 0,about,acting,aimless,almost,and,angles,anything,artiness,as,attempting,...,trying,very,walked,was,when,white,who,whom,with,young
0,1,0,1,0,0,0,0,0,0,0,...,0,3,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,1,1,0,0
2,0,1,0,1,3,1,0,1,1,1,...,0,0,0,1,0,1,0,0,1,0
3,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,1,0,0,0,0,0


In [33]:
def top_n_tokens(text, sentiment, n):
    """Return the ``n`` tokens with the largest logistic-regression coefficients.

    A ``CountVectorizer`` document-term matrix is built from ``text`` and a
    ``LogisticRegression`` is fitted on it against ``sentiment``. The model is
    refitted on every call.

    Returns a DataFrame with columns 'Tokens' and 'Coefficients', holding the
    ``n`` rows with the highest coefficient (first row of ``coef_``).
    """
    vectorizer = CountVectorizer()
    doc_term_matrix = vectorizer.fit_transform(text)

    classifier = LogisticRegression(solver = 'lbfgs', max_iter = 2500, random_state = 1234)
    classifier.fit(doc_term_matrix, sentiment)

    token_weights = pd.DataFrame({
        'Tokens' : vectorizer.get_feature_names_out(),
        'Coefficients' : classifier.coef_[0],
    })
    return token_weights.nlargest(n, 'Coefficients')

In [30]:
top_n_tokens(DataFrame.text, DataFrame.label, 5)

Unnamed: 0,Tokens,Coefficients
1567,liked,1.286747
2997,wonderful,1.242158
1104,funny,1.112821
1182,great,1.068772
2949,well,1.043139


In [31]:
top_n_tokens(DataFrame.text, DataFrame.label, 8)

Unnamed: 0,Tokens,Coefficients
1567,liked,1.286747
2997,wonderful,1.242158
1104,funny,1.112821
1182,great,1.068772
2949,well,1.043139
246,beautiful,1.042833
0,10,1.035405
344,brilliant,1.01408


In [32]:
top_n_tokens(DataFrame.text, DataFrame.label, 25)

Unnamed: 0,Tokens,Coefficients
1567,liked,1.286747
2997,wonderful,1.242158
1104,funny,1.112821
1182,great,1.068772
2949,well,1.043139
246,beautiful,1.042833
0,10,1.035405
344,brilliant,1.01408
908,excellent,1.009914
2203,right,0.985806


In [34]:
def bottom_n_tokens(text, sentiment, n):
    """Return the ``n`` tokens with the smallest logistic-regression coefficients.

    A ``CountVectorizer`` document-term matrix is built from ``text`` and a
    ``LogisticRegression`` is fitted on it against ``sentiment``. The model is
    refitted on every call.

    Returns a DataFrame with columns 'Tokens' and 'Coefficients', holding the
    ``n`` rows with the lowest coefficient (first row of ``coef_``).
    """
    vectorizer = CountVectorizer()
    doc_term_matrix = vectorizer.fit_transform(text)

    classifier = LogisticRegression(solver = 'lbfgs', max_iter = 2500, random_state = 1234)
    classifier.fit(doc_term_matrix, sentiment)

    token_weights = pd.DataFrame({
        'Tokens' : vectorizer.get_feature_names_out(),
        'Coefficients' : classifier.coef_[0],
    })
    return token_weights.nsmallest(n, 'Coefficients')

In [35]:
bottom_n_tokens(DataFrame.text, DataFrame.label, 4)

Unnamed: 0,Tokens,Coefficients
222,bad,-1.872751
211,awful,-1.334554
2530,stupid,-1.175416
441,cheap,-1.139512


In [36]:
bottom_n_tokens(DataFrame.text, DataFrame.label, 10)

Unnamed: 0,Tokens,Coefficients
222,bad,-1.872751
211,awful,-1.334554
2530,stupid,-1.175416
441,cheap,-1.139512
1802,no,-1.137234
893,even,-1.091436
3017,would,-1.047931
3012,worst,-1.039231
2923,waste,-1.038206
1819,nothing,-0.973472


In [39]:
def polarity_subjectivity(text, print_results = False):
    """Score ``text`` with TextBlob and either return or print the result.

    With ``print_results=False`` (the default) the function returns the tuple
    ``(polarity, subjectivity)`` taken from ``TextBlob(text).sentiment``.
    With ``print_results=True`` it prints both values rounded to two decimals
    and returns ``None``.
    """
    sentiment = TextBlob(text).sentiment
    polarity = sentiment[0]
    subjectivity = sentiment[1]

    if not print_results:
        return (polarity, subjectivity)

    print(f"Polarity is {round(polarity, 2)} and subjectivity is {round(subjectivity, 2)}.")

polarity_subjectivity(DataFrame.text[0], print_results = True)

Polarity is 0.18 and subjectivity is 0.4.


In [40]:
# Row 2 this time (row 0 is scored in the previous cell); the recorded output
# below (-0.12 / 0.51) is the score of DataFrame.text[2], not of text[0].
polarity_subjectivity(DataFrame.text[2], print_results = True)

Polarity is -0.12 and subjectivity is 0.51.


In [41]:

def token_count(string):
    """Return how many tokens ``word_tokenize`` finds in ``string``."""
    tokens = word_tokenize(string)
    return len(tokens)

def series_tokens(series):
    """Apply ``token_count`` to each element of a pandas Series.

    Returns a Series with the same index holding the per-element token counts.
    """
    per_row_counts = series.apply(token_count)
    return per_row_counts


In [42]:
series_tokens(DataFrame.text.head(10))

0    18
1    21
2    33
3     9
4    22
5    27
6     4
7    17
8     4
9    11
Name: text, dtype: int64

In [43]:
series_tokens(DataFrame.text.head(5))

0    18
1    21
2    33
3     9
4    22
Name: text, dtype: int64

In [44]:
def series_polarity_subjectivity(series):
    """Apply ``polarity_subjectivity`` to each element of a pandas Series.

    Each element is scored with the default ``print_results=False``, so the
    result is a Series of ``(polarity, subjectivity)`` tuples on the same index.
    """
    scores = series.apply(polarity_subjectivity)
    return scores


In [45]:
series_polarity_subjectivity(DataFrame['text'].head(10))

0                                 (0.18, 0.395)
1    (0.014583333333333337, 0.4201388888888889)
2    (-0.12291666666666666, 0.5145833333333333)
3                  (-0.24375000000000002, 0.65)
4                                    (1.0, 0.3)
5                                   (-0.1, 0.5)
6                                   (-0.2, 0.0)
7                     (0.7, 0.6000000000000001)
8                                   (-0.2, 0.5)
9                                    (0.7, 0.8)
Name: text, dtype: object

In [47]:
series_polarity_subjectivity(DataFrame['text'][15:20])

15    (0.023333333333333338, 0.2833333333333333)
16     (0.29285714285714287, 0.5380952380952381)
17                             (-0.0625, 0.1875)
18                                (0.675, 0.825)
19     (0.12841880341880338, 0.5698163168401262)
Name: text, dtype: object

In [48]:
def complexity(string):
    """Return the lexical complexity of ``string``: unique tokens / total tokens.

    Tokens come from ``word_tokenize``; punctuation counts as tokens and the
    comparison is case-sensitive. The value lies in (0, 1], where 1.0 means no
    token is repeated.

    A string that yields no tokens (e.g. an empty string) returns 0.0 instead
    of raising ``ZeroDivisionError``.
    """
    tokens = word_tokenize(string)  # tokenize once and reuse for both counts
    if not tokens:
        return 0.0
    return len(set(tokens)) / len(tokens)

In [50]:
DataFrame.text.head(5).apply(complexity)

0    0.722222
1    0.952381
2    0.848485
3    1.000000
4    1.000000
Name: text, dtype: float64

In [49]:
DataFrame.text[10:15].apply(complexity)

10    1.000000
11    0.944444
12    0.750000
13    1.000000
14    0.947368
Name: text, dtype: float64

In [52]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [53]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [54]:
english_stop_words = stopwords.words('english')

In [55]:
def stopword_remover(string, stop_words = None):
    """Tokenize ``string`` and drop the tokens that are stopwords.

    Parameters
    ----------
    string : str
        Text to tokenize with ``word_tokenize``.
    stop_words : iterable of str, optional
        Lower-case words to remove. Defaults to NLTK's English stopword list.
        Pass a precomputed collection to avoid reloading the list on every
        call (e.g. when applying this function over a whole Series).

    Returns
    -------
    list of str
        The remaining tokens in their original order and casing. Matching is
        case-insensitive (each token is lower-cased before the lookup).
        Punctuation tokens are kept.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # A frozenset gives O(1) membership tests instead of scanning a list per token.
    stop_words = frozenset(stop_words)
    return [w for w in word_tokenize(string) if w.lower() not in stop_words]

In [56]:
DataFrame.text.head(5).apply(stopword_remover)

0                                                                                                 [,, ,, slow-moving, ,, aimless, movie, distressed, ,, drifting, young, man, .]
1                                                                                                        [sure, lost, -, flat, characters, audience, ,, nearly, half, walked, .]
2    [Attempting, artiness, black, &, white, clever, camera, angles, ,, movie, disappointed, -, became, even, ridiculous, -, acting, poor, plot, lines, almost, non-existent, .]
3                                                                                                                                            [little, music, anything, speak, .]
4                                                                                                     [best, scene, movie, Gerardo, trying, find, song, keeps, running, head, .]
Name: text, dtype: object

In [57]:
DataFrame.text[6:11].apply(stopword_remover)

6                                                    [Wasted, two, hours, .]
7     [Saw, movie, today, thought, good, effort, ,, good, messages, kids, .]
8                                                      [bit, predictable, .]
9                       [Loved, casting, Jimmy, Buffet, science, teacher, .]
10                                                 [baby, owls, adorable, .]
Name: text, dtype: object

In [58]:
# str.isalpha() is True only when every character of the string is a letter,
# so a single '&' or '!' makes it False.
string_1, string_2, string_3 = (
    "TomAndJerryAreFun",
    "Tom&JerryAreFun",
    "TomAndJerryAreFun!",
)

print(
    f"String_1: {string_1.isalpha()}\n",
    f"String_2: {string_2.isalpha()}\n",
    f"String_3: {string_3.isalpha()}",
    sep = "\n",
)

String_1: True

String_2: False

String_3: False


In [59]:
def stopword_nonalpha_remover(string):
    """Return the non-stopword tokens of ``string`` that are purely alphabetic.

    Runs ``stopword_remover`` and then keeps only tokens for which
    ``str.isalpha()`` is True. This removes punctuation, and also any word
    containing a non-letter, such as hyphenated words ('slow-moving').
    """
    alphabetic_tokens = []
    for token in stopword_remover(string):
        if token.isalpha():
            alphabetic_tokens.append(token)
    return alphabetic_tokens


In [60]:
DataFrame.text.head().apply(stopword_nonalpha_remover)

0                                                                                                [aimless, movie, distressed, drifting, young, man]
1                                                                                    [sure, lost, flat, characters, audience, nearly, half, walked]
2    [Attempting, artiness, black, white, clever, camera, angles, movie, disappointed, became, even, ridiculous, acting, poor, plot, lines, almost]
3                                                                                                                  [little, music, anything, speak]
4                                                                           [best, scene, movie, Gerardo, trying, find, song, keeps, running, head]
Name: text, dtype: object

In [61]:
DataFrame.text[8:12].apply(stopword_nonalpha_remover)

8                                             [bit, predictable]
9              [Loved, casting, Jimmy, Buffet, science, teacher]
10                                        [baby, owls, adorable]
11    [movie, showed, lot, Florida, best, made, look, appealing]
Name: text, dtype: object