In [3]:
import pandas as pd
features = pd.read_csv('dictionary.txt', sep = '|', header = None)
features.head()

Unnamed: 0,0,1
0,!,0
1,! ',22935
2,! '',18235
3,! Alas,179257
4,! Brilliant,22936


In [5]:
labels = pd.read_csv('sentiment_labels.txt', sep = '|')
labels.head()

Unnamed: 0,phrase ids,sentiment values
0,0,0.5
1,1,0.5
2,2,0.44444
3,3,0.5
4,4,0.42708


In [6]:
print(features.shape)
print(labels.shape)

(239232, 2)
(239232, 2)


In [8]:
features.rename(columns = {0:'phrase', 1 : 'id'}, inplace = True)
features.head()

Unnamed: 0,phrase,id
0,!,0
1,! ',22935
2,! '',18235
3,! Alas,179257
4,! Brilliant,22936


In [9]:
df = features.join(labels.set_index('phrase ids'), on='id')
df.head()

Unnamed: 0,phrase,id,sentiment values
0,!,0,0.5
1,! ',22935,0.52778
2,! '',18235,0.5
3,! Alas,179257,0.44444
4,! Brilliant,22936,0.86111


In [10]:
df.drop(columns=['id'], inplace=True)
df.head()

Unnamed: 0,phrase,sentiment values
0,!,0.5
1,! ',0.52778
2,! '',0.5
3,! Alas,0.44444
4,! Brilliant,0.86111


In [15]:
df_categorical = df.copy()


In [16]:
def categorize(df):
    if df['sentiment values'] >= 0 and df['sentiment values'] <= 0.2:
        return 'very negative'
    elif df['sentiment values'] > 0.2 and df['sentiment values'] <= 0.4:
        return 'negative'
    elif df['sentiment values'] > 0.4 and df['sentiment values'] <= 0.6:
        return 'neutral'
    elif df['sentiment values'] > 0.6 and df['sentiment values'] <= 0.8:
        return 'positive'
    else:
        return 'very positive'

In [17]:
df_categorical['sentiment values'].value_counts()

sentiment values
0.50000    45232
0.55556     8867
0.44444     8087
0.51389     7026
0.61111     6263
           ...  
0.17708        1
0.54861        1
0.14583        1
0.38194        1
0.86806        1
Name: count, Length: 142, dtype: int64

In [18]:
df_categorical['sentiment'] = df_categorical.apply(categorize, axis = 1)
df_categorical.drop(columns=['sentiment values'], inplace = True)
df_categorical.head()

Unnamed: 0,phrase,sentiment
0,!,neutral
1,! ',neutral
2,! '',neutral
3,! Alas,neutral
4,! Brilliant,very positive


In [19]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [20]:
df_categorical['sentiment'].value_counts()

sentiment
neutral          119449
positive          50148
negative          43028
very positive     15255
very negative     11352
Name: count, dtype: int64

Using Regular Expressions:
1)  Used for tokenization
2)  Useful methods: findall() and split()
3)  '\W', '\w' - words   '\S', '\s'  -  spaces.
4)  caps mean exclude, smalls mean include.
5)  '\w+' more than one character.

Pre-processing text data:
1)  Removing punctuations
2)  Tokenization
3)  Removing Stopwords
4)  Lemmatizing
5)  Stemming

In [21]:
pd.set_option('display.max_colwidth', 100)
df_categorical.tail()

Unnamed: 0,phrase,sentiment
239227,zoning ordinances to protect your community from the dullest science fiction,very negative
239228,zzzzzzzzz,very negative
239229,élan,neutral
239230,É,neutral
239231,É um passatempo descompromissado,neutral


Remove Punctuations:

In [22]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [24]:
df_categorical.describe()

Unnamed: 0,phrase,sentiment
count,239231,239232
unique,239231,5
top,!,neutral
freq,1,119449


In [25]:
df_categorical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239232 entries, 0 to 239231
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   phrase     239231 non-null  object
 1   sentiment  239232 non-null  object
dtypes: object(2)
memory usage: 3.7+ MB


In [32]:
# Remove null values and convert column to string for iteration.

df_categorical['phrase'] = df_categorical['phrase'].astype('string')
df_categorical.dropna(axis = 0, inplace=True)

df_categorical.info()

<class 'pandas.core.frame.DataFrame'>
Index: 239231 entries, 0 to 239231
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   phrase     239231 non-null  string
 1   sentiment  239231 non-null  object
 2   no_punct   239231 non-null  object
dtypes: object(2), string(1)
memory usage: 7.3+ MB


In [34]:
def removing_punctuations(text):
    res = "".join([char for char in text or "" if char not in string.punctuation])
    return res


df_categorical['no_punct'] = df_categorical['phrase'].apply(lambda x: removing_punctuations(x))

df_categorical.tail()

Unnamed: 0,phrase,sentiment,no_punct
239227,zoning ordinances to protect your community from the dullest science fiction,very negative,zoning ordinances to protect your community from the dullest science fiction
239228,zzzzzzzzz,very negative,zzzzzzzzz
239229,élan,neutral,élan
239230,É,neutral,É
239231,É um passatempo descompromissado,neutral,É um passatempo descompromissado


Tokenization:


In [35]:
#Regular Expressions

import re

def tokenization(text):
    tokens = re.split('\W+', text)
    return tokens




In [36]:
df_categorical['tokens'] = df_categorical['no_punct'].apply(lambda x: tokenization(x.lower()))

df_categorical.tail()

Unnamed: 0,phrase,sentiment,no_punct,tokens
239227,zoning ordinances to protect your community from the dullest science fiction,very negative,zoning ordinances to protect your community from the dullest science fiction,"[zoning, ordinances, to, protect, your, community, from, the, dullest, science, fiction]"
239228,zzzzzzzzz,very negative,zzzzzzzzz,[zzzzzzzzz]
239229,élan,neutral,élan,[élan]
239230,É,neutral,É,[é]
239231,É um passatempo descompromissado,neutral,É um passatempo descompromissado,"[é, um, passatempo, descompromissado]"


In [40]:
df_categorical.iloc[100000:100100,:]

Unnamed: 0,phrase,sentiment,no_punct,tokens
100001,"credulous , unassuming , subordinate",negative,credulous unassuming subordinate,"[credulous, unassuming, subordinate]"
100002,"credulous , unassuming , subordinate subjects",negative,credulous unassuming subordinate subjects,"[credulous, unassuming, subordinate, subjects]"
100003,"credulous , unassuming , subordinate subjects .",neutral,credulous unassuming subordinate subjects,"[credulous, unassuming, subordinate, subjects, ]"
100004,creed,neutral,creed,[creed]
100005,creep,negative,creep,[creep]
...,...,...,...,...
100096,"crime thriller , quirky character study",neutral,crime thriller quirky character study,"[crime, thriller, quirky, character, study]"
100097,"crime thriller , quirky character study ,",neutral,crime thriller quirky character study,"[crime, thriller, quirky, character, study, ]"
100098,"crime thriller , quirky character study , third-rate romance",neutral,crime thriller quirky character study thirdrate romance,"[crime, thriller, quirky, character, study, thirdrate, romance]"
100099,"crime thriller , quirky character study , third-rate romance and",neutral,crime thriller quirky character study thirdrate romance and,"[crime, thriller, quirky, character, study, thirdrate, romance, and]"


Removing Stopwords

In [43]:
import nltk

stopwords = nltk.corpus.stopwords.words('english')

In [44]:
def removing_stopwords(tokens):
    res = [word for word in tokens if word not in stopwords]
    return res

In [45]:
df_categorical['no_stopwords'] = df_categorical['tokens'].apply(lambda x: removing_stopwords(x))

df_categorical.tail()

Unnamed: 0,phrase,sentiment,no_punct,tokens,no_stopwords
239227,zoning ordinances to protect your community from the dullest science fiction,very negative,zoning ordinances to protect your community from the dullest science fiction,"[zoning, ordinances, to, protect, your, community, from, the, dullest, science, fiction]","[zoning, ordinances, protect, community, dullest, science, fiction]"
239228,zzzzzzzzz,very negative,zzzzzzzzz,[zzzzzzzzz],[zzzzzzzzz]
239229,élan,neutral,élan,[élan],[élan]
239230,É,neutral,É,[é],[é]
239231,É um passatempo descompromissado,neutral,É um passatempo descompromissado,"[é, um, passatempo, descompromissado]","[é, um, passatempo, descompromissado]"


Stemming:

In [46]:
ps = nltk.PorterStemmer()

In [47]:
print(ps.stem('stand'), ps.stem('standing'), ps.stem('stands'))

stand stand stand


In [48]:
# Some tradeoff as well

print(ps.stem('meaning'), ps.stem('meanness'))

mean mean


In [49]:
def stemming(text):
    res = [ps.stem(word) for word in text]
    return res

In [50]:
df_categorical['stemmed'] = df_categorical['no_stopwords'].apply(lambda x: stemming(x))

df_categorical.tail()

Unnamed: 0,phrase,sentiment,no_punct,tokens,no_stopwords,stemmed
239227,zoning ordinances to protect your community from the dullest science fiction,very negative,zoning ordinances to protect your community from the dullest science fiction,"[zoning, ordinances, to, protect, your, community, from, the, dullest, science, fiction]","[zoning, ordinances, protect, community, dullest, science, fiction]","[zone, ordin, protect, commun, dullest, scienc, fiction]"
239228,zzzzzzzzz,very negative,zzzzzzzzz,[zzzzzzzzz],[zzzzzzzzz],[zzzzzzzzz]
239229,élan,neutral,élan,[élan],[élan],[élan]
239230,É,neutral,É,[é],[é],[é]
239231,É um passatempo descompromissado,neutral,É um passatempo descompromissado,"[é, um, passatempo, descompromissado]","[é, um, passatempo, descompromissado]","[é, um, passatempo, descompromissado]"


Lemmatizing:

In [53]:
wn = nltk.WordNetLemmatizer()

print(wn.lemmatize('meanness'), wn.lemmatize('meaning'))
print(wn.lemmatize('foot'), wn.lemmatize('feet'))
print(ps.stem('foot'), ps.stem('feet'))

meanness meaning
foot foot
foot feet


In [54]:
def lemmatizing(text):
    res = [wn.lemmatize(word) for word in text]
    return res

In [55]:
df_categorical['lemmatized'] = df_categorical['no_stopwords'].apply(lambda x: lemmatizing(x))

df_categorical.tail()

Unnamed: 0,phrase,sentiment,no_punct,tokens,no_stopwords,stemmed,lemmatized
239227,zoning ordinances to protect your community from the dullest science fiction,very negative,zoning ordinances to protect your community from the dullest science fiction,"[zoning, ordinances, to, protect, your, community, from, the, dullest, science, fiction]","[zoning, ordinances, protect, community, dullest, science, fiction]","[zone, ordin, protect, commun, dullest, scienc, fiction]","[zoning, ordinance, protect, community, dullest, science, fiction]"
239228,zzzzzzzzz,very negative,zzzzzzzzz,[zzzzzzzzz],[zzzzzzzzz],[zzzzzzzzz],[zzzzzzzzz]
239229,élan,neutral,élan,[élan],[élan],[élan],[élan]
239230,É,neutral,É,[é],[é],[é],[é]
239231,É um passatempo descompromissado,neutral,É um passatempo descompromissado,"[é, um, passatempo, descompromissado]","[é, um, passatempo, descompromissado]","[é, um, passatempo, descompromissado]","[é, um, passatempo, descompromissado]"


Count Vectorization

In [57]:
from sklearn.feature_extraction.text import CountVectorizer

count_vector = CountVectorizer()
X_counts = count_vector.fit_transform(df_categorical['lemmatized'].astype("string"))

print(X_counts.shape)
print(count_vector.get_feature_names())

(239231, 17793)


In [60]:
#Lets do it to a small subset

df_sample = df_categorical[1000:1020]
df_sample.head()

Unnamed: 0,phrase,sentiment,no_punct,tokens,no_stopwords,stemmed,lemmatized
1000,"'re as happy listening to movies as you are watching them , and the slow parade of human frailty...",positive,re as happy listening to movies as you are watching them and the slow parade of human frailty f...,"[re, as, happy, listening, to, movies, as, you, are, watching, them, and, the, slow, parade, of,...","[happy, listening, movies, watching, slow, parade, human, frailty, fascinates]","[happi, listen, movi, watch, slow, parad, human, frailti, fascin]","[happy, listening, movie, watching, slow, parade, human, frailty, fascinates]"
1001,'re at the right film,positive,re at the right film,"[re, at, the, right, film]","[right, film]","[right, film]","[right, film]"
1002,'re at the right film .,positive,re at the right film,"[re, at, the, right, film, ]","[right, film, ]","[right, film, ]","[right, film, ]"
1003,'re back,neutral,re back,"[re, back]",[back],[back],[back]
1004,'re being streamed,neutral,re being streamed,"[re, being, streamed]",[streamed],[stream],[streamed]


In [61]:
count_vector_sample = CountVectorizer()
X_counts_sample = count_vector_sample.fit_transform(df_sample['lemmatized'].astype("string"))

print(X_counts_sample.shape)
print(count_vector_sample.get_feature_names())

(20, 54)
['back', 'better', 'big', 'bored', 'burnt', 'carol', 'christmas', 'clever', 'clueless', 'coming', 'content', 'convinced', 'dead', 'decent', 'definitely', 'director', 'dole', 'famous', 'fascinates', 'film', 'frailty', 'good', 'happy', 'home', 'human', 'inept', 'life', 'listening', 'looking', 'machine', 'magic', 'manages', 'marathon', 'mean', 'might', 'movie', 'nt', 'outing', 'parade', 'piece', 'pseudobio', 'right', 'screen', 'slow', 'spectacular', 'staying', 'streamed', 'time', 'tv', 'vine', 'watching', 'woman', 'wonderful', 'xfiles']


In [63]:
#count_vector_sample is a sparse matrix
print(type(X_counts_sample))

count_vector_sample_df = pd.DataFrame(X_counts_sample.toarray())
count_vector_sample_df.columns = count_vector_sample.get_feature_names()
count_vector_sample_df

<class 'scipy.sparse.csr.csr_matrix'>


Unnamed: 0,back,better,big,bored,burnt,carol,christmas,clever,clueless,coming,...,spectacular,staying,streamed,time,tv,vine,watching,woman,wonderful,xfiles
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
5,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,1
7,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,1
8,0,0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9,0,0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0


Similarly we can do n-gram vectorization.
"My name is Srinath."
2-gram or bigram -- "My name", "name is", "is Srinath"


In [64]:
from sklearn.feature_extraction.text import CountVectorizer

ngram_vector = CountVectorizer(ngram_range=(1,3))
ngram_counts = ngram_vector.fit_transform(df_categorical['lemmatized'].astype("string"))

print(ngram_counts.shape)
print(ngram_vector.get_feature_names())


df_sample = df_categorical[1000:1020]


ngram_vector_sample = CountVectorizer(ngram_range=(1,3))
ngram_counts_sample = ngram_vector_sample.fit_transform(df_sample['lemmatized'].astype("string"))

print(ngram_counts_sample.shape)
print(ngram_vector_sample.get_feature_names())

print(type(ngram_counts_sample))

ngram_vector_sample_df = pd.DataFrame(ngram_counts_sample.toarray())
ngram_vector_sample_df.columns = ngram_vector_sample.get_feature_names()
ngram_vector_sample_df

(239231, 195033)
(20, 137)
['back', 'better', 'better staying', 'better staying home', 'big', 'big screen', 'big screen magic', 'bored', 'bored christmas', 'bored christmas carol', 'burnt', 'burnt wonderful', 'burnt wonderful life', 'carol', 'carol might', 'carol might movie', 'christmas', 'christmas carol', 'christmas carol might', 'clever', 'clever pseudobio', 'clever pseudobio manages', 'clueless', 'clueless inept', 'coming', 'content', 'content clever', 'content clever pseudobio', 'convinced', 'convinced mean', 'convinced mean machine', 'convinced woman', 'convinced woman spectacular', 'dead', 'dead vine', 'decent', 'decent tv', 'decent tv outing', 'definitely', 'definitely convinced', 'definitely convinced woman', 'director', 'director life', 'dole', 'dole piece', 'dole piece famous', 'famous', 'famous director', 'famous director life', 'fascinates', 'film', 'frailty', 'frailty fascinates', 'good', 'good time', 'good time dole', 'happy', 'happy listening', 'happy listening movie',

Unnamed: 0,back,better,better staying,better staying home,big,big screen,big screen magic,bored,bored christmas,bored christmas carol,...,watching,watching slow,watching slow parade,watching xfiles,woman,woman spectacular,wonderful,wonderful life,wonderful life marathon,xfiles
0,0,0,0,0,0,0,0,0,0,0,...,1,1,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,1,1,1,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,1
7,0,1,1,1,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,1
8,0,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,1,1,1,0
9,0,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,1,1,1,0
