In [25]:
#Import pandas
import pandas as pd

# Import the movie review data as a data frame

In [26]:
# For train
df_review = pd.read_csv('labeledTrainData.tsv',sep='\t')

In [27]:
df_review.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [28]:
df_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         25000 non-null  object
 1   sentiment  25000 non-null  int64 
 2   review     25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB


# Convert all text to lowercase letters

In [29]:
# converting all strings to lower case using lambda expressions
df_review=df_review.applymap(lambda st:st.lower() if type(st)==str else st)

In [30]:
df_review.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,with all this stuff going down at the moment w...
1,2381_9,1,"\the classic war of the worlds\"" by timothy hi..."
2,7759_3,0,the film starts with a manager (nicholas bell)...
3,3630_4,0,it must be assumed that those who praised this...
4,9495_8,1,superbly trashy and wondrously unpretentious 8...


# Remove punctuation and special characters from the text.

In [31]:
# importing string
import string

In [32]:
# creating a function to remove punctuations
exclusion=set(string.punctuation)
def remove_punc(s):
    return ''.join(ch for ch in s if ch not in exclusion)

In [33]:
# applying the remove_punc function to the data frame
df_review.review=df_review.review.apply(remove_punc)

In [34]:
df_review.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,with all this stuff going down at the moment w...
1,2381_9,1,the classic war of the worlds by timothy hines...
2,7759_3,0,the film starts with a manager nicholas bell g...
3,3630_4,0,it must be assumed that those who praised this...
4,9495_8,1,superbly trashy and wondrously unpretentious 8...


# Remove stop words.

In [35]:
# importing stop words library
import nltk
from nltk.corpus import stopwords

In [36]:
#removing stop words
sw=set(stopwords.words("english"))
df_review['review']=df_review['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (sw)]))

In [37]:
df_review.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,stuff going moment mj ive started listening mu...
1,2381_9,1,classic war worlds timothy hines entertaining ...
2,7759_3,0,film starts manager nicholas bell giving welco...
3,3630_4,0,must assumed praised film greatest filmed oper...
4,9495_8,1,superbly trashy wondrously unpretentious 80s e...


# Apply NLTK’s PorterStemmer

In [59]:
#importing PorterStemmer
from nltk import PorterStemmer

In [60]:
# apply steaming 
stemmer=PorterStemmer()
df_review['review']=df_review['review'].str.split()
df_review['review']=df_review['review'].apply(lambda x:' '.join([stemmer.stem(y) for y in x]))

In [61]:
df_review.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,stuff go moment mj ive start listen music watc...
1,2381_9,1,classic war world timothi hine entertain film ...
2,7759_3,0,film start manag nichola bell give welcom inve...
3,3630_4,0,must assum prai film greatest film opera ever ...
4,9495_8,1,superbl trashi wondrou unpretenti 80 exploit h...


# Create a bag-of-words matrix from your stemmed text (output from (4)), where each row is a word-count vector for a single movie review (see sections 5.3 & 6.8 in the Machine Learning with Python Cookbook). Display the dimensions of your bag-of-words matrix. The number of rows in this matrix should be the same as the number of rows in your original data frame.

In [62]:
#Convert each text entry into a word-count vector

# importing count vectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [63]:
# Converting to word count vector
count_vector=CountVectorizer()
bag_of_words=count_vector.fit_transform(df_review['review'])

In [44]:
bag_of_words

<25000x92379 sparse matrix of type '<class 'numpy.int64'>'
	with 2439461 stored elements in Compressed Sparse Row format>

# Convert each text entry into a part-of-speech tag vector

In [45]:
# importing pos and word tokenize
from nltk import pos_tag_sents
from nltk import word_tokenize

In [46]:
# Converting to POS
pos=pos_tag_sents(df_review['review'].apply(word_tokenize).tolist())

In [47]:
pos

[[('stuff', 'NN'),
  ('go', 'VB'),
  ('moment', 'NN'),
  ('mj', 'RB'),
  ('ive', 'JJ'),
  ('start', 'NN'),
  ('listen', 'VB'),
  ('music', 'NN'),
  ('watch', 'NN'),
  ('odd', 'JJ'),
  ('documentari', 'NN'),
  ('watch', 'NN'),
  ('wiz', 'NN'),
  ('watch', 'NN'),
  ('moonwalk', 'NN'),
  ('mayb', 'NN'),
  ('want', 'VBP'),
  ('get', 'NN'),
  ('certain', 'JJ'),
  ('insight', 'JJ'),
  ('guy', 'NN'),
  ('thought', 'VBD'),
  ('realli', 'JJ'),
  ('cool', 'JJ'),
  ('eighti', 'NN'),
  ('mayb', 'NNS'),
  ('make', 'VBP'),
  ('mind', 'NN'),
  ('whether', 'IN'),
  ('guilti', 'NN'),
  ('innoc', 'NN'),
  ('moonwalk', 'VBP'),
  ('part', 'NN'),
  ('biographi', 'VBP'),
  ('part', 'NN'),
  ('featur', 'NN'),
  ('film', 'NN'),
  ('rememb', 'NN'),
  ('go', 'VBP'),
  ('see', 'VB'),
  ('cinema', 'JJ'),
  ('origin', 'VB'),
  ('releas', 'NNS'),
  ('subtl', 'VBP'),
  ('messag', 'JJ'),
  ('mj', 'NN'),
  ('feel', 'VB'),
  ('toward', 'IN'),
  ('press', 'NN'),
  ('also', 'RB'),
  ('obviou', 'JJ'),
  ('messag', 'JJ'),


Answer: Observation is that original dimensions are 25000x92379 sparse matrix and bag of words matrix is same

# Create a term frequency-inverse document frequency (tf-idf) matrix from your stemmed text, for your movie reviews (see section 6.9 in the Machine Learning with Python Cookbook). Display the dimensions of your tf-idf matrix. These dimensions should be the same as your bag-of-words matrix.

In [48]:
# importing tfidf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [49]:
# converting to tfidf vector
tfidf=TfidfVectorizer()
feature_matrix=tfidf.fit_transform(df_review['review'])

In [50]:
print(feature_matrix)

  (0, 46146)	0.043925817139175274
  (0, 38881)	0.02803776628213118
  (0, 47029)	0.06132168727761918
  (0, 73810)	0.07381713686385905
  (0, 78343)	0.031544243615218015
  (0, 28180)	0.03251098143757476
  (0, 25841)	0.030752537967515932
  (0, 28403)	0.025265324095914823
  (0, 23898)	0.041251802376497324
  (0, 16800)	0.03164810893119639
  (0, 9271)	0.03363003768604581
  (0, 22567)	0.027496514339387632
  (0, 23842)	0.019393650135459115
  (0, 78479)	0.08512803374054681
  (0, 32872)	0.03354899867980853
  (0, 7118)	0.0358937786506478
  (0, 88955)	0.03725059838605022
  (0, 62437)	0.04354269100937043
  (0, 34589)	0.04296481505513276
  (0, 27406)	0.021832342402494355
  (0, 80009)	0.031262479172306186
  (0, 84018)	0.03097406526073594
  (0, 33579)	0.026585707815177818
  (0, 12683)	0.04501531144437924
  (0, 9702)	0.08181980847935362
  :	:
  (24999, 86221)	0.10301941978159103
  (24999, 55600)	0.0618161924909372
  (24999, 36499)	0.06664940747638765
  (24999, 53871)	0.1101285559823028
  (24999, 13870)	

In [58]:
feature_matrix.describe()

AttributeError: describe not found

Answer: Observation is that bag of words matrix dimensions are 25000x92379 sparse matrix and feature_matrix which is tfidf vector is same