In [1]:
import pandas as pd
import nltk
import warnings
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection
from sklearn import linear_model

In [2]:
SEED = 57
nltk.download('stopwords')
nltk.download('punkt')
eng_stopwords = set(stopwords.words('english'))
DATASET_PATH = '~/Documents/ML-DL/imdb_review_classification/IMDB Dataset.csv'
stemmer = nltk.stem.PorterStemmer()
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /Users/goava/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/goava/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
df_tv = pd.read_csv(DATASET_PATH)
df_tv.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df_tv.sentiment = df_tv.sentiment.map({'positive': 1, 'negative': 0})
df_tv.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [5]:
df_tv.isna().sum()

review       0
sentiment    0
dtype: int64

In [6]:
df_tv.sentiment.value_counts()

1    25000
0    25000
Name: sentiment, dtype: int64

In [7]:
def tokenize(text):
  text = text.lower()
  tokens = [stemmer.stem(w) for w in nltk.word_tokenize(text) if w.isalpha() and w not in eng_stopwords]
  return tokens

In [8]:
df_tr, df_vl = model_selection.train_test_split(df_tv, test_size=0.2, random_state=SEED, stratify=df_tv.sentiment, shuffle=True)
df_tr.shape, df_vl.shape

((40000, 2), (10000, 2))

In [9]:
vectorizer = TfidfVectorizer(tokenizer=tokenize, max_df=0.9, ngram_range=(1,2), max_features=2000)

In [10]:
vectorizer.fit(df_tr.review)

In [11]:
x_train = vectorizer.transform(df_tr.review)
x_valid = vectorizer.transform(df_vl.review)

In [12]:
x_train = x_train.toarray()
x_valid = x_valid.toarray()

In [13]:
type(x_train), x_train.shape

(numpy.ndarray, (40000, 2000))

In [15]:
df_tr[vectorizer.get_feature_names_out() + '_tfidf'] = x_train
df_vl[vectorizer.get_feature_names_out() + '_tfidf'] = x_valid

In [17]:
df_tr.head()

Unnamed: 0,review,sentiment,abandon_tfidf,abil_tfidf,abl_tfidf,absolut_tfidf,absurd_tfidf,abus_tfidf,academi_tfidf,accent_tfidf,...,year later_tfidf,year old_tfidf,yet_tfidf,york_tfidf,young_tfidf,young man_tfidf,younger_tfidf,youth_tfidf,zero_tfidf,zombi_tfidf
24386,"Everyone knows the so-called plot, so let me c...",0,0.0,0.0,0.0,0.107303,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4560,The cookie-cutter gets to work overtime in thi...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.205185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3702,"Hi guys, this is my first review and I would h...",0,0.0,0.153488,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.143364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24044,"It's some years since I've seen this movie, so...",1,0.0,0.0,0.0,0.122882,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11517,I have to admit that I approached this movie w...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
df_tr.drop(columns=['review']).to_csv('train.csv', index=False)
df_vl.drop(columns=['review']).to_csv('valid.csv', index=False)