In [2]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'imdb-dataset-of-50k-movie-reviews:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F134715%2F320111%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240728%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240728T061921Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D35d317a559e1b4a9f2c677e1072bbf32c8245b7e8fe7f026b3c214217a8ff716c4956c463e390c86f34bf08b28854c719b8bf667499d15d78a0e5a43425068d067283b2427fbc2bf7ff6b1bcabf6b95300c32fa7ac58c54359aeac383b6d0a15730de8a04d0c9438fa2119f8ac25e5214b631fddd65f476ac43df4c35d64b7c67b7e836b52817c2f783b44599576ad7827e268df8a1680e91fbcf69578a1a11609cdd55b5fc85a4109e7f6c0a358aba4f04dbe75feab8ae870f8b1f7438382bee3bb8e73be7a480711ba5271aec2a83755b885ff653aac5092085af79569a03571c2954b9bdad709837e770a793cdb58cb5351570e0ce8d19d56aa7e74aaf4bb,glove6b100dtxt:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F715814%2F1246668%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240728%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240728T061921Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D599480ea062c3d7ac6d3c2f051c23b9b69f564740bac5655850d47097d33b5c7916da7532eba008e1b7dc19efb99f5aa824ddd657dc50168c63e4045c300192bbacdb49ae1b42d5f6e703d8f61124e1c381b14ddf36061b2b5134561a21981341c8e383707a0041daaf31e59861fb2c4d2a5d80c087eee5e401816136cef323ad9eca9aa1b299dc90837f26a29258ce4520914eccf8d2d95df7db91bc24eea57c1e3ea78ba1f182f659d2611f19c5b8f42f40296f5fa9ec8384fa5376c85b1a7d76ce8719d973fdcb1d16ef85d8d4e8b309b5a4a8e8aa605d2d76e3b6351842bcbaeaf656c5029ff8d31354c73079824bd3c1ff6c4eaaddd7c8fd05cdbb09d2c'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading imdb-dataset-of-50k-movie-reviews, 26962657 bytes compressed
Downloaded and uncompressed: imdb-dataset-of-50k-movie-reviews
Downloading glove6b100dtxt, 137847651 bytes compressed
Downloaded and uncompressed: glove6b100dtxt
Data source import complete.


In [3]:
import IPython
import sys
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [4]:
df=pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
X=df['review'].to_list()
Y=df['sentiment'].to_list()

In [5]:
df=df.iloc[:20000]

In [6]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
df['review'][1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [8]:
# X = df.iloc[:,0:1]
# y = df['sentiment']

In [9]:
df['sentiment'].value_counts()

sentiment
negative    10097
positive     9903
Name: count, dtype: int64

In [10]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [11]:
df.duplicated().sum()

74

In [12]:
df.drop_duplicates(inplace=True)

In [13]:
df.duplicated().sum()

0

In [14]:
import re
def remove_tags(raw_text):
    cleaned_text = re.sub(re.compile('<.*?>'), '', raw_text)
    return cleaned_text

In [15]:
df['review'] = df['review'].apply(remove_tags)

In [16]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [17]:
df['review'] = df['review'].apply(lambda x:x.lower())

In [18]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [19]:

import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [20]:
def remove_stopwords(text):
    new_text = []

    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

In [21]:
df['review'] = df['review'].apply(remove_stopwords)

In [22]:
from textblob import TextBlob
from multiprocessing import Pool, cpu_count

In [23]:

# # Function to correct spelling using TextBlob
# def correct_spelling(text):
#     return str(TextBlob(text).correct())

# # Function to apply the spelling correction in parallel
# def parallel_apply(df, func, num_cores=None):
#     if num_cores is None:
#         num_cores = cpu_count()

#     df_split = np.array_split(df, num_cores)
#     pool = Pool(num_cores)
#     df = pd.concat(pool.map(func, df_split))
#     pool.close()
#     pool.join()
#     return df

# # Function to apply correct_spelling to a DataFrame
# def correct_spelling_df(df):
#     df['review'] = df['review'].apply(correct_spelling)
#     return df

# # Apply the spelling correction in parallel
# df = parallel_apply(df, correct_spelling_df)

# # Display the DataFrame with corrected reviews
# print(df)

In [24]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [25]:
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Initialize the stemmer
stemmer = PorterStemmer()

# Stemming function using NLTK
def stem_text(text):
    tokens = word_tokenize(text)
    return [stemmer.stem(token) for token in tokens]

df['review'] = df['review'].apply(stem_text)

In [26]:
df.head()

Unnamed: 0,review,sentiment
0,"[one, review, mention, watch, 1, oz, episod, h...",positive
1,"[wonder, littl, product, ., film, techniqu, un...",positive
2,"[thought, wonder, way, spend, time, hot, summe...",positive
3,"[basic, there, 's, famili, littl, boy, (, jake...",negative
4,"[petter, mattei, 's, ``, love, time, money, ''...",positive


In [27]:
X = df.iloc[:,0:1]
y = df['sentiment']

In [28]:
X

Unnamed: 0,review
0,"[one, review, mention, watch, 1, oz, episod, h..."
1,"[wonder, littl, product, ., film, techniqu, un..."
2,"[thought, wonder, way, spend, time, hot, summe..."
3,"[basic, there, 's, famili, littl, boy, (, jake..."
4,"[petter, mattei, 's, ``, love, time, money, ''..."
...,...
19995,"[ok., starter, ,, taxi, driver, amaz, ., thi, ..."
19996,"[sort, hard, say, it, ,, greatli, enjoy, ``, t..."
19997,"[still, like, though, ., warren, beatti, fair,..."
19998,"[could, still, use, black, adder, even, today,..."


In [29]:
y

0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
19995    negative
19996    negative
19997    positive
19998    positive
19999    negative
Name: sentiment, Length: 19926, dtype: object

In [30]:
from sklearn.preprocessing import LabelEncoder

In [31]:
encoder=LabelEncoder()
y=encoder.fit_transform(y)

In [32]:
print(y)
print(type(y))

[1 1 1 ... 1 1 0]
<class 'numpy.ndarray'>


In [33]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2, random_state=42,stratify=y)

In [34]:
x_train.shape

(15940, 1)

In [35]:
x_test.shape

(3986, 1)

In [36]:
print(x_test)

                                                  review
17913  [famili, (, and, entir, sold, sneak, preview, ...
16292  [film, one, resound, familiar, it, ., earthi, ...
13853  [jack, palanc, ,, (, john, wit, ), ,, usual, b...
18276  [talkshow, spike, feresten, one, show, definit...
16205  [alway, look, forward, movi, tv, ., get, dvd, ...
...                                                  ...
13163  [movi, predict, end, use, to, ,, great, see, m...
5313   [movi, confus, religi, ethic, ideal, much, fai...
12166  [robert, jordan, televis, star, ., robert, jor...
12570  [great, movi, saw, it, ., say, one, favorit, m...
12137  [entertainingli, tacky'n'trashi, distaff, ``, ...

[3986 rows x 1 columns]


In [37]:
# Applying BOw= Bag of Words
from sklearn.feature_extraction.text import CountVectorizer

In [38]:
cv=CountVectorizer()

In [39]:
# Assuming x_train['review'] contains lists of words, join them into strings
x_train['review'] = x_train['review'].apply(lambda words: ' '.join(words))
x_test['review'] = x_test['review'].apply(lambda words: ' '.join(words))

# Now apply CountVectorizer
x_train_bow = cv.fit_transform(x_train['review'])
x_test_bow = cv.transform(x_test['review'])

In [40]:
print(x_train_bow.shape)
print(x_test_bow.shape)
print(type(x_train_bow))
print(type(x_test_bow))

(15940, 49525)
(3986, 49525)
<class 'scipy.sparse._csr.csr_matrix'>
<class 'scipy.sparse._csr.csr_matrix'>


In [41]:
print(x_test_bow)

  (0, 1235)	1
  (0, 1256)	2
  (0, 1606)	1
  (0, 2344)	1
  (0, 3213)	1
  (0, 3472)	1
  (0, 4125)	1
  (0, 4631)	1
  (0, 4737)	1
  (0, 6147)	1
  (0, 6943)	1
  (0, 8045)	1
  (0, 9044)	3
  (0, 9541)	1
  (0, 9841)	2
  (0, 10065)	1
  (0, 10090)	1
  (0, 10342)	1
  (0, 11219)	1
  (0, 11421)	1
  (0, 11497)	1
  (0, 11599)	1
  (0, 12180)	1
  (0, 13795)	1
  (0, 14326)	1
  :	:
  (3985, 42714)	1
  (3985, 43319)	1
  (3985, 43403)	1
  (3985, 43620)	2
  (3985, 43631)	1
  (3985, 44387)	1
  (3985, 44501)	1
  (3985, 44688)	1
  (3985, 44718)	1
  (3985, 44825)	1
  (3985, 45013)	1
  (3985, 45892)	1
  (3985, 46342)	1
  (3985, 46537)	1
  (3985, 47001)	1
  (3985, 47046)	1
  (3985, 47068)	1
  (3985, 47288)	1
  (3985, 47323)	1
  (3985, 47899)	1
  (3985, 48191)	1
  (3985, 48388)	1
  (3985, 48438)	1
  (3985, 48551)	1
  (3985, 48744)	1


In [42]:
from sklearn.naive_bayes import GaussianNB

In [43]:
gnb=GaussianNB()
gnb.fit(x_train_bow.toarray(),y_train)

In [44]:
y_pred = gnb.predict(x_test_bow.toarray())

In [45]:
from sklearn.metrics import accuracy_score,confusion_matrix
accuracy_score(y_test,y_pred)

0.6379829402910185

In [46]:
#  Model preformance measure
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.61      0.80      0.69      2010
           1       0.70      0.47      0.56      1976

    accuracy                           0.64      3986
   macro avg       0.65      0.64      0.63      3986
weighted avg       0.65      0.64      0.63      3986



In [47]:
# # Train and Test Accuracy
# print('Test Accuracy:',accuracy_score(y_test,y_pred))
# print('Training Accuracy:',accuracy_score(y_train,gnb.predict(x_train_bow.toarray())))

In [50]:
# # See Confusion Matrix
# import scikitplot as skplt
# skplt.metrics.plot_confusion_matrix(y_test,y_pred)

In [53]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

rf.fit(x_train_bow.toarray(),y_train)
y_pred = rf.predict(x_test_bow)
accuracy_score(y_test,y_pred)

0.8442047165077772

In [61]:
cv = CountVectorizer(max_features=3000)

x_train_bow = cv.fit_transform(x_train['review']).toarray()
x_test_bow = cv.transform(x_test['review']).toarray()

rf = RandomForestClassifier()

rf.fit(x_train_bow,y_train)
y_pred = rf.predict(x_test_bow)
accuracy_score(y_test,y_pred)

0.8258906171600602

In [64]:
cv = CountVectorizer(ngram_range=(3,3),max_features=3000)

x_train_bow = cv.fit_transform(x_train['review']).toarray()
x_test_bow = cv.transform(x_test['review']).toarray()

rf = RandomForestClassifier()

rf.fit(x_train_bow,y_train)
y_pred = rf.predict(x_test_bow)
accuracy_score(y_test,y_pred)

0.6394882087305569

# Using TfIdf

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [66]:
tfidf = TfidfVectorizer()

In [67]:
x_train_tf = tfidf.fit_transform(x_train['review']).toarray()
x_test_tf = tfidf.transform(x_test['review']).toarray()

In [68]:
rf = RandomForestClassifier()

rf.fit(x_train_tf,y_train)
y_pred = rf.predict(x_test_tf)

accuracy_score(y_test,y_pred)

0.8411941796287005

In [74]:
#  Model preformance measure
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.83      0.86      0.84      2010
           1       0.85      0.83      0.84      1976

    accuracy                           0.84      3986
   macro avg       0.84      0.84      0.84      3986
weighted avg       0.84      0.84      0.84      3986

