In [1]:
import numpy as np
import pandas as pd

In [2]:
temp_df = pd.read_csv(r'D:\CODING\PYTHON\NLP\IMDB Dataset_1.csv')

In [3]:
# Limit the DataFrame to the first 10,000 rows
# This is useful for testing or when working with large datasets

df = temp_df.iloc[:10000]

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df['review'][1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [6]:
df['sentiment'].value_counts()

sentiment
positive    5028
negative    4972
Name: count, dtype: int64

In [7]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [8]:
df.duplicated().sum()

17

In [9]:
df.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(inplace=True)


In [10]:
df.duplicated().sum()

0

In [11]:
# Basic Preprocessing
# Remove tags
# lowercase
# remove stopwords

In [12]:
# Import necessary libraries
# Remove HTML tags
import re
def remove_tags(raw_text):
    cleaned_text = re.sub(re.compile('<.*?>'), '', raw_text)
    return cleaned_text

In [13]:
df['review'] = df['review'].apply(remove_tags)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(remove_tags)


In [14]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
9995,"Fun, entertaining movie about WWII German spy ...",positive
9996,Give me a break. How can anyone say that this ...,negative
9997,This movie is a bad movie. But after watching ...,negative
9998,This is a movie that was probably made to ente...,negative


In [15]:
# Lowercase the text

df['review'] = df['review'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(lambda x:x.lower())


In [16]:
# Remove stopwords

from nltk.corpus import stopwords

sw_list = stopwords.words('english')

df['review'] = df['review'].apply(lambda x: [item for item in x.split() if item not in sw_list]).apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(lambda x: [item for item in x.split() if item not in sw_list]).apply(lambda x:" ".join(x))


In [17]:
df

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production. filming technique...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there's family little boy (jake) thi...,negative
4,"petter mattei's ""love time money"" visually stu...",positive
...,...,...
9995,"fun, entertaining movie wwii german spy (julie...",positive
9996,"give break. anyone say ""good hockey movie""? kn...",negative
9997,movie bad movie. watching endless series bad h...,negative
9998,"movie probably made entertain middle school, e...",negative


In [18]:
# Feature Extraction

X = df.iloc[:,0:1]
y = df['sentiment']

In [19]:
X

Unnamed: 0,review
0,one reviewers mentioned watching 1 oz episode ...
1,wonderful little production. filming technique...
2,thought wonderful way spend time hot summer we...
3,basically there's family little boy (jake) thi...
4,"petter mattei's ""love time money"" visually stu..."
...,...
9995,"fun, entertaining movie wwii german spy (julie..."
9996,"give break. anyone say ""good hockey movie""? kn..."
9997,movie bad movie. watching endless series bad h...
9998,"movie probably made entertain middle school, e..."


In [20]:
y

0       positive
1       positive
2       positive
3       negative
4       positive
          ...   
9995    positive
9996    negative
9997    negative
9998    negative
9999    positive
Name: sentiment, Length: 9983, dtype: object

In [21]:
# Convert labels to numerical values
# This is necessary for training machine learning models


from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

y = encoder.fit_transform(y)

In [22]:
y

array([1, 1, 1, ..., 0, 0, 1])

In [23]:
# Split the dataset into training and testing sets
# This is important to evaluate the model's performance on unseen data

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [24]:
X_train.shape

(7986, 1)

In [25]:
# Applying TF-IDF

# Applying BoW

from sklearn.feature_extraction.text import CountVectorizer

In [26]:
cv = CountVectorizer()

In [27]:
# Transform the training and testing data
# This converts the text data into a matrix of token counts
# The fit_transform method is used on the training data, and transform is used on the test
# This ensures that the same vocabulary is used for both training and testing sets
# This is essential for consistent feature extraction

# The toarray() method converts the sparse matrix to a dense format
# This is useful for algorithms that require dense input


X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

In [28]:
# Check the shape of the training data
# This will give the number of samples and features in the training set

X_train_bow.shape

(7986, 48282)

In [29]:
# Applying Naive Bayes Classifier
# This is a simple and effective algorithm for text classification tasks
# It assumes that the features are independent given the class label
# This is a common assumption in Naive Bayes classifiers, which makes them efficient for text classification tasks


from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

gnb.fit(X_train_bow,y_train)

0,1,2
,priors,
,var_smoothing,1e-09


In [30]:
# Make predictions on the test set
# This will give the predicted labels for the test data
# This is essential for evaluating the model's performance on unseen data


y_pred = gnb.predict(X_test_bow)

from sklearn.metrics import accuracy_score,confusion_matrix
accuracy_score(y_test,y_pred)

0.6324486730095142

In [31]:
# Confusion Matrix
# This is a useful tool for evaluating the performance of classification models
# It shows the number of correct and incorrect predictions for each class
# This is essential for understanding the model's performance and identifying areas for improvement


confusion_matrix(y_test,y_pred)

array([[717, 235],
       [499, 546]], dtype=int64)

In [32]:
# Applying Random Forest Classifier
# This is an ensemble learning method that combines multiple decision trees
# It is effective for both classification and regression tasks
# This is a common choice for text classification tasks due to its robustness and ability to handle high-dimensional data


from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test,y_pred)

0.85778668002003

In [33]:
# Applying Bag of Words (BoW) with CountVectorizer
# This is a common technique for text feature extraction
# It converts the text data into a matrix of token counts
# This is useful for machine learning algorithms that require numerical input
# from sklearn.feature_extraction.text import CountVectorizer


cv = CountVectorizer(max_features=3000)

X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

rf = RandomForestClassifier()

rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test,y_pred)

0.842764146219329

In [34]:
# Applying Bag of Words (BoW) with n-grams
# This is a common technique for text feature extraction
# It converts the text data into a matrix of token counts with n-grams
# This is useful for capturing context and relationships between words
# max_features=5000 


cv = CountVectorizer(ngram_range=(1,2),max_features=5000)

X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

rf = RandomForestClassifier()

rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test,y_pred)

0.8402603905858789

## Using TfIdf

In [35]:


from sklearn.feature_extraction.text import TfidfVectorizer

In [36]:
tfidf = TfidfVectorizer()

In [37]:
X_train_tfidf = tfidf.fit_transform(X_train['review']).toarray()
X_test_tfidf = tfidf.transform(X_test['review'])

In [38]:
rf = RandomForestClassifier()

rf.fit(X_train_tfidf,y_train)
y_pred = rf.predict(X_test_tfidf)

accuracy_score(y_test,y_pred)

0.8462694041061593

In [39]:
import gensim

In [40]:
from gensim.models import Word2Vec,KeyedVectors

In [41]:
model = KeyedVectors.load_word2vec_format(r'D:\CODING\PYTHON\NLP\GoogleNews-vectors-negative300.bin',binary=True,limit=500000)

In [42]:
model['cricket'].shape

(300,)

In [43]:
from nltk.corpus import stopwords

sw_list = stopwords.words('english')

In [44]:
sw_list

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [45]:
# Remove stopwords

X_train = X_train['review'].apply(lambda x: [item for item in x.split() if item not in sw_list]).apply(lambda x:" ".join(x))
# Remove stopwords

X_test = X_test['review'].apply(lambda x: [item for item in x.split() if item not in sw_list]).apply(lambda x:" ".join(x))

In [46]:
import spacy
import en_core_web_sm
# Load the spacy model. This takes a few seconds.
nlp = en_core_web_sm.load()
# Process a sentence using the model
doc = nlp(X_train.values[0])
print(doc.vector)

[-0.11225057 -0.3323734  -0.09506241 -0.08899848  0.03331648  0.3578626
  0.24375589  0.25814155  0.20693804 -0.20502327  0.23241748 -0.1819785
 -0.1063184  -0.06482078 -0.10000932  0.20662282 -0.06277418 -0.13001649
  0.25962377 -0.2501759  -0.25346     0.42603675 -0.00766025  0.09086803
  0.07657377  0.09504889  0.14710668  0.2522116   0.16764915  0.17736492
  0.16027842 -0.02066578 -0.06542832  0.00271588 -0.22537896 -0.10483894
 -0.06766807  0.08306239 -0.20810121 -0.1844751  -0.11958811  0.1993304
 -0.17356977  0.06940874  0.11705765  0.21183582 -0.05769423  0.0261329
  0.12440711 -0.0382685  -0.13124184  0.09719379  0.00196476 -0.0585474
 -0.05670151 -0.05204291  0.03033609  0.01203046 -0.06344566  0.07659181
  0.02831528  0.01227162 -0.03398964 -0.33248156 -0.04068003  0.05831385
  0.14982454 -0.08452148 -0.05041637 -0.1549679   0.20063049  0.12990746
 -0.02160156 -0.22049788  0.18294705 -0.00647213 -0.1850161  -0.19923837
  0.08180607 -0.34828505 -0.10278995 -0.02946046 -0.2178

In [47]:
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.5.tar.gz

Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.5.tar.gz


ERROR: Could not install packages due to an OSError: HTTPSConnectionPool(host='github.com', port=443): Max retries exceeded with url: /explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.5.tar.gz (Caused by NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x000002B8D4544210>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))



In [None]:
# !python -m spacy download en_core_web_sm

Traceback (most recent call last):
  File "d:\CODING\PYTHON\PYTHON---3.11\Lib\site-packages\urllib3\connection.py", line 198, in _new_conn
    sock = connection.create_connection(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\CODING\PYTHON\PYTHON---3.11\Lib\site-packages\urllib3\util\connection.py", line 60, in create_connection
    for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\sinha\AppData\Local\Programs\Python\Python311\Lib\socket.py", line 962, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
socket.gaierror: [Errno 11001] getaddrinfo failed

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "d:\CODING\PYTHON\PYTHON---3.11\Lib\site-packages\urllib3\connectionpool.py", line 787, in urlop

In [49]:
input_arr = []
for item in X_train.values:
    doc = nlp(item)
    input_arr.append(doc.vector)

In [50]:
input_arr = np.array(input_arr)

In [51]:
input_arr.shape

(7986, 96)

In [52]:
input_test_arr = []
for item in X_test.values:
    doc = nlp(item)
    input_test_arr.append(doc.vector)

In [53]:
input_test_arr = np.array(input_test_arr)

In [54]:
input_test_arr.shape

(1997, 96)

In [55]:
from sklearn.naive_bayes import GaussianNB

In [56]:
gnb = GaussianNB()
gnb.fit(input_arr,y_train)

0,1,2
,priors,
,var_smoothing,1e-09


In [57]:
y_pred = gnb.predict(input_test_arr)
accuracy_score(y_test,y_pred)

0.6069103655483225