###### loading the dataset

In [17]:
import pandas as pd

# Load the dataset (assuming it's in CSV format)
df = pd.read_csv('IMDB Dataset.csv')

# Display the first few rows of the dataset
print(df.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


###### Text Cleaning

In [18]:
print(df.columns)
print(df.shape)
print(df.isnull().sum())
print(df.dtypes)


Index(['review', 'sentiment'], dtype='object')
(50000, 2)
review       0
sentiment    0
dtype: int64
review       object
sentiment    object
dtype: object


In [27]:
import re

# Function to clean text
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    return text

# Apply the clean_text function to the review column
df['cleaned_review'] = df['review'].apply(clean_text)

# Display the first few cleaned reviews
print(df[['review', 'cleaned_review']].head())


                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                      cleaned_review  
0  one of the other reviewers has mentioned that ...  
1  a wonderful little production the filming tech...  
2  i thought this was a wonderful way to spend ti...  
3  basically theres a family where a little boy j...  
4  petter matteis love in the time of money is a ...  


##### tokenizing the data

In [28]:
from nltk.tokenize import word_tokenize
import nltk

In [29]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prave\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [30]:
df['tokenized_review'] = df['cleaned_review'].apply(lambda x: word_tokenize(str(x)))

In [31]:
print(df[['cleaned_review', 'tokenized_review']].head())

                                      cleaned_review  \
0  one of the other reviewers has mentioned that ...   
1  a wonderful little production the filming tech...   
2  i thought this was a wonderful way to spend ti...   
3  basically theres a family where a little boy j...   
4  petter matteis love in the time of money is a ...   

                                    tokenized_review  
0  [one, of, the, other, reviewers, has, mentione...  
1  [a, wonderful, little, production, the, filmin...  
2  [i, thought, this, was, a, wonderful, way, to,...  
3  [basically, theres, a, family, where, a, littl...  
4  [petter, matteis, love, in, the, time, of, mon...  


In [32]:
from nltk.corpus import stopwords

In [33]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prave\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [34]:
stop_words = set(stopwords.words('english'))

In [35]:
def remove_stopwords(tokens):
    return[word for word in tokens if word not in stop_words]

In [36]:
df['cleaned_tokens'] = df['tokenized_review'].apply(remove_stopwords)

In [37]:
print(df[['tokenized_review', 'cleaned_tokens']].head())

                                    tokenized_review  \
0  [one, of, the, other, reviewers, has, mentione...   
1  [a, wonderful, little, production, the, filmin...   
2  [i, thought, this, was, a, wonderful, way, to,...   
3  [basically, theres, a, family, where, a, littl...   
4  [petter, matteis, love, in, the, time, of, mon...   

                                      cleaned_tokens  
0  [one, reviewers, mentioned, watching, oz, epis...  
1  [wonderful, little, production, filming, techn...  
2  [thought, wonderful, way, spend, time, hot, su...  
3  [basically, theres, family, little, boy, jake,...  
4  [petter, matteis, love, time, money, visually,...  


In [38]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Download the lemmatizer data from NLTK
nltk.download('wordnet')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to get the part of speech for lemmatization
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# Function to apply lemmatization
def lemmatize_tokens(tokens):
    pos_tags = nltk.pos_tag(tokens)
    return [lemmatizer.lemmatize(token, get_wordnet_pos(pos_tag)) for token, pos_tag in pos_tags]

# Apply the function to lemmatize tokens
df['lemmatized_tokens'] = df['cleaned_tokens'].apply(lemmatize_tokens)

# Display the first few lemmatized tokens
print(df[['cleaned_tokens', 'lemmatized_tokens']].head())


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\prave\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                      cleaned_tokens  \
0  [one, reviewers, mentioned, watching, oz, epis...   
1  [wonderful, little, production, filming, techn...   
2  [thought, wonderful, way, spend, time, hot, su...   
3  [basically, theres, family, little, boy, jake,...   
4  [petter, matteis, love, time, money, visually,...   

                                   lemmatized_tokens  
0  [one, reviewer, mention, watch, oz, episode, y...  
1  [wonderful, little, production, film, techniqu...  
2  [think, wonderful, way, spend, time, hot, summ...  
3  [basically, there, family, little, boy, jake, ...  
4  [petter, matteis, love, time, money, visually,...  


In [40]:
df.columns

Index(['review', 'sentiment', 'cleaned_review', 'tokenized_review',
       'cleaned_tokens', 'lemmatized_tokens'],
      dtype='object')

In [41]:
# Combine lemmatized tokens into a single string per document
df['lemmatized_text'] = df['lemmatized_tokens'].apply(lambda tokens: ' '.join(tokens))

# Initialize TF-IDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the lemmatized text
X = tfidf_vectorizer.fit_transform(df['lemmatized_text'])

# Display the shape of the TF-IDF matrix
print(X.shape)


(50000, 203322)


In [42]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Ensure 'sentiment' is numerical (if it's not already)
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Split data into features and labels
X = tfidf_vectorizer.transform(df['lemmatized_text'])
y = df['sentiment']

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Initialize and train the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Display performance metrics
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.90      0.87      0.89      5044
           1       0.87      0.91      0.89      4956

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [43]:
import pickle

# Save the model and vectorizer
with open('sentiment_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)
