In [None]:
import pandas as pd 
import numpy as np 
import opendatasets as od 
import matplotlib.pyplot as plt 
import seaborn as sns

# text preprocessing 
import re 
import nltk 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import warnings

# Fetch the NLTK resources this notebook relies on (tokenizer models,
# stop-word list, WordNet data).
for resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(resource)

# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")

: 

In [None]:
# od.download('https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews')

In [None]:
# Load the IMDB movie-review CSV into a pandas DataFrame.
# NOTE(review): the relative path presumably matches the folder created by the
# commented-out od.download call above -- confirm the file exists before running.
df = pd.read_csv('imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')

In [None]:
# Display one randomly chosen row as a quick look at the loaded data.
df.sample()

In [None]:
# Print the (rows, columns) shape of the freshly loaded dataset.
print("Shape of the dataset:", df.shape)

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Report how many rows are flagged as duplicates.
print("Number of duplicate rows:", df.duplicated().sum())

In [None]:
# Discard duplicate rows; rebind `df` to the de-duplicated frame rather than
# mutating it in place.
df = df.drop_duplicates()

In [None]:
# Print the shape again to confirm the effect of dropping duplicates.
print("New shape of the dataset after dropping duplicates:", df.shape)

In [None]:
# Count how many reviews carry each sentiment label (class-balance check).
print(df.sentiment.value_counts())

### The data seems to be balanced

In [None]:
# remove punctuation and special characters and emojis
def remove_punct(text):
    """Strip markup, punctuation, digits and non-ASCII characters from a string.

    The cleaning steps run in this order:

    1. HTML-style tags such as ``<br />`` are replaced by a space, so the
       words on either side of a tag stay separate.
    2. Punctuation and symbols (anything that is neither a word character nor
       whitespace) and underscores are deleted.
    3. Digits are deleted.
    4. Every run of whitespace (spaces, newlines, tabs, Unicode spaces) is
       collapsed to a single ASCII space.
    5. Remaining non-ASCII characters (e.g. accented letters) are deleted, and
       any double space this leaves behind is collapsed.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text with single spaces between words and no leading or
        trailing whitespace.
    """
    # Tags become a space rather than '': deleting only the angle brackets
    # would glue the tag name onto neighbouring words ("end.<br />" -> "endbr").
    text = re.sub(r'</?[a-zA-Z][^<>]*>', ' ', text)
    # "_" is matched by \w, so it has to be listed explicitly to be removed.
    text = re.sub(r'[^\w\s]|_', '', text)
    text = re.sub(r'\d+', '', text)
    # Normalise whitespace BEFORE dropping non-ASCII so that Unicode spaces
    # (e.g. NBSP) still separate words instead of being deleted outright.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # Removing a standalone non-ASCII token leaves two adjacent spaces.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()

# remove stopwords
def remove_stpwrd(text, stop_words=None):
    """Return the tokens of ``text`` that are not stop words.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter; each element is tested individually.
    stop_words : iterable of str, optional
        Words to drop. When omitted, NLTK's English stop-word list is used.

    Returns
    -------
    list of str
        The tokens that are not in the stop-word collection, in their
        original order.
    """
    if stop_words is None:
        # Load the NLTK list once and keep it on the function object: this
        # function is applied once per review, and re-reading the list (and
        # scanning it linearly) on every call is wasted work.
        stop_words = getattr(remove_stpwrd, '_english_cache', None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            remove_stpwrd._english_cache = stop_words
    else:
        stop_words = frozenset(stop_words)
    # Set membership is a hash lookup instead of a scan over the whole list.
    return [word for word in text if word not in stop_words]

In [None]:
# Display one random row before the text-cleaning steps below are applied.
df.sample()

In [None]:
# Text-cleaning pipeline: lowercase -> strip punctuation -> tokenize -> drop
# stop words. The result replaces the `review` column.
# NOTE(review): after this cell runs, `review` no longer holds raw strings, so
# the cell is presumably not safe to re-run without reloading `df` -- confirm.
df['review'] = (
    df['review']
    .str.lower()
    .apply(remove_punct)
    .apply(word_tokenize)
    .apply(remove_stpwrd)
)

In [None]:
# Display five random rows to inspect the result of the cleaning steps above.
df.sample(5)

In the context of sentiment classification, both stemming and lemmatization can be helpful, but they address different needs. Stemming is faster and simpler, reducing words to their root form, while lemmatization considers the word's context and part of speech to produce a meaningful base form (lemma). For sentiment classification, lemmatization is generally preferred if you need more accurate results and can tolerate the slightly slower processing time. However, if speed is critical and some inaccuracies are acceptable, stemming might be a better option. 

**Stemming**:
- Process: A process of reducing words to their root form by removing suffixes (e.g., "running" becomes "run").
    - Pros: Faster and simpler to implement. 
    - Cons: Can produce non-words (e.g., "studies" might become "studi"). May not always result in meaningful base forms. 

**Lemmatization**:
- Process: Reduces words to their dictionary form (lemma), considering the word's part of speech (e.g., "better" becomes "good").
    - Pros: Produces more accurate and meaningful base forms.
    - Cons: More computationally expensive and slower than stemming. 

## Sentiment Analysis:
### Why lemmatization might be better:

- In sentiment analysis, you want to accurately identify and group related words. Lemmatization helps ensure that variations of a word are treated as the same (e.g., "activate" and "activated" are both reduced to "activate"). This can improve the model's ability to recognize positive or negative phrases and patterns. 

### When stemming might be sufficient:
- If your priority is speed and you are dealing with a large dataset where the differences between stemming and lemmatization might not be significant, stemming could be a good starting point. However, if you then see accuracy problems, switching to lemmatization might be beneficial.

In [None]:
# lets go for lemmatization 

def lemmatize_text(text):
    """Lemmatize every token in ``text`` with NLTK's WordNetLemmatizer.

    Parameters
    ----------
    text : iterable of str
        Tokens to lemmatize.

    Returns
    -------
    list of str
        One lemmatized token per input token, in the same order.
    """
    # NOTE(review): lemmatize() receives the token only, so whatever part of
    # speech NLTK assumes by default is applied to every token -- confirm this
    # is intended given the verb/adjective examples in the notes above.
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Replace each review's token list with its lemmatized counterpart.
df['review'] = df['review'].apply(lemmatize_text)


In [None]:
# Print the first column's value for the row at position 100.
# Both indices go through .iloc so the lookup is purely positional; the
# original `df.iloc[100, :][0]` used an integer key on a Series labelled by
# column names, which depends on a deprecated positional fallback.
print(df.iloc[100, 0])

In [None]:
# Review-length distribution (number of tokens per review), split by sentiment.
N_PLOT_ROWS = 1000  # only the first rows are plotted

# Compute lengths on the plotted subset itself. Passing a full-length Series as
# `x` next to a sliced `data` frame relies on implicit index alignment and
# measures rows that are never drawn.
plot_df = df.iloc[:N_PLOT_ROWS].assign(
    review_length=lambda frame: frame['review'].apply(len)
)

fig, ax = plt.subplots(figsize=(6, 3))
sns.histplot(data=plot_df, x='review_length', hue='sentiment', kde=True, bins=30, ax=ax)
ax.set_title('Review Length Distribution by Label')
ax.set_xlabel('Review Length')
ax.set_ylabel('Frequency')
plt.show()