In [3]:
#! conda create -n py_econ
#! conda activate py_econ
#! pip install beautifulsoup4 selenium kaggle


In [4]:
# ! kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [9]:
imdb = pd.read_csv("../IMDB Dataset.csv")

In [10]:
imdb.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [11]:
imdb.info

<bound method DataFrame.info of                                                   review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]>

In [13]:
imdb['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [14]:
imdb['review_length'] = imdb['review'].apply(len)
imdb['review_length'].describe()

count    50000.000000
mean      1309.431020
std        989.728014
min         32.000000
25%        699.000000
50%        970.000000
75%       1590.250000
max      13704.000000
Name: review_length, dtype: float64

In [15]:
imdb.head()

Unnamed: 0,review,sentiment,review_length
0,One of the other reviewers has mentioned that ...,positive,1761
1,A wonderful little production. <br /><br />The...,positive,998
2,I thought this was a wonderful way to spend ti...,positive,926
3,Basically there's a family where a little boy ...,negative,748
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1317


In [16]:

print(imdb.columns)

imdb = imdb[['review', 'sentiment']]


Index(['review', 'sentiment', 'review_length'], dtype='object')


## Handling missing values


In [17]:
# Check for any missing values
missing_values = imdb.isnull().sum()
print(missing_values)

review       0
sentiment    0
dtype: int64


In [18]:
imdb = imdb.dropna()

In [19]:
print(imdb.info())
print(f"Dataset now has {imdb.shape[0]} rows and {imdb.shape[1]} columns.")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
None
Dataset now has 50000 rows and 2 columns.


In [20]:
import re

imdb['review'] = imdb['review'].apply(lambda x: re.sub(r'<.*?>', '', x))


In [22]:

imdb['review'] = imdb['review'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s.,!?]', '', x))

imdb['review'] = imdb['review'].apply(lambda x: x.lower())

imdb['review'] = imdb['review'].apply(lambda x: ' '.join(x.split()))


In [23]:
imdb.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


In [24]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

# Download resources if not already installed
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/prtimilsina/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/prtimilsina/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [26]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/prtimilsina/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [28]:
nltk.data.path.append("/Users/prtimilsina/nltk_data")

In [31]:
nltk.download('punkt', download_dir='/Users/prtimilsina/nltk_data')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/prtimilsina/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [34]:
imdb.to_csv("cleaned_imdb_reviews.csv", index=False)

## Summary Report

IMDb Movie Reviews Data Analysis
Objective: The project aims to acquire, clean, and prepare IMDb movie review data for further analysis, such as understanding sentiment distribution and exploring review text characteristics.

## 1. Data Acquisition
Dataset Source: The dataset was sourced from Kaggle’s IMDb dataset of 50,000 movie reviews, which includes both positive and negative sentiments.
Format: The data was initially downloaded as a ZIP file containing a CSV file (IMDB Dataset.csv).
Loading Data: The CSV file was loaded into a pandas DataFrame to allow for structured data manipulation.
## 2. Data Cleaning
Data cleaning was essential to ensure the quality and usability of the dataset for further analysis. The following steps were taken:

### Removing Irrelevant Data: 
Any unnecessary columns or extraneous data were removed to retain only essential columns: review and sentiment.

### Handling Empty Reviews:
Empty reviews, including those containing only whitespace, were identified and removed to avoid issues during analysis.
### Removing Duplicates:
Duplicate reviews were removed to prevent repetitive data from skewing analysis results.
Removing HTML Tags and Special Characters: HTML tags (such as <br />) and non-alphanumeric characters were stripped from the text to ensure cleaner and more readable reviews.
## 3. Data Transformation
Several transformations were applied to enhance the dataset for analysis:

### Text Preprocessing (Tokenization and Stemming): 
Tokenization and stemming were initially attempted but encountered technical issues. These steps were deferred for future improvement.
### Adding Review Length: 
A new column, review_length, was added to capture the length of each review. This allowed for additional exploration of text characteristics, such as review length distribution.
## 4. Storing Cleaned Data
Saving Cleaned Data: The cleaned dataset was saved as cleaned_imdb_reviews.csv to preserve the processed data for further analysis and to avoid re-running cleaning steps.
Summary
The steps in this project allowed for effective data acquisition, cleaning, and transformation of IMDb movie reviews. This prepared dataset can now be used for further analysis, including sentiment distribution and deeper text analysis. Future work may involve implementing tokenization and stemming, as well as building machine learning models to classify sentiment or explore review topics.