In [2]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Load dataset
# The path is relative, so the CSV must be in the notebook's working directory.
file_path = './Top_5000_Movies_IMDb.csv'
print("Loading the dataset...")
data = pd.read_csv(file_path)
print(f"Dataset loaded with {data.shape[0]} rows and {data.shape[1]} columns.")

# Remove duplicates based on 'ID' column
print("Removing duplicate rows based on 'ID' column...")
data = data.drop_duplicates(subset=['ID'])
print(f"Dataset after removing duplicates: {data.shape[0]} rows.")

# Extract runtime in minutes and clean it
print("Cleaning 'Runtime' column...")
# Cast to str first so the .str accessor works even when pandas already parsed
# the column as numeric. Unparseable entries become NaN instead of silently
# leaving the whole column as strings (which astype(..., errors='ignore') did).
runtime_text = data['Runtime'].astype(str).str.replace(' min', '', regex=False).str.strip()
data['Runtime'] = pd.to_numeric(runtime_text, errors='coerce').astype(float)

# Normalize numerical columns
print("Normalizing numerical columns: 'Rating', 'Votes', and 'Gross'...")
numerical_cols = ['Rating', 'Votes', 'Gross']
scaler = MinMaxScaler()
# Anything that is not a plain number is coerced to NaN before scaling.
# NOTE(review): if 'Votes'/'Gross' are stored with thousands separators or
# currency symbols they will all become NaN here -- confirm against the raw CSV.
data[numerical_cols] = data[numerical_cols].apply(pd.to_numeric, errors='coerce')
# Each column is rescaled independently to the [0, 1] range; the raw values are overwritten.
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

# Preprocess the 'Plot' column for text analysis
print("Cleaning and preprocessing the 'Plot' column for text analysis...")

# Download NLTK resources
# 'punkt_tab' is what word_tokenize looks up on recent NLTK releases, and
# 'omw-1.4' is required by the WordNet lemmatizer on some releases. Older
# releases that do not know a resource just report it and carry on.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Turn a raw text value into a space-separated string of lemmas.

    The text is lower-cased and tokenized with ``word_tokenize``. Tokens that
    are not purely alphabetic, or that appear in ``stop_words``, are dropped;
    the remaining tokens are lemmatized with ``lemmatizer`` and joined with
    single spaces.

    Args:
        text: The value to clean. Anything that is not a ``str`` (for example
            a missing value) is treated as having no text.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a ``str``.
    """
    # Guard clause: non-string cells produce an empty result.
    if not isinstance(text, str):
        return ""

    lemmas = []
    for candidate in word_tokenize(text.lower()):
        # Skip punctuation/numbers and stopwords before lemmatizing.
        if not candidate.isalpha() or candidate in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(candidate))
    return " ".join(lemmas)

# Build the cleaned-text column by running every 'Plot' value through preprocess_text
data['Cleaned_Plot'] = data['Plot'].map(preprocess_text)
print("Text preprocessing completed for 'Plot'.")

# Write the processed frame to disk (without the index column)
final_file_path = './Preprocessed_Movies_IMDb.csv'
print("Saving the preprocessed dataset...")
data.to_csv(path_or_buf=final_file_path, index=False)
print(f"Preprocessed data saved to: {final_file_path}")

# Show the first rows as a quick sanity check of the result
print("Displaying a sample of the preprocessed dataset:")
print(data.head(5))

Loading the dataset...
Dataset loaded with 4999 rows and 12 columns.
Removing duplicate rows based on 'ID' column...
Dataset after removing duplicates: 4999 rows.
Cleaning 'Runtime' column...
Normalizing numerical columns: 'Rating', 'Votes', and 'Gross'...
Cleaning and preprocessing the 'Plot' column for text analysis...


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\guled\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\guled\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\guled\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Text preprocessing completed for 'Plot'.
Saving the preprocessed dataset...
Preprocessed data saved to: ./Preprocessed_Movies_IMDb.csv
Displaying a sample of the preprocessed dataset:
   ID                           Movie Name    Rating  Runtime  \
0   1             The Shawshank Redemption  1.000000    142.0   
1   2                        The Godfather  0.973684    175.0   
2   3  Ramayana: The Legend of Prince Rama  0.973684    135.0   
3   4                      The Chaos Class  0.973684     87.0   
4   5                                Daman  0.947368    121.0   

                          Genre  Metascore  \
0                         Drama       82.0   
1                  Crime, Drama      100.0   
2  Animation, Action, Adventure        NaN   
3                 Comedy, Drama        NaN   
4              Adventure, Drama        NaN   

                                                Plot  \
0  Over the course of several years, two convicts...   
1  Don Vito Corleone, head of a mafi

Loading the dataset...
Dataset loaded successfully with 8560 rows and 8 columns.
Columns in the dataset: ['Unnamed: 0', 'id', 'title', 'overview', 'release_date', 'popularity', 'vote_average', 'vote_count']
Removing duplicate rows based on 'id' column...
Dataset after removing duplicates: 8559 rows.
Converting 'release_date' to datetime...
Normalizing numerical columns: 'popularity', 'vote_average', 'vote_count'...
Cleaning and preprocessing the 'overview' column for text analysis...


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\guled\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\guled\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\guled\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Text preprocessing completed for 'overview'.
Saving the preprocessed dataset...
Preprocessed data saved to: C:\Users\guled\movie_recommender\final_preprocessed_movie.csv
Verifying the preprocessed dataset...
Final dataset contains 8559 rows and 9 columns.
Columns in the final dataset: ['Unnamed: 0', 'id', 'title', 'overview', 'release_date', 'popularity', 'vote_average', 'vote_count', 'cleaned_overview']


NameError: name 'final_preprocessed_data' is not defined