In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv("movies_with_summary.csv")

In [2]:
# Check for missing values
missing_values = df.isnull().sum()
print("Missing Values:")
print(missing_values)

Missing Values:
Title            0
Year             0
Summary          5
Short Summary    1
Runtime          0
Rating           0
Movie Poster     0
dtype: int64


In [3]:
# Drop rows with missing values in specific columns
df.dropna(subset=['Summary'], inplace=True)
df.dropna(subset=['Short Summary'], inplace=True)

In [4]:
# Drop duplicates (if needed)
df.drop_duplicates(inplace=True)

In [5]:
df['Summary'][0]

'Patton Oswald, despite a personal tragedy, produces his best standup yet. Focusing on the tribulations of the Trump era and life after the loss of a loved one, Patton Oswald continues his journey to contribute joy to the world.'

In [6]:
# Clean text data (if needed)
# Example:
# Remove special characters, punctuation, and unwanted symbols from the 'summary' column
# You can define a custom function for cleaning or use regular expressions
# For example, to remove special characters and punctuation:
df['Summary'] = df['Summary'].str.replace('[^\w\s]', '')

In [7]:
# Normalize text (if needed)
# Convert text to lowercase
# Example:
df['Summary'] = df['Summary'].str.lower()

In [8]:
df['Summary'][0]

'patton oswald, despite a personal tragedy, produces his best standup yet. focusing on the tribulations of the trump era and life after the loss of a loved one, patton oswald continues his journey to contribute joy to the world.'

In [9]:
df = df[df['Runtime'] > 0]

In [10]:
# Display the cleaned dataset
print("Cleaned Dataset:")
print(df.head())

Cleaned Dataset:
                                               Title  Year  \
0                        Patton Oswalt: Annihilation  2017   
1                                      New York Doll  2005   
2  Mickey's Magical Christmas: Snowed in at the H...  2001   
4                                      And Then I Go  2017   
5                           An Extremely Goofy Movie  2000   

                                             Summary  \
0  patton oswald, despite a personal tragedy, pro...   
1  a recovering alcoholic and recently converted ...   
2  after everyone is snowed in at the house of mo...   
4  in the cruel world of junior high, edwin suffe...   
5  it's a big time in max's life. he's college bo...   

                                       Short Summary  Runtime  Rating  \
0  Patton Oswalt, despite a personal tragedy, pro...       66     7.4   
1  A recovering alcoholic and recently converted ...       75     7.9   
2  Mickey and all his friends hold their own Chri...  

In [11]:
import re

# Function to tokenize text using regular expressions
def tokenize(text):
    # Define pattern to match words
    pattern = r'\b\w+\b'

    # Tokenize text using pattern
    tokens = re.findall(pattern, text.lower())

    return tokens

# Apply tokenization to the 'Summary' column
df['tokens'] = df['Summary'].apply(tokenize)

# Display the DataFrame with tokenized text
df['tokens'][0]


['patton',
 'oswald',
 'despite',
 'a',
 'personal',
 'tragedy',
 'produces',
 'his',
 'best',
 'standup',
 'yet',
 'focusing',
 'on',
 'the',
 'tribulations',
 'of',
 'the',
 'trump',
 'era',
 'and',
 'life',
 'after',
 'the',
 'loss',
 'of',
 'a',
 'loved',
 'one',
 'patton',
 'oswald',
 'continues',
 'his',
 'journey',
 'to',
 'contribute',
 'joy',
 'to',
 'the',
 'world']

In [12]:
from textblob import TextBlob

# Function to calculate sentiment polarity
def calculate_sentiment_polarity(tokens):
    # Join tokens back into a single string
    text = ' '.join(tokens)

    # Use TextBlob to calculate sentiment polarity
    blob = TextBlob(text)
    polarity = blob.sentiment.polarity

    return polarity

# Apply sentiment analysis to the 'tokens' column
df['sentiment_polarity'] = df['tokens'].apply(calculate_sentiment_polarity)

# Categorize sentiment based on polarity
def categorize_sentiment(polarity):
    if polarity > 0:
        return 'happy'
    elif polarity < 0:
        return 'sad'
    else:
        return 'neutral'

# Apply sentiment categorization
df['sentiment'] = df['sentiment_polarity'].apply(categorize_sentiment)

# Display the dataset with sentiment analysis results
df.head()


Unnamed: 0,Title,Year,Summary,Short Summary,Runtime,Rating,Movie Poster,tokens,sentiment_polarity,sentiment
0,Patton Oswalt: Annihilation,2017,"patton oswald, despite a personal tragedy, pro...","Patton Oswalt, despite a personal tragedy, pro...",66,7.4,https://hydramovies.com/wp-content/uploads/201...,"[patton, oswald, despite, a, personal, tragedy...",0.625,happy
1,New York Doll,2005,a recovering alcoholic and recently converted ...,A recovering alcoholic and recently converted ...,75,7.9,https://hydramovies.com/wp-content/uploads/201...,"[a, recovering, alcoholic, and, recently, conv...",-0.037879,sad
2,Mickey's Magical Christmas: Snowed in at the H...,2001,after everyone is snowed in at the house of mo...,Mickey and all his friends hold their own Chri...,65,6.8,https://hydramovies.com/wp-content/uploads/201...,"[after, everyone, is, snowed, in, at, the, hou...",0.633333,happy
4,And Then I Go,2017,"in the cruel world of junior high, edwin suffe...","In the cruel world of junior high, Edwin suffe...",99,7.6,https://hydramovies.com/wp-content/uploads/201...,"[in, the, cruel, world, of, junior, high, edwi...",-0.226667,sad
5,An Extremely Goofy Movie,2000,it's a big time in max's life. he's college bo...,"Max goes to college, but to his embarassment h...",79,6.4,https://hydramovies.com/wp-content/uploads/201...,"[it, s, a, big, time, in, max, s, life, he, s,...",0.136364,happy


In [13]:
# @title Rating vs sentiment_polarity

# from matplotlib import pyplot as plt
# df.plot(kind='scatter', x='Rating', y='sentiment_polarity', s=32, alpha=.8)
# plt.gca().spines[['top', 'right',]].set_visible(False)

In [14]:
# @title Runtime vs Rating

# from matplotlib import pyplot as plt
# df.plot(kind='scatter', x='Runtime', y='Rating', s=32, alpha=.8)
# plt.gca().spines[['top', 'right',]].set_visible(False)

In [16]:
df.to_csv("cleanedMovies.csv", index=False)

In [15]:
# # Load model directly
# from transformers import AutoTokenizer, AutoModelForSequenceClassification

# tokenizer = AutoTokenizer.from_pretrained("sbcBI/sentiment_analysis")
# model = AutoModelForSequenceClassification.from_pretrained("sbcBI/sentiment_analysis")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


KeyboardInterrupt: 

In [None]:
# import torch

# # Iterate through each movie summary
# for index, row in df.iterrows():
#     summary = row['Summary']

#     # Tokenize the summary
#     inputs = tokenizer(summary, return_tensors="pt", padding=True, truncation=True)

#     # Perform inference
#     with torch.no_grad():
#         outputs = model(**inputs)

#     # Get predicted sentiment
#     predicted_class = torch.argmax(outputs.logits, dim=1).item()

#     # Map predicted class to sentiment label
#     sentiment_label = {0: 'happy', 1: 'sad', 2: 'neutral'}[predicted_class]

#     # Update dataframe with sentiment label
#     df.at[index, 'sentiment'] = sentiment_label


In [None]:
df

In [None]:
# import pandas as pd
# import torch
# from transformers import BertTokenizer, BertForSequenceClassification, AdamW
# from torch.utils.data import DataLoader, TensorDataset
# from sklearn.model_selection import train_test_split
# from tqdm import tqdm

# # Convert sentiment labels to numerical labels
# label_map = {"happy": 0, "sad": 1, "neutral": 2}
# df['Numerical_Label'] = df['sentiment'].map(label_map)

# # Split the dataset into training and validation sets
# train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# # Load BERT tokenizer and convert summaries to input IDs
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# train_input_ids = tokenizer(train_df['Summary'].tolist(), padding=True, truncation=True, return_tensors="pt")
# val_input_ids = tokenizer(val_df['Summary'].tolist(), padding=True, truncation=True, return_tensors="pt")

# # Reduce batch size
# batch_size = 16

# # Create DataLoader for training and validation sets with reduced batch size
# train_data = TensorDataset(train_input_ids.input_ids, torch.tensor(train_df['Numerical_Label'].tolist()))
# train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# val_data = TensorDataset(val_input_ids.input_ids, torch.tensor(val_df['Numerical_Label'].tolist()))
# val_dataloader = DataLoader(val_data, batch_size=batch_size)

# # Load pre-trained BERT model for sequence classification
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)  # Update num_labels to 3

# # Fine-tune the BERT model
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model.to(device)

# optimizer = AdamW(model.parameters(), lr=2e-5)
# epochs = 3

# for epoch in range(epochs):
#     model.train()
#     total_loss = 0
#     for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}', unit='batches'):
#         batch = tuple(t.to(device) for t in batch)
#         inputs, labels = batch
#         optimizer.zero_grad()
#         outputs = model(inputs, labels=labels)
#         loss = outputs.loss
#         total_loss += loss.item()
#         loss.backward()
#         optimizer.step()

#     avg_train_loss = total_loss / len(train_dataloader)

#     # Evaluate the model on the validation set
#     model.eval()
#     val_loss = 0
#     with torch.no_grad():
#         for batch in val_dataloader:
#             batch = tuple(t.to(device) for t in batch)
#             inputs, labels = batch
#             outputs = model(inputs, labels=labels)
#             loss = outputs.loss
#             val_loss += loss.item()

#     avg_val_loss = val_loss / len(val_dataloader)
#     print(f"Epoch {epoch + 1}: Train Loss: {avg_train_loss}, Val Loss: {avg_val_loss}")

# # Perform inference using the fine-tuned model
# # Your code for inference goes here
