# Data Cleaning, Feature Extraction, and Vectorization

## Installing necessary libraries and modules

In [1]:
!pip install pandas



## Data Exploration and Cleaning

In [2]:
import pandas as pd

# Read the credits file from the current working directory
df = pd.read_csv("tmdb_5000_credits.csv")

# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [3]:
# Count missing values per column (isna is the canonical name; isnull is its alias)
df.isna().sum()

movie_id    0
title       0
cast        0
crew        0
dtype: int64

In [4]:
# (rows, columns) of the loaded frame
df.shape

(4803, 4)

In [5]:
# Rows whose 'crew' value already occurred in an earlier row
# (only the 'crew' column is compared, not the whole row)
duplicate = df.loc[df.duplicated(subset='crew')]

print("Duplicate Rows :")

# Leave the frame as the last expression so the notebook renders it
duplicate

Duplicate Rows :


Unnamed: 0,movie_id,title,cast,crew
3670,447027,Running Forever,[],[]
3977,55831,Boynton Beach Club,"[{""cast_id"": 1, ""character"": ""Marilyn"", ""credi...",[]
4068,371085,Sharkskin,[],[]
4105,48382,"The Book of Mormon Movie, Volume 1: The Journey","[{""cast_id"": 1, ""character"": ""Sam"", ""credit_id...",[]
4118,325140,Hum To Mohabbat Karega,[],[]
4123,20653,Roadside Romeo,"[{""cast_id"": 1, ""character"": ""Romeo"", ""credit_...",[]
4247,361505,Me You and Five Bucks,[],[]
4305,114065,Down & Out With The Dolls,[],[]
4314,137955,Crowsnest,[],[]
4322,102840,Sex With Strangers,[],[]


In [6]:
# Drop movies whose cast AND crew are both the empty JSON list '[]'.
# .copy() makes the result an independent frame: later cells assign new
# columns to df, which on a filtered slice raises SettingWithCopyWarning.
df = df[~((df['cast'] == '[]') & (df['crew'] == '[]'))].copy()

# Display the filtered DataFrame
df

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."
...,...,...,...,...
4798,9367,El Mariachi,"[{""cast_id"": 1, ""character"": ""El Mariachi"", ""c...","[{""credit_id"": ""52fe44eec3a36847f80b280b"", ""de..."
4799,72766,Newlyweds,"[{""cast_id"": 1, ""character"": ""Buzzy"", ""credit_...","[{""credit_id"": ""52fe487dc3a368484e0fb013"", ""de..."
4800,231617,"Signed, Sealed, Delivered","[{""cast_id"": 8, ""character"": ""Oliver O\u2019To...","[{""credit_id"": ""52fe4df3c3a36847f8275ecf"", ""de..."
4801,126186,Shanghai Calling,"[{""cast_id"": 3, ""character"": ""Sam"", ""credit_id...","[{""credit_id"": ""52fe4ad9c3a368484e16a36b"", ""de..."


## Feature extraction

### Explanation:
- Parsing JSON Strings:

The parse_json function converts JSON strings into Python objects using ast.literal_eval.
This function is applied to both the cast and crew columns.


- Extracting Names:

get_main_actors function extracts the names of the first three actors from the cast list.
get_director function extracts the director's name from the crew list.


- Combining Features:

The main_cast and director columns are combined into a single string for each movie.

In [7]:
# Feature extraction 
import ast
# Function to parse JSON strings
def parse_json(x):
    """Decode a JSON-encoded cell value into Python objects.

    Values that are already a list or dict are returned unchanged, so applying
    this function a second time (e.g. when the cell is re-run) does not replace
    previously parsed data with empty lists.

    Strings are decoded with ``json.loads`` so that JSON-only tokens (``null``,
    ``true``, ``false``) and JSON escapes are handled correctly. If that fails,
    ``ast.literal_eval`` is tried as a fallback so Python-literal text (for
    example single-quoted reprs) is still accepted.

    Args:
        x: Raw cell value, normally a JSON string.

    Returns:
        The decoded object, or an empty list when ``x`` cannot be parsed.
    """
    import json  # local import keeps this definition self-contained

    # Already parsed: pass through untouched.
    if isinstance(x, (list, dict)):
        return x

    try:
        return json.loads(x)
    except (TypeError, ValueError, RecursionError):
        # Not valid JSON (or not a string at all); try the Python-literal route.
        pass

    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Best-effort contract: unparseable input yields an empty list.
        return []

# Decode the text in both credit columns with parse_json
for credit_col in ('cast', 'crew'):
    df[credit_col] = df[credit_col].apply(parse_json)

# Function to extract main actors' names
def get_main_actors(cast, top_n=3):
    """Join the names of the leading cast members into one string.

    Args:
        cast: Parsed cast value; expected to be a list of dicts that each
            carry a 'name' key.
        top_n: Number of leading cast entries to consider. Defaults to 3.

    Returns:
        The names separated by single spaces, or '' when ``cast`` is not a
        list or none of the considered entries has a usable name.
    """
    if not isinstance(cast, list):
        return ''
    # Skip malformed entries (non-dicts, missing or empty 'name') instead of
    # raising, so one bad record cannot abort the whole column transform.
    names = [
        str(member['name'])
        for member in cast[:top_n]
        if isinstance(member, dict) and member.get('name')
    ]
    return ' '.join(names)

# Function to extract director's name
def get_director(crew):
    """Return the name of the first crew member whose job is 'Director'.

    Args:
        crew: Parsed crew value; expected to be a list of dicts that carry
            'job' and 'name' keys.

    Returns:
        The director's name, or '' when ``crew`` is not a list or contains
        no entry with job 'Director' and a non-empty name.
    """
    if not isinstance(crew, list):
        return ''
    for member in crew:
        # Tolerate malformed entries rather than raising KeyError/TypeError.
        if not isinstance(member, dict):
            continue
        if member.get('job') == 'Director' and member.get('name'):
            return member['name']
    return ''

# Derive the two name features from the parsed credit columns
df['main_cast'] = df['cast'].apply(get_main_actors)
df['director'] = df['crew'].apply(get_director)

# Debug: show what was extracted for each title
print(df.loc[:, ['title', 'main_cast', 'director']])

# One text field per movie: "<main cast names> <director>"
df['combined_features'] = df['main_cast'].add(' ').add(df['director'])

# Inspect the combined text before filtering
print(df.loc[:, ['title', 'combined_features']])

# Keep only the movies whose combined text is not blank
df = df.loc[df['combined_features'].str.strip().ne('')]

# Inspect the combined text after filtering
print(df.loc[:, ['title', 'combined_features']])

                                         title  \
0                                       Avatar   
1     Pirates of the Caribbean: At World's End   
2                                      Spectre   
3                        The Dark Knight Rises   
4                                  John Carter   
...                                        ...   
4798                               El Mariachi   
4799                                 Newlyweds   
4800                 Signed, Sealed, Delivered   
4801                          Shanghai Calling   
4802                         My Date with Drew   

                                           main_cast           director  
0       Sam Worthington Zoe Saldana Sigourney Weaver      James Cameron  
1          Johnny Depp Orlando Bloom Keira Knightley     Gore Verbinski  
2           Daniel Craig Christoph Waltz Léa Seydoux         Sam Mendes  
3           Christian Bale Michael Caine Gary Oldman  Christopher Nolan  
4         Taylor Kitsch Lynn 

## Vectorization 
The TfidfVectorizer from scikit-learn is used to convert the text data in the combined_features column into numerical vectors. This step transforms the text data into a form that can be used by machine learning algorithms.

In [8]:
# Step 2: Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the TF-IDF vectorizer, configured with the 'english' stop-word setting.
# NOTE(review): combined_features holds person names; presumably an English
# stop-word list can also remove tokens that are parts of names — confirm
# this filtering is intended.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit the vectorizer on combined_features and transform it in one step
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'])


# Show the dimensions of the resulting matrix
print(tfidf_matrix.shape)

(4782, 7954)


In [9]:
# Save the preprocessed dataset (row index is not written).
# The path is a raw string so its backslashes are taken literally; without the
# r-prefix, '\M' and '\p' are invalid escape sequences that Python warns about.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable output directory.
df.to_csv(r'C:\Movie-Recommendation-System-main\processed_movies_dataset.csv', index=False)