# Feature Engineering and Preprocessing

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [27]:
# Load the cleaned IMDB dataset produced by the earlier cleaning step.
# NOTE(review): the file appears to contain a previously saved row index, which
# read_csv loads as an 'Unnamed: 0' column (dropped a few cells below).
df = pd.read_csv('Cleaned_Data_imdb.csv')

In [28]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,0,1994,A,142,Drama,9.3,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469.0
1,1,1972,A,175,"Crime, Drama",9.2,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411.0
2,2,2008,UA,152,"Action, Crime, Drama",9.0,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444.0
3,3,1974,A,202,"Crime, Drama",9.0,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000.0
4,4,1957,U,96,"Crime, Drama",9.0,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000.0


In [29]:
# 'Unnamed: 0' looks like a row index that was written into the CSV; it is not a feature.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run after the column has already been removed.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [30]:
# Confirm the 'Unnamed: 0' column is gone.
df.head()

Unnamed: 0,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,1994,A,142,Drama,9.3,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469.0
1,1972,A,175,"Crime, Drama",9.2,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411.0
2,2008,UA,152,"Action, Crime, Drama",9.0,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444.0
3,1974,A,202,"Crime, Drama",9.0,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000.0
4,1957,U,96,"Crime, Drama",9.0,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000.0


In [31]:
# For high-cardinality categorical columns, replace values that appear fewer than 'threshold' times with 'Other'.

In [32]:
# Store Certificate as a pandas categorical.
df['Certificate'] = df['Certificate'].astype('category')
def reduce_cardinality(col, threshold=10):
    """Collapse infrequent values of a Series into the label 'Other'.

    Parameters
    ----------
    col : pandas.Series
        Column whose rare values should be bucketed. The input is not modified.
    threshold : int, default 10
        Minimum number of occurrences a value needs in order to be kept as-is.

    Returns
    -------
    pandas.Series
        Same index as ``col``. Values occurring fewer than ``threshold`` times
        are replaced by the string 'Other'; missing values stay missing.
    """
    # A categorical column cannot hold 'Other' unless it is already a declared
    # category, so work on plain objects and let the caller re-cast if desired.
    if isinstance(col.dtype, pd.CategoricalDtype):
        col = col.astype(object)

    counts = col.value_counts()
    # Vectorised lookup of each row's frequency. value_counts() ignores NaN, so
    # missing values map to NaN here instead of raising KeyError.
    frequencies = col.map(counts)
    keep = (frequencies >= threshold) | col.isna()
    return col.where(keep, 'Other')
# Bucket rarely seen directors/actors into 'Other', then store each column as a categorical.
for col in ('Director', 'Star1', 'Star2', 'Star3', 'Star4'):
    df[col] = reduce_cardinality(df[col], threshold=10).astype('category')

In [33]:
# Snapshot of the column names at this point (taken before any engineered columns are added).
feature_cols = list(df.columns)

In [34]:
# Display the captured column names.
feature_cols

['Released_Year',
 'Certificate',
 'Runtime',
 'Genre',
 'IMDB_Rating',
 'Meta_score',
 'Director',
 'Star1',
 'Star2',
 'Star3',
 'Star4',
 'No_of_Votes',
 'Gross']

In [35]:
# Reference year used to compute a film's age. Kept as a named, pinned constant
# (rather than a bare literal or the system clock) so the feature is easy to
# find and re-running the notebook reproduces the same values.
REFERENCE_YEAR = 2025

# Note: Movie_Age is an exact linear function of Released_Year.
df['Movie_Age'] = REFERENCE_YEAR - df['Released_Year']

In [36]:
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
#   Process the Genre column:
#   Split each genre string on a delimiter (a comma in this dataset, e.g. "Crime, Drama").
#   Keep only the top_n most frequent genres, and replace the others with 'Other'.
#   Create binary indicator (multi-hot) features for the processed genres.

In [37]:
def process_genres(df, col='Genre', delimiter=',', top_n=10):
    """Multi-hot encode a delimited genre column.

    Each cell of ``df[col]`` is split on ``delimiter``; the ``top_n`` most
    frequent genres across all rows are kept and every other genre is relabelled
    'Other'. One ``Genre_<name>`` indicator column is produced per resulting label.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is NOT modified; a new frame is returned.
    col : str, default 'Genre'
        Name of the column holding delimiter-separated genre strings.
    delimiter : str, default ','
        Separator between genres inside one cell.
    top_n : int, default 10
        Number of most frequent genres to keep as their own indicator.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``col``, followed by the ``Genre_*`` indicator columns
        (in the sorted label order reported by the binarizer).
    """
    # Split into clean token lists; non-string cells (e.g. NaN) become [] and
    # empty tokens (from stray delimiters) are skipped.
    genre_lists = df[col].apply(
        lambda x: [g.strip() for g in x.split(delimiter) if g.strip()]
        if isinstance(x, str) else []
    )

    # Overall frequency of each genre across every row.
    genre_counts = pd.Series(
        [genre for genres in genre_lists for genre in genres], dtype=object
    ).value_counts()
    top_genres = set(genre_counts.head(top_n).index)

    # Relabel everything outside the top_n as 'Other'.
    processed = genre_lists.apply(
        lambda genres: [g if g in top_genres else 'Other' for g in genres]
    )

    # Multi-hot encode; mlb.classes_ supplies the column order.
    mlb = MultiLabelBinarizer()
    genre_dummies = pd.DataFrame(
        mlb.fit_transform(processed),
        columns=[f"Genre_{g}" for g in mlb.classes_],
        index=df.index,
    )

    # The intermediate lists live only in local variables, so the caller's frame
    # is left untouched and there is nothing to clean up besides `col`.
    return pd.concat([df.drop(columns=[col]), genre_dummies], axis=1)

In [38]:
# Snapshot before genre encoding: Genre is still a raw string column.
df.head()

Unnamed: 0,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross,Movie_Age
0,1994,A,142,Drama,9.3,80.0,Other,Other,Other,Other,Other,2343110,28341469.0,31
1,1972,A,175,"Crime, Drama",9.2,100.0,Other,Other,Other,Other,Other,1620367,134966411.0,53
2,2008,UA,152,"Action, Crime, Drama",9.0,84.0,Other,Other,Other,Other,Other,2303232,534858444.0,17
3,1974,A,202,"Crime, Drama",9.0,90.0,Other,Al Pacino,Other,Other,Other,1129952,57300000.0,51
4,1957,U,96,"Crime, Drama",9.0,96.0,Other,Other,Other,Other,Other,689845,4360000.0,68


In [39]:
# Guarded so the cell can be re-run: process_genres removes 'Genre', so calling
# it a second time on the already-encoded frame would raise KeyError.
if 'Genre' in df.columns:
    df = process_genres(df, col='Genre', delimiter=',', top_n=10)

In [41]:
# Inspect the frame after genre encoding (Genre replaced by Genre_* indicators).
df.head()

Unnamed: 0,Released_Year,Certificate,Runtime,IMDB_Rating,Meta_score,Director,Star1,Star2,Star3,Star4,...,Genre_Adventure,Genre_Animation,Genre_Biography,Genre_Comedy,Genre_Crime,Genre_Drama,Genre_Mystery,Genre_Other,Genre_Romance,Genre_Thriller
0,1994,A,142,9.3,80.0,Other,Other,Other,Other,Other,...,0,0,0,0,0,1,0,0,0,0
1,1972,A,175,9.2,100.0,Other,Other,Other,Other,Other,...,0,0,0,0,1,1,0,0,0,0
2,2008,UA,152,9.0,84.0,Other,Other,Other,Other,Other,...,0,0,0,0,1,1,0,0,0,0
3,1974,A,202,9.0,90.0,Other,Al Pacino,Other,Other,Other,...,0,0,0,0,1,1,0,0,0,0
4,1957,U,96,9.0,96.0,Other,Other,Other,Other,Other,...,0,0,0,0,1,1,0,0,0,0


In [42]:
# List the columns after genre encoding.
df.columns

Index(['Released_Year', 'Certificate', 'Runtime', 'IMDB_Rating', 'Meta_score',
       'Director', 'Star1', 'Star2', 'Star3', 'Star4', 'No_of_Votes', 'Gross',
       'Movie_Age', 'Genre_Action', 'Genre_Adventure', 'Genre_Animation',
       'Genre_Biography', 'Genre_Comedy', 'Genre_Crime', 'Genre_Drama',
       'Genre_Mystery', 'Genre_Other', 'Genre_Romance', 'Genre_Thriller'],
      dtype='object')

In [43]:
# One-hot encode the remaining categorical columns.
# drop_first=True drops each column's first category as the baseline, so a
# column with only one category contributes no dummy columns at all.
# dtype=int pins 0/1 integers: pandas >= 2.0 would otherwise emit booleans,
# which is inconsistent with the integer Genre_* indicators in the same frame.
categorical_cols = ['Certificate', 'Director', 'Star1', 'Star2', 'Star3', 'Star4']
df_model = pd.get_dummies(df, columns=categorical_cols, drop_first=True, dtype=int)

In [44]:
# Preview the fully encoded, model-ready frame.
df_model.head()

Unnamed: 0,Released_Year,Runtime,IMDB_Rating,Meta_score,No_of_Votes,Gross,Movie_Age,Genre_Action,Genre_Adventure,Genre_Animation,...,Certificate_Unrated,Director_Alfred Hitchcock,Director_Hayao Miyazaki,Director_Martin Scorsese,Director_Other,Director_Steven Spielberg,Star1_Clint Eastwood,Star1_Other,Star1_Robert De Niro,Star1_Tom Hanks
0,1994,142,9.3,80.0,2343110,28341469.0,31,0,0,0,...,0,0,0,0,1,0,0,1,0,0
1,1972,175,9.2,100.0,1620367,134966411.0,53,0,0,0,...,0,0,0,0,1,0,0,1,0,0
2,2008,152,9.0,84.0,2303232,534858444.0,17,1,0,0,...,0,0,0,0,1,0,0,1,0,0
3,1974,202,9.0,90.0,1129952,57300000.0,51,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,1957,96,9.0,96.0,689845,4360000.0,68,0,0,0,...,0,0,0,0,1,0,0,1,0,0


In [46]:
# Write the encoded frame to CSV.
# NOTE(review): with the default index=True the row index is written as an
# unnamed first column, which read_csv loads back as 'Unnamed: 0' (the same
# artefact this notebook drops after loading its input). Consider index=False
# once it is confirmed that no reader of this file relies on that column.
df_model.to_csv('Cleaned_engineered_data_IMDB.csv')

In [47]:
# (rows, columns) of the encoded frame.
df_model.shape

(999, 42)

In [48]:
# Full list of columns in the encoded frame.
df_model.columns

Index(['Released_Year', 'Runtime', 'IMDB_Rating', 'Meta_score', 'No_of_Votes',
       'Gross', 'Movie_Age', 'Genre_Action', 'Genre_Adventure',
       'Genre_Animation', 'Genre_Biography', 'Genre_Comedy', 'Genre_Crime',
       'Genre_Drama', 'Genre_Mystery', 'Genre_Other', 'Genre_Romance',
       'Genre_Thriller', 'Certificate_A', 'Certificate_Approved',
       'Certificate_G', 'Certificate_GP', 'Certificate_PG',
       'Certificate_PG-13', 'Certificate_Passed', 'Certificate_R',
       'Certificate_TV-14', 'Certificate_TV-MA', 'Certificate_TV-PG',
       'Certificate_U', 'Certificate_U/A', 'Certificate_UA',
       'Certificate_Unrated', 'Director_Alfred Hitchcock',
       'Director_Hayao Miyazaki', 'Director_Martin Scorsese', 'Director_Other',
       'Director_Steven Spielberg', 'Star1_Clint Eastwood', 'Star1_Other',
       'Star1_Robert De Niro', 'Star1_Tom Hanks'],
      dtype='object')