In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the movie table from the CSV (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="n_movies.csv")
# Preview the first five rows (head()'s default).
df.head(n=5)

In [3]:
# Summary statistics for the frame; with default arguments pandas describes the numeric columns.
df.describe()

In [4]:
# Column names, dtypes and non-null counts; info() prints its report instead of returning it.
df.info()

In [5]:
# Number of missing entries in each column.
df.isna().sum()

In [6]:
# For every column, report the first movie (1-based row position and title)
# whose value in that column is missing, or state that none is missing.
for column in df.columns:
    is_missing = df[column].isna()

    # Guard clause: nothing to report when the column is complete.
    if not is_missing.any():
        print(f"No movie without a value in '{column}'")
        continue

    # idxmax on a boolean Series yields the label of the first True entry.
    first_null_label = is_missing.idxmax()
    row_number = df.index.get_loc(first_null_label) + 1
    title_of_movie = df.at[first_null_label, 'title']
    print(f"First movie without a value in '{column}' at row {row_number}: {title_of_movie}")

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.exceptions import NotFittedError

In [8]:
# Rows usable for training: both the 'genre' label and the 'description'
# text must be present, so drop any row missing either one.
df2 = df.dropna(axis="index", how="any", subset=["genre", "description"])

In [9]:
# Targets: encode each genre string as an integer class id. The fitted
# encoder is kept so predictions can be mapped back to genre strings.
y_label_encoder = LabelEncoder()
y = y_label_encoder.fit_transform(df2["genre"])

# Features: TF-IDF vectors of the descriptions, ignoring English stop words.
# NOTE(review): the vectorizer is fitted on all labelled rows before the
# train/test split, so the test descriptions influence the vocabulary/IDF
# weights - confirm this is acceptable for the evaluation.
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(df2["description"])

In [10]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train a RandomForestClassifier on the training split.
# random_state is fixed so repeated runs build the same forest; all other
# hyper-parameters are left at the library defaults.
classifier = RandomForestClassifier(random_state=42)
# Fit on the TF-IDF features / encoded genres; as the cell's last expression,
# the fitted estimator is also displayed.
classifier.fit(X_train, y_train)

In [None]:
# Predict the held-out split and print the per-class metrics report.
y_pred = classifier.predict(X_test)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

In [None]:
# Select the rows whose 'genre' is currently missing; these are the rows
# the model is later asked to fill in.
missing_genre_df = df.loc[df["genre"].isna()]

In [None]:
# Titles of the rows in missing_genre_df, i.e. the movies whose 'genre' was
# null when that selection was made.
movies_with_null_genre_names = missing_genre_df["title"].to_list()

In [None]:
# Fill missing 'genre' values with the classifier's predictions.
#
# Both masks are computed from the live `df` (not from an earlier snapshot),
# so the rows that are transformed are exactly the rows that are assigned,
# and re-running this cell after the genres were filled is a harmless no-op.
#
# Rows that lack a 'description' as well are skipped: the vectorizer cannot
# transform NaN documents, so there is no text to base a prediction on.
genre_is_missing = df['genre'].isnull()
can_predict = genre_is_missing & df['description'].notnull()

if not genre_is_missing.any():
    print("No missing values in 'genre'.")
elif not can_predict.any():
    print(
        f"{int(genre_is_missing.sum())} movies have no 'genre' but also no "
        "'description'; nothing can be predicted."
    )
else:
    try:
        # Transform 'description' for the rows that need a genre
        missing_genre_X = vectorizer.transform(df.loc[can_predict, 'description'])

        # Predict the encoded genre for those rows
        predicted_genre = classifier.predict(missing_genre_X)

        # Decode back to genre strings and write them into the same rows
        df.loc[can_predict, 'genre'] = y_label_encoder.inverse_transform(predicted_genre)

    except NotFittedError as e:
        print(f"Error: {e}. The vectorizer and label encoder must be fitted before making predictions.")
    else:
        # Tell the reader about rows that had to be left empty.
        skipped = int((genre_is_missing & ~can_predict).sum())
        if skipped:
            print(f"{skipped} movies without a 'description' were left without a 'genre'.")

In [None]:
# Every movie that currently has a genre, printed as "title<TAB><TAB>genre".
filled_movies = df.loc[df['genre'].notna()]

print("Movies with Filled Genres:")
print("Movie\t\t\tGenre")
# Walk the two columns in lockstep instead of materialising a Series per row.
for title, genre in zip(filled_movies['title'], filled_movies['genre']):
    print(f"{title}\t\t{genre}")

In [None]:
# Show the current genre of every movie whose 'genre' was initially null.
#
# Rows are selected by the index labels of `missing_genre_df` rather than by
# title: titles are not guaranteed to be unique, so a title lookup can return
# a different row than the one that was missing, and it fails outright when
# the title itself is NaN. Assumes `df` keeps the unique index it was loaded
# with.
print("\nMovies with Initially Null Genre:")
print("Movie\t\t\tGenre")
initially_null_rows = df.loc[missing_genre_df.index, ['title', 'genre']]
for title, genre in zip(initially_null_rows['title'], initially_null_rows['genre']):
    print(f"{title}\t\t{genre}")