In [46]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
import copy

In [50]:
# Load only relevant columns, TV shows from USA
netflix_data = pd.read_csv('netflix_titles.csv', usecols=['type', 'director', 'country', 'rating', 'duration'])
tv_shows = netflix_data[(netflix_data['type'] == 'TV Show') & (netflix_data['country'] == 'United States')].drop('type', axis=1)

# Convert 'duration' to numeric values, handling "Seasons" separately
tv_shows['duration'] = tv_shows['duration'].apply(lambda x: int(x.split()[0]) if 'Season' in x else None)
tv_shows = tv_shows.dropna(subset=['duration'])

# Initialize the LabelEncoder
rating_encoder = LabelEncoder()
director_encoder = LabelEncoder()
tv_shows['rating'] = rating_encoder.fit_transform(tv_shows['rating'])
tv_shows['director'] = director_encoder.fit_transform(tv_shows['director'])

# Create dictionaries for original encodings
rating_encoding = {original: encoded for original, encoded in zip(rating_encoder.classes_, range(len(rating_encoder.classes_)))}
director_encoding = {original: encoded for original, encoded in zip(director_encoder.classes_, range(len(director_encoder.classes_)))}

# Display the original encodings
print("Rating Encoding:", rating_encoding)
print("Director Encoding:", director_encoding)

# Print the final DataFrame for verification (optional)
print(tv_shows.head())

Rating Encoding: {'NR': 0, 'TV-14': 1, 'TV-G': 2, 'TV-MA': 3, 'TV-PG': 4, 'TV-Y': 5, 'TV-Y7': 6}
Director Encoding: {'Alex Gibney': 0, 'Alexx Media': 1, 'Billy Corben': 2, 'Dan Forrer': 3, 'Daniel Minahan': 4, 'Danny Cannon': 5, 'David Schalko': 6, 'Eli Roth': 7, 'Eric Abrams': 8, 'Eric Goode, Rebecca Chaiklin': 9, 'Everardo Gout': 10, 'Glen Winter': 11, 'Glenn Weiss': 12, 'James Bamford': 13, 'Jared Hess, Tyler Measom': 14, 'Jay Chandrasekhar': 15, 'Jerry Seinfeld': 16, 'Jesse Moss': 17, 'Jesse Warn': 18, 'Jill Bauer, Ronna Gradus, Rashida Jones': 19, 'Joe Berlinger': 20, 'Joe Berlinger, Bruce Sinofsky': 21, 'Joel Gallen, Tig Notaro': 22, 'Joshua Zeman': 23, 'Julia Willoughby Nason, Jenner Furst': 24, 'Ken Burns': 25, 'Ken Burns, Lynn Novick': 26, 'Kenny Ortega': 27, 'Lee Toland Krieger': 28, 'Lynn Novick': 29, 'Marcus Raboy': 30, 'Michael Simon': 31, 'Oliver Stone': 32, 'Oscar Micheaux, Spencer Williams, Richard E. Norman, Richard Maurice': 33, 'Peter McDonnell': 34, 'Rob Seidenglanz

In [44]:
def load_netflix(tv_shows):
    # fetch dataset
    features = [ 'rating', 'director']
    X = tv_shows[features]
    y = tv_shows['duration']

    # with this random seed, no null value is included in the test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
    X_train = copy.deepcopy(X_train).reset_index(drop=True)
    X_test = copy.deepcopy(X_test).reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    return X_train, X_test, y_train, y_test

# first impute the data and make it hypothetically clean
def load_netflix_cleaned(tv_shows):
    # fetch dataset
    features = [ 'rating', 'director']
    X = tv_shows[features]
    y = tv_shows['duration']

    # assumed gt imputation
    imputer = KNNImputer(n_neighbors=10)
    X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

    # Split the data into training and testing sets with a random seed
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
    X_train = copy.deepcopy(X_train).reset_index(drop=True)
    X_test = copy.deepcopy(X_test).reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    return X_train, X_test, y_train, y_test

In [None]:
X_train, X_test, y_train, y_test = load_netflix_cleaned()

In [52]:
X_train

Unnamed: 0,rating,director
0,5,45
1,3,45
2,5,45
3,1,45
4,1,45
...,...,...
603,3,45
604,1,45
605,3,45
606,6,45
