# Training Data Preparation
Preparing the final dataset for training.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [3]:
# Load the cleaned, enriched dataset from the working directory.
df = pd.read_csv('clean_enriched_data.csv')

# Display the first rows: an assignment alone produces no cell output, so this
# is what regenerates the preview table on a fresh Restart & Run All.
df.head()


Unnamed: 0,movie_title,available_globally,viewing_hours,release_month,runtime,runtime_category,genre,language,country,imdb_rating,...,lg_other,lg_spanish,lg_japanese,lg_french,ct_united_states,ct_other,ct_united_kingdom,ct_japan,ct_france,type_encoded
0,The Night Agent,1,812100000,3,30,Short,"Action, Drama, Thriller",English,United States,7.5,...,0,0,0,0,1,0,0,0,0,1
1,Ginny & Georgia,1,665100000,2,30,Short,"Comedy, Drama",English,United States,7.5,...,0,0,0,0,1,0,0,0,0,1
2,The Glory,1,622800000,12,30,Short,"Drama, Mystery, Thriller",Korean,South Korea,8.1,...,1,0,0,0,0,1,0,0,0,1
3,Wednesday,1,507700000,11,30,Short,"Comedy, Crime, Fantasy",English,United States,8.1,...,0,0,0,0,1,0,0,0,0,1
4,Queen Charlotte: A Bridgerton Story,1,503000000,5,30,Short,"Drama, History, Romance",English,United States,7.4,...,0,0,0,0,1,0,0,0,0,1


Remove the original categorical columns that have already been binary-encoded.
Also remove `runtime_category`, since the numeric `runtime` (in minutes) is already available.

In [5]:
# Raw categorical columns whose information is already carried by the encoded
# indicator columns, plus the bucketed runtime that duplicates the numeric one.
redundant_columns = ['genre', 'language', 'country', 'type', 'runtime_category']

# Rebind df to the trimmed frame, then preview the result.
df = df.drop(columns=redundant_columns)
df.head()

Unnamed: 0,movie_title,available_globally,viewing_hours,release_month,runtime,imdb_rating,total_seasons,gn_drama,gn_action,gn_fantasy,...,lg_other,lg_spanish,lg_japanese,lg_french,ct_united_states,ct_other,ct_united_kingdom,ct_japan,ct_france,type_encoded
0,The Night Agent,1,812100000,3,30,7.5,2.0,1,1,0,...,0,0,0,0,1,0,0,0,0,1
1,Ginny & Georgia,1,665100000,2,30,7.5,2.0,1,0,0,...,0,0,0,0,1,0,0,0,0,1
2,The Glory,1,622800000,12,30,8.1,1.0,1,0,0,...,1,0,0,0,0,1,0,0,0,1
3,Wednesday,1,507700000,11,30,8.1,2.0,0,0,1,...,0,0,0,0,1,0,0,0,0,1
4,Queen Charlotte: A Bridgerton Story,1,503000000,5,30,7.4,1.0,1,0,0,...,0,0,0,0,1,0,0,0,0,1


---

## TF-IDF Vectorization for Movie Titles

**Objective**: Convert movie titles into a numerical format for machine learning.

**Method**:
- Utilize `TfidfVectorizer` to generate a TF-IDF matrix.
- Fit the vectorizer on the `movie_title` data to compute term importance.
- Transform titles into TF-IDF feature vectors.

**Application**:
- Integrate TF-IDF features with the dataset for model training.
- Ensure new title data is transformed with the same fitted vectorizer at prediction time.

---


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vocabulary cap: keep only the 100 highest-frequency title terms so the
# feature block stays small.
TITLE_TFIDF_MAX_FEATURES = 100

# Use a clean 0..n-1 index so the TF-IDF rows line up one-to-one with the
# dataset rows when the two frames are joined column-wise below.
df = df.reset_index(drop=True)

# TfidfVectorizer raises on NaN documents, so a missing title is treated as an
# empty string, and every value is coerced to text.
titles = df['movie_title'].fillna('').astype(str)

# Learn the vocabulary and IDF weights from the titles, then vectorize them.
# NOTE(review): the vectorizer is fitted on every row; if a train/test split is
# made downstream, consider fitting on the training portion only — confirm.
tfidf_vect = TfidfVectorizer(max_features=TITLE_TFIDF_MAX_FEATURES)
title_tfidf_matrix = tfidf_vect.fit_transform(titles)

# One dense column per retained term, explicitly indexed like df.
title_tfidf_df = pd.DataFrame(
    title_tfidf_matrix.toarray(),
    columns=tfidf_vect.get_feature_names_out(),
    index=df.index,
)

# pd.concat would silently produce duplicate column names if a term matches an
# existing column (or if this cell is executed twice); fail loudly instead.
clashing_columns = df.columns.intersection(title_tfidf_df.columns)
if len(clashing_columns) > 0:
    raise ValueError(
        "TF-IDF feature names clash with existing columns "
        f"{list(clashing_columns)}; was this cell already run on this frame?"
    )

# Append the TF-IDF features to the dataset.
df = pd.concat([df, title_tfidf_df], axis=1)


In [11]:
# Preview the first five rows now that the title TF-IDF columns are attached.
df.head(n=5)

Unnamed: 0,movie_title,available_globally,viewing_hours,release_month,runtime,imdb_rating,total_seasons,gn_drama,gn_action,gn_fantasy,...,up,us,war,we,who,wild,with,world,you,your
0,The Night Agent,1,812100000,3,30,7.5,2.0,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Ginny & Georgia,1,665100000,2,30,7.5,2.0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,The Glory,1,622800000,12,30,8.1,1.0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Wednesday,1,507700000,11,30,8.1,2.0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Queen Charlotte: A Bridgerton Story,1,503000000,5,30,7.4,1.0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Print a structural summary of the final frame (index range, column count,
# dtype breakdown, memory usage) as a sanity check before it is written out.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18214 entries, 0 to 18213
Columns: 147 entries, movie_title to your
dtypes: float64(102), int64(44), object(1)
memory usage: 20.4+ MB


In [13]:
# Persist the prepared feature table; the row index is left out of the file.
df.to_csv(path_or_buf='training_data.csv', index=False)