In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
import statsmodels.api as sm
from scipy import stats
from scipy.stats import boxcox
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, precision_score, recall_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import RepeatedKFold, cross_validate
import kerastuner as kt
from tensorflow import keras
from tensorflow.keras import layers

# Data Cleaning (KaggleMovies.csv)

df is going to have all of our data about each movies

In [None]:
# Read the dataset
df = pd.read_csv('Datasets/KaggleMovies.csv')
pd.set_option('float_format', '{:f}'.format)
pd.set_option('display.precision', 2)
df.head()

In [None]:
# Renaming columns
column_mapping = {
    'name' : 'Name',
    'rating' : 'Rating',
    'genre' : 'Genre',
    'year' : 'Year',
    'released' : 'Released',
    'score' : 'Score',
    'votes' : 'Votes',
    'director' : 'Director',
    'writer' : 'Writer',
    'star' : 'Star',
    'country' : 'Country',
    'budget' : 'Budget',
    'gross' : 'Gross Revenue',
    'company' : 'Company',
    'runtime' : 'Runtime'
}

df = df.rename(columns=column_mapping)

In [None]:
# Check for empty spaces
print(f"Shape: {df.shape}\n")
isna = df.isna().sum(axis=0)
isna

In [None]:
# Remove rows with null values
df = df.dropna()
print(f"Shape: {df.shape}\n")
print(df.isna().sum(axis=0))

In [None]:
# Convert certain columns to appropiate data types
df.loc[:,['Votes', 'Budget', 'Runtime', 'Gross Revenue']] = df[['Votes', 'Budget', 'Runtime', 'Gross Revenue']].astype('int')
df.loc[:,'Score'] = df['Score'].apply(lambda x: float("{:.2f}".format(x)))
df.head()

In [None]:
print(df.dtypes)

In [None]:
# Check number of unique movie names
len(df['Name'].unique())

In [None]:
#Drop duplicate movie names
df = df.drop_duplicates(subset=['Name'], keep='first')
print(f"Shape: {df.shape}\n")
df.head()

df now contains movies that are unique and have no empty values on any of its attributes

### Univariate Analysis (Numerical Variables)

In [None]:
df.describe()

In [None]:
#Univariate Analysis of Year
plt.figure(figsize=(14, 4))

#histogram
plt.subplot(1, 2, 1)
sns.histplot(df['Year'], kde=True, stat='density')
plt.title('Distribution of Year')
plt.xlabel('Year')
plt.ylabel('Density')
plt.show()

#Q-Q plot
plt.subplot(1, 2, 2)
stats.probplot(df['Year'], dist="norm", plot=plt)
plt.title('Q-Q Plot of Year')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Ordered Values')

plt.tight_layout()

plt.show()

Here is our graphs representing the univariate analysis of 'Year'

In [None]:
#Univariate Analysis of Votes
plt.figure(figsize=(14, 4))

#histogram
plt.subplot(1, 2, 1)
sns.histplot(df['Votes'], kde=True, stat='density')
plt.title('Distribution of Votes')
plt.xlabel('Votes')
plt.ylabel('Density')
plt.show()

#Q-Q plot
plt.subplot(1, 2, 2)
stats.probplot(df['Votes'], dist="norm", plot=plt)
plt.title('Q-Q Plot of Votes')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Ordered Values')

plt.tight_layout()
plt.show()

Here is our graphs representing the univariate analysis of 'Votes'

In [None]:
#Normalize the right skewed data
df['Votes'], lambda_votes = boxcox(df['Votes'])

plt.figure(figsize=(14, 4))

#histogram
plt.subplot(1, 2, 1)
sns.histplot(df['Votes'], kde=True, stat='density')
plt.title('Distribution of Votes')
plt.xlabel('Votes')
plt.ylabel('Density')
plt.show()

#Q-Q plot
plt.subplot(1, 2, 2)
stats.probplot(df['Votes'], dist="norm", plot=plt)
plt.title('Q-Q Plot of Votes')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Ordered Values')

plt.tight_layout()
plt.show()

Here is our graphs representing the univariate analysis of 'Votes' after normalizing our data

In [None]:
#Univariate Analysis of Gross Revenue
plt.figure(figsize=(14, 4))

#histogram
plt.subplot(1, 2, 1)
sns.histplot(df['Gross Revenue'], kde=True, stat='density')
plt.title('Distribution of Gross Revenue')
plt.xlabel('Gross Revenue')
plt.ylabel('Density')
plt.show()

#Q-Q plot
plt.subplot(1, 2, 2)
stats.probplot(df['Gross Revenue'], dist="norm", plot=plt)
plt.title('Q-Q Plot of Gross Revenue')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Ordered Values')

plt.tight_layout()
plt.show()

Here is our graphs representing the univariate analysis of 'Gross Revenue'

In [None]:
#Normalize the right skewed data
df['Gross Revenue'], lambda_gross = boxcox(df['Gross Revenue'])
plt.figure(figsize=(14, 4))

#histogram
plt.subplot(1, 2, 1)
sns.histplot(df['Gross Revenue'], kde=True, stat='density')
plt.title('Distribution of Gross Revenue')
plt.xlabel('Gross Revenue')
plt.ylabel('Density')
plt.show()

#Q-Q plot
plt.subplot(1, 2, 2)
stats.probplot(df['Gross Revenue'], dist="norm", plot=plt)
plt.title('Q-Q Plot of Gross Revenue')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Ordered Values')

plt.tight_layout()
plt.show()

Here is our graphs representing the univariate analysis of 'Gross Revenue' after normalizing it

In [None]:
#Univariate Analysis of Budget
plt.figure(figsize=(14, 4))

#histogram
plt.subplot(1, 2, 1)
sns.histplot(df['Budget'], kde=True, stat='density')
plt.title('Distribution of Budget')
plt.xlabel('Budget')
plt.ylabel('Density')
plt.show()

#Q-Q plot
plt.subplot(1, 2, 2)
stats.probplot(df['Budget'], dist="norm", plot=plt)
plt.title('Q-Q Plot of Budget')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Ordered Values')

plt.tight_layout()
plt.show()

Here is our graphs representing the univariate analysis of 'Budget'

In [None]:
#Univariate Analysis of Runtime
plt.figure(figsize=(14, 4))

#histogram
plt.subplot(1, 2, 1)
sns.histplot(df['Runtime'], kde=True, stat='density')
plt.title('Distribution of Runtime')
plt.xlabel('Runtime')
plt.ylabel('Density')
plt.show()

#Q-Q plot
plt.subplot(1, 2, 2)
stats.probplot(df['Runtime'], dist="norm", plot=plt)
plt.title('Q-Q Plot of Runtime')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Ordered Values')

plt.tight_layout()
plt.show()

Here is our graphs representing the univariate analysis of 'Runtime'

In [None]:
df['Budget'] = df['Budget'].apply(lambda x: 250000000 if x > 250000000 else x)

plt.figure(figsize=(14, 4))

#histogram
plt.subplot(1, 2, 1)
sns.histplot(df['Budget'], kde=True, stat='density')
plt.title('Distribution of Budget')
plt.xlabel('Budget')
plt.ylabel('Density')
plt.show()

#Q-Q plot
plt.subplot(1, 2, 2)
stats.probplot(df['Budget'], dist="norm", plot=plt)
plt.title('Q-Q Plot of Budget')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Ordered Values')

plt.tight_layout()
plt.show()


Here is our graphs representing the univariate analysis of 'Budget' after we cap the values above 250,000,000 to 250,000,000

### Univariate Analysis (Categorical Variables)

In [None]:
print("Unique Value Count of Categorical Variables")
print("Rating:", len(df['Rating'].unique()))
print("Genre:",len(df['Genre'].unique()))
print("Director:",len(df['Director'].unique()))
print("Writer:",len(df['Writer'].unique()))
print("Star:",len(df['Star'].unique()))
print("Country:",len(df['Country'].unique()))
print("Company:",len(df['Company'].unique()))

In [None]:
score_by_rating = df.groupby("Rating")["Score"].mean().sort_values(ascending=True)

plt.barh(score_by_rating.index, score_by_rating.values)
plt.xlabel('Mean Score')
plt.ylabel('Rating')
plt.title('Mean Score by Rating')

This horizontal bar chart displays the average scores for different content ratings of movies.

In [None]:
score_by_genre = df.groupby("Genre")["Score"].mean().sort_values(ascending=True)[len(df['Genre'].unique()) - 10 :]

plt.barh(score_by_genre.index, score_by_genre.values)
plt.xlabel('Mean Score')
plt.ylabel('Genre')
plt.title('Mean Score by Genre')

This horizontal bar chart displays the average scores for different movie genres.

In [None]:
count_by_genre = df.groupby("Genre")["Score"].count().sort_values(ascending=True)[len(df['Genre'].unique()) - 10 :]

plt.barh(count_by_genre.index, count_by_genre.values)
plt.xlabel('Count')
plt.ylabel('Genre')
plt.title('Count by Genre')

The bar chart illustrates the distribution across various genres of movies.

In [None]:
score_by_country = df.groupby("Country")["Score"].mean().sort_values(ascending=True)[len(df['Country'].unique()) - 10 :]

plt.barh(score_by_country.index, score_by_country.values)
plt.xlabel('Mean Score')
plt.ylabel('Country')
plt.title('Mean Score by Country')

This horizontal bar chart displays the average scores of movies released by various countries.

In [None]:
count_by_country = df.groupby("Country")["Score"].count().sort_values(ascending=True)[len(df['Country'].unique()) - 10 :]

plt.barh(count_by_country.index, count_by_country.values)
plt.xlabel('Count')
plt.ylabel('Country')
plt.title('Count by Country')

This horizontal bar chart displays the number of movies released by various countries.

In [None]:
score_by_company = df.groupby("Company")["Score"].mean().sort_values(ascending=True)[len(df['Company'].unique()) - 10 :]

plt.barh(score_by_company.index, score_by_company.values)
plt.xlabel('Mean Score')
plt.ylabel('Company')
plt.title('Mean Score by Company')

This horizontal bar chart displays the average score released by various companies ordered from highest to lowest.

In [None]:
score_by_company = df.groupby("Company")["Score"].mean().sort_values(ascending=False)[len(df['Company'].unique()) - 10 :]

plt.barh(score_by_company.index, score_by_company.values)
plt.xlabel('Mean Score')
plt.ylabel('Company')
plt.title('Mean Score by Company')

This horizontal bar chart displays the average score released by various companies ordered in lowest to highest.

In [None]:
score_by_company = df.groupby("Company")["Score"].count().sort_values(ascending=True)[len(df['Company'].unique()) - 10 :]

plt.barh(score_by_company.index, score_by_company.values)
plt.xlabel('Count')
plt.ylabel('Company')
plt.title('Count by Company')

This horizontal bar chart displays the total count released by various companies.

### Data Transformation


In [None]:
# Instantiating MinMaxScaler and StandardScaler
scaler_minmax = MinMaxScaler()
scaler_std = StandardScaler()

# Scaling the columns
df[['Budget']] = scaler_minmax.fit_transform(df[['Budget']])
df[['Votes', 'Gross Revenue', 'Runtime']] = scaler_std.fit_transform(df[['Votes', 'Gross Revenue', 'Runtime']])

# Performing one-hot encoding on 'Rating', 'Genre', and 'Country' columns
oneHot = pd.get_dummies(df[['Rating', 'Genre', 'Country']], columns=['Rating', 'Genre', 'Country'])
df = pd.concat([df, oneHot], axis=1)
df.head()


### Bivariate Analysis

In [None]:
sns.lineplot(data=df, x="Year", y="Score")

plt.title('Score Throughout the Years')
plt.xlabel('Years')
plt.ylabel('Score')

# Show the plot
plt.show()

This is our graph representing the Bivariate analysis of 'Score' throughout the years

In [None]:
correlation_matrix = df[["Year", 'Score', "Votes", "Budget", "Gross Revenue", "Runtime"]].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.title('Correlation Matrix of Numerical Variables')
plt.show()

This is our correlation matrix of attributes that have numerical values

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model

df_BGR = df[['Budget', 'Gross Revenue']]
X_train, X_test, y_train, y_test = train_test_split(df_BGR.Budget, df_BGR.drop(['Budget'], axis=1), test_size=0.2, random_state=21)

train = pd.concat([X_train, y_train], axis=1)
train = train.sort_values(by=['Budget'])
budget = train[['Budget']]
grossRevenue = train[['Gross Revenue']]

# 2 Degrees
poly2 = PolynomialFeatures(degree=2, include_bias=False)
polyX = poly2.fit_transform(budget)

linear = linear_model.LinearRegression()
linear.fit(polyX, grossRevenue)
yhat = linear.predict(polyX)

sns.scatterplot(data=df, x="Budget", y="Gross Revenue")
plt.plot(budget, yhat, color="green")
plt.title("Polynomial Regression: Budget vs. Gross Revenue (2 Degrees)")
plt.xlabel('Budget')
plt.ylabel('Gross Revenue')
plt.show()

# 3 Degrees
poly3 = PolynomialFeatures(degree=3, include_bias=False)
polyX = poly3.fit_transform(budget)

linear = linear_model.LinearRegression()
linear.fit(polyX, grossRevenue)
yhat = linear.predict(polyX)

sns.scatterplot(data=df, x="Budget", y="Gross Revenue")
plt.plot(budget, yhat, color="orange")
plt.title("Polynomial Regression: Budget vs. Gross Revenue (3 Degrees)")
plt.xlabel('Budget')
plt.ylabel('Gross Revenue')
plt.show()

# 3 Degrees
poly4 = PolynomialFeatures(degree=4, include_bias=False)
polyX = poly4.fit_transform(budget)

linear = linear_model.LinearRegression()
linear.fit(polyX, grossRevenue)
yhat = linear.predict(polyX)

sns.scatterplot(data=df, x="Budget", y="Gross Revenue")
plt.plot(budget, yhat, color="red")
plt.title("Polynomial Regression: Budget vs. Gross Revenue (4 Degrees)")
plt.xlabel('Budget')
plt.ylabel('Gross Revenue')
plt.show()


This is our graph representing the Bivariate analysis of 'Budget' vs 'Gross Reveue'

In [None]:
df_GRV = df[['Gross Revenue', 'Votes']]
X_train, X_test, y_train, y_test = train_test_split(df_GRV.drop(['Votes'], axis=1), df_GRV.Votes, test_size=0.2, random_state=21)

model = LinearRegression()
# Fit model to the data
model.fit(X_train, y_train)
# Predict Y values using the fitted model
Y_pred = model.predict(X_train)

# Plot the regression line
plt.plot(X_train, Y_pred, color='red', label='Scikit Linear Regression')
sns.scatterplot(data=df, x="Gross Revenue", y="Votes")
plt.title('Gross Revenue vs Votes')
plt.xlabel('Gross Revenue')
plt.ylabel('Votes')

plt.show()

This is our graph representing the Bivariate analysis of 'Gross Revenue' vs 'Votes'

In [None]:
from sklearn import linear_model

df_VS = df[['Votes', 'Score']]
X_train, X_test, y_train, y_test = train_test_split(df_VS.drop(['Score'], axis=1), df_VS.Score, test_size=0.2, random_state=21)

model = LinearRegression()
# Fit model to the data
model.fit(X_train, y_train)
# Predict Y values using the fitted model
Y_pred = model.predict(X_train)

# Plot the regression line
plt.plot(X_train, Y_pred, color='red', label='Scikit Linear Regression')
sns.scatterplot(data=df, x="Votes", y="Score")
plt.title('Votes vs Score')
plt.xlabel('Votes')
plt.ylabel('Score')

plt.show()


This is our graph representing the Bivariate analysis of 'Vote' vs 'Score'

# TRAINING MODEL

### Data Preprocessing

In [46]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

# # Separate features and the target variable
# # Assuming 'genre' as the target variable for this exercise
# X_movies = df.drop('Genre', axis=1)
# y_movies = df['Genre']

# # Dropping non-numeric columns to simplify the input features for neural network training
# X_movies_numeric = X_movies.select_dtypes(include=['float64', 'int64'])

# # Apply one-hot encoding to the target variable (genre)
# encoder_movies = OneHotEncoder()
# y_movies_encoded = encoder_movies.fit_transform(y_movies.values.reshape(-1, 1)).toarray()

# # Apply min-max normalization to the features
# scaler_movies = MinMaxScaler()
# X_movies_scaled = scaler_movies.fit_transform(X_movies_numeric)

# # Now, X_movies_scaled and y_movies_encoded are ready for the neural network training process
# df

# Selecting the required columns for features and target variable
X_movies = df[['Budget', 'Gross Revenue']]  # Features: budget and gross revenue
y_movies = df['Genre']  # Target variable: genre

# Apply one-hot encoding to the target variable (genre)
encoder_movies = OneHotEncoder()
y_movies_encoded = encoder_movies.fit_transform(y_movies.values.reshape(-1, 1)).toarray()

# Apply min-max normalization to the 'budget' and 'gross' features
scaler_movies = MinMaxScaler()
X_movies_scaled = scaler_movies.fit_transform(X_movies)

df

Unnamed: 0,Name,Rating,Genre,Year,Released,Score,Votes,Director,Writer,Star,...,Country_Spain,Country_Sweden,Country_Switzerland,Country_Taiwan,Country_Thailand,Country_United Arab Emirates,Country_United Kingdom,Country_United States,Country_West Germany,Country_Yugoslavia
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.400000,2.294269,Stanley Kubrick,Stephen King,Jack Nicholson,...,False,False,False,False,False,False,True,False,False,False
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.800000,0.173770,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,...,False,False,False,False,False,False,False,True,False,False
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.700000,2.525956,Irvin Kershner,Leigh Brackett,Mark Hamill,...,False,False,False,False,False,False,False,True,False,False
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.700000,1.092990,Jim Abrahams,Jim Abrahams,Robert Hays,...,False,False,False,False,False,False,False,True,False,False
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.300000,0.543980,Harold Ramis,Brian Doyle-Murray,Chevy Chase,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7648,Bad Boys for Life,R,Action,2020,"January 17, 2020 (United States)",6.600000,0.739239,Adil El Arbi,Peter Craig,Will Smith,...,False,False,False,False,False,False,False,True,False,False
7649,Sonic the Hedgehog,PG,Action,2020,"February 14, 2020 (United States)",6.500000,0.501529,Jeff Fowler,Pat Casey,Ben Schwartz,...,False,False,False,False,False,False,False,True,False,False
7650,Dolittle,PG,Adventure,2020,"January 17, 2020 (United States)",5.600000,0.029252,Stephen Gaghan,Stephen Gaghan,Robert Downey Jr.,...,False,False,False,False,False,False,False,True,False,False
7651,The Call of the Wild,PG,Adventure,2020,"February 21, 2020 (United States)",6.800000,-0.132540,Chris Sanders,Michael Green,Harrison Ford,...,False,False,False,False,False,False,False,False,False,False


### Training and Testing the Neural Network

In [None]:
# Split the data into training and testing sets with a 90:10 ratio
X_train, X_test, y_train, y_test = train_test_split(X_movies_scaled, y_movies_encoded, test_size=0.1, random_state=42)

# Define the model
model = Sequential([
    Dense(12, input_dim=X_train.shape[1], activation='sigmoid'),
    Dense(12, activation='sigmoid'),
    Dense(12, activation='sigmoid'),
    Dense(12, activation='sigmoid'),
    Dense(y_train.shape[1], activation='softmax')  # Number of output nodes = number of classes
])

# Compile the model with Stochastic Gradient Descent optimizer
optimizer = SGD(learning_rate=0.3)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy', 'mse'])

# Train the model
history = model.fit(X_train, y_train, epochs=100, validation_split=0.1, verbose=1)

# Evaluate the model on the test set
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)

# Calculate metrics
cm = confusion_matrix(y_test_classes, y_pred_classes)
acc = accuracy_score(y_test_classes, y_pred_classes)
mse = mean_squared_error(y_test, y_pred)
precision = precision_score(y_test_classes, y_pred_classes, average='weighted', zero_division=0)
recall = recall_score(y_test_classes, y_pred_classes, average='weighted', zero_division=0)

# Output the metrics
print("Confusion Matrix:", cm)
print("Accuracy:", acc)
print("MSE:", mse)
print("Precision:", precision)
print("Recall:", recall)

### k-fold Cross Validation

In [None]:
# Function to build the model, adapted for the Kaggle movie dataset
def build_movie_model():
    model = Sequential([
        Dense(12, input_dim=X_movies_scaled.shape[1], activation='sigmoid'),
        Dense(12, activation='sigmoid'),
        Dense(12, activation='sigmoid'),
        Dense(y_movies_encoded.shape[1], activation='sigmoid')  # Output layer nodes = number of classes
    ])
    model.compile(optimizer=SGD(learning_rate=0.3), loss='categorical_crossentropy', metrics=['accuracy', 'mse'])
    return model

# Wrap the Keras model with KerasClassifier
estimator_movies = KerasClassifier(model=build_movie_model, epochs=100, batch_size=10, verbose=1)  # Adjust verbosity as needed

# Define the cross-validation strategy
kfold_movies = RepeatedKFold(n_splits=10, n_repeats=1, random_state=42)

# Perform cross-validation for accuracy and MSE
results_movies = cross_validate(estimator_movies, X_movies_scaled, y_movies_encoded, cv=kfold_movies, scoring=['accuracy', 'neg_mean_squared_error'], n_jobs=1)

# Output results
print("Cross-validation results:")
print(results_movies)

In [None]:
# Assuming 'results_movies' contains the cross-validation results from the previous step
cv_accuracy_movies = results_movies['test_accuracy']
cv_mse_movies = -1 * results_movies['test_neg_mean_squared_error']  # Multiply by -1 to make MSE positive

# Print accuracy and MSE values for each fold
for i in range(len(cv_accuracy_movies)):
    print(f"Fold {i+1}: Accuracy = {cv_accuracy_movies[i]}, MSE = {cv_mse_movies[i]}")

# Print the overall average accuracy and MSE for the Kaggle movie dataset
print(f"Overall Average Accuracy: {np.mean(cv_accuracy_movies)}")
print(f"Overall Average MSE: {np.mean(cv_mse_movies)}")


### Hyperparameter Tuning

In [None]:
# Split the data
X_train_hp, X_test_hp, y_train_hp, y_test_hp = train_test_split(X_movies_scaled, y_movies_encoded, test_size=0.1, random_state=42)
def build_hp_model(hp):
    model = Sequential()
    model.add(Dense(units=hp.Int('units', min_value=12, max_value=512, step=100),
                    activation='sigmoid',
                    input_dim=X_train_hp.shape[1]))
    model.add(Dense(units=hp.Int('units_layer2', min_value=12, max_value=512, step=100), activation='sigmoid'))
    model.add(Dense(units=hp.Int('units_layer3', min_value=12, max_value=512, step=100), activation='sigmoid'))
    model.add(Dense(y_train_hp.shape[1], activation='sigmoid'))  # Adapted for multi-class classification

    # Tune the learning rate for the optimizer
    learning_rate = hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='log')

    model.compile(optimizer=SGD(learning_rate=learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    return model
# Define the tuner
tuner = kt.Hyperband(
    build_hp_model,
    objective='val_accuracy',
    max_epochs=10,
    hyperband_iterations=15,
    directory='tuner_results',
    project_name='kaggle_movie_tuning'
)

# Define the tuner
# tuner = kt.GridSearch(
#     hypermodel=build_hp_model,
#     objective='val_accuracy',
#     max_trials=10,
#     executions_per_trial=3,
#     seed=15,
#     tune_new_entries=True,
#     allow_new_entries=True,
#     directory='tuner_results',
#     project_name='keras_tuner_project'
# )

# Run the hyperparameter search
tuner.search(X_train_hp, y_train_hp, epochs=50, validation_split=0.1)

# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters()[0]

print(f"""
The optimal number of units in the first hidden layer is {best_hps.get('units')}
with an optimal learning rate of {best_hps.get('learning_rate')}.
""")

# Build the model with the optimal hyperparameters and evaluate it
best_model = tuner.hypermodel.build(best_hps)
history = best_model.fit(X_train_hp, y_train_hp, epochs=50, validation_data=(X_test_hp, y_test_hp))

Trial 60 Complete [00h 00m 05s]
val_accuracy: 0.2931392788887024

Best val_accuracy So Far: 0.2931392788887024
Total elapsed time: 00h 31m 13s

The optimal number of units in the first hidden layer is 312
with an optimal learning rate of 0.00020221406050328463.

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


# Evaluate our model compare training vs test error

# Model fit in the fitting graph

# Next 2 models

# Conclusion

## What is the conclusion of your 1st model?

## What can be done to possibly improve it?