In [None]:
import pandas as pd

# Example synthetic data, written one movie per row so every column is
# guaranteed to have the same number of entries.
# NOTE(review): several actor values read 'Action<N>' instead of 'Actor<N>'.
# This looks like a typo, but the values are kept verbatim because the encoded
# actor columns further down are derived from them -- confirm before changing.
column_names = ('rating', 'genre', 'director', 'actors')
movie_rows = [
    (7.5, 'Action', 'Director1', 'Actor1'),
    (8.2, 'Comedy', 'Director2', 'Action2'),
    (6.0, 'Drama', 'Director3', 'Action2'),
    (7.9, 'Action', 'Director1', 'Actor3'),
    (8.5, 'Romance', 'Director2', 'Actor1'),
    (6.8, 'Comedy', 'Director3', 'Action4'),
    (7.3, 'Drama', 'Director1', 'Action1'),
    (8.1, 'Action', 'Director2', 'Actor3'),
    (7.7, 'Romance', 'Director3', 'Action2'),
    (6.5, 'Action', 'Director1', 'Action3'),
]

# Transpose the rows into the column-oriented mapping {column name: list of values}.
data = {
    name: list(values)
    for name, values in zip(column_names, zip(*movie_rows))
}

# Build the DataFrame from the column mapping.
df = pd.DataFrame(data)

# Preview the first five rows of the dataset.
print(df.head(5))

   rating    genre   director   actors
0     7.5   Action  Director1   Actor1
1     8.2   Comedy  Director2  Action2
2     6.0    Drama  Director3  Action2
3     7.9   Action  Director1   Actor3
4     8.5  Romance  Director2   Actor1


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# One-hot encode 'genre' and 'director'. drop_first=True keeps k-1 indicator
# columns per feature by removing the first level. 'actors' is excluded here
# because it is encoded separately below.
df_encoded = pd.get_dummies(
    df.drop(columns='actors'),
    columns=['genre', 'director'],
    drop_first=True,
)

# Convert 'actors' into numeric count features using CountVectorizer.
# Each cell is split on ', ', so a comma-separated cast list yields one token
# per actor. token_pattern=None states that the default regex is deliberately
# unused (the custom tokenizer replaces it), which avoids the UserWarning
# scikit-learn otherwise raises on fit. Tokens are lower-cased by
# CountVectorizer's default lowercase=True, hence the lower-case column names.
vectorizer = CountVectorizer(
    tokenizer=lambda cast: cast.split(', '),
    token_pattern=None,
)
actors_matrix = vectorizer.fit_transform(df['actors'])

# Convert the sparse actors matrix into a DataFrame. Reusing df.index keeps the
# rows aligned with df_encoded in the concat below even when df does not carry
# the default 0..n-1 index (a fresh RangeIndex would misalign and create NaNs).
actors_df = pd.DataFrame(
    actors_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=df.index,
)

# Concatenate the actor count columns onto the one-hot encoded frame.
df_encoded = pd.concat([df_encoded, actors_df], axis=1)

# Preview the processed data
print(df_encoded.head())

   rating  genre_Comedy  genre_Drama  genre_Romance  director_Director2  \
0     7.5         False        False          False               False   
1     8.2          True        False          False                True   
2     6.0         False         True          False               False   
3     7.9         False        False          False               False   
4     8.5         False        False           True                True   

   director_Director3  action1  action2  action3  action4  actor1  actor3  
0               False        0        0        0        0       1       0  
1               False        0        1        0        0       0       0  
2                True        0        1        0        0       0       0  
3               False        0        0        0        0       0       1  
4               False        0        0        0        0       1       0  




In [None]:
from sklearn.model_selection import train_test_split

# Target (y) is the 'rating' column; the features (X) are every other column
# of the encoded frame.
y = df_encoded['rating']
X = df_encoded.drop(columns=['rating'])

# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Create the Linear Regression model and train it on the training split.
# fit() returns the estimator itself, so construction and training are chained.
lin_reg_model = LinearRegression().fit(X_train, y_train)

# Make predictions on the test set.
y_pred_lin_reg = lin_reg_model.predict(X_test)

# Evaluate the model: mean squared error and R-squared against the true
# test ratings.
# NOTE(review): the test split is presumably only 2 rows (10-row dataset with
# test_size=0.2), so R-squared is very noisy -- confirm before drawing
# conclusions from it.
mse_lin_reg, r2_lin_reg = (
    metric(y_test, y_pred_lin_reg) for metric in (mean_squared_error, r2_score)
)

print(f"Linear Regression - MSE: {mse_lin_reg:.2f}, R-squared: {r2_lin_reg:.2f}")

Linear Regression - MSE: 1.02, R-squared: -15.25


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Create a Random Forest Regressor with 100 trees and train it on the training
# split. random_state is fixed so repeated runs build the same forest.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Make predictions on the test set.
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model: mean squared error and R-squared against the true
# test ratings.
# NOTE(review): the test split is presumably only 2 rows (10-row dataset with
# test_size=0.2), so R-squared is very noisy -- confirm before drawing
# conclusions from it.
mse_rf, r2_rf = (
    metric(y_test, y_pred_rf) for metric in (mean_squared_error, r2_score)
)

print(f"Random Forest Regressor - MSE: {mse_rf:.2f}, R-squared: {r2_rf:.2f}")


Random Forest Regressor - MSE: 0.80, R-squared: -11.74
