# Imports

In [None]:
!pip install --upgrade plotnine mizani

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from plotnine import *

# Data Understanding

In [None]:
# Load the hackathon training set from the Kaggle input directory
train_csv_path = "/kaggle/input/imdb-prediction-by-123-of-ai-weekend-hackathon/train_data.csv"
df = pd.read_csv(train_csv_path)

In [None]:
# Preview the first ten rows of the training frame
df.head(n=10)

In [None]:
# Summarise the freshly loaded frame (df.info) before any cleaning is applied
df.info()

In [None]:
# List the column labels available in the training frame
df.columns

In [None]:
# Number of missing values in each column
df.isna().sum()

It looks like our data has 2 null entries in the language field, so I am removing the rows with missing values using the dropna() method.

In [None]:
# Discard every row that has a missing value in any column
df = df.dropna()

In [None]:
# Re-inspect the frame after the rows with missing values were dropped
df.info()

In [None]:
# Histogram of the target variable (100 bins); `figure1` is the Axes pandas draws on
figure1 = df['imdb_score'].hist(bins=100, figsize=[14, 8])

# Label the axes and title directly on the returned Axes
figure1.set_xlabel('imdb_score')
figure1.set_ylabel('frequency')
figure1.set_title('Distribution of IMDb Scores')

# Render the figure
plt.show()

In [None]:
# Standard deviation of the target column
df['imdb_score'].std()

In [None]:
# Variance of the target column
df['imdb_score'].var()

In [None]:
# Scatter plot with a fitted regression line: IMDb score (x) against movie profit (y)
score_profit_grid = sns.lmplot(
    data=df,
    x='imdb_score',
    y='Profit',
    scatter_kws={"s": 5},
)
plt.title('Relationship between IMDb Score and Profit')
plt.xlabel('imdb_score')
plt.ylabel('profit')
plt.show()

In [None]:
# Lead actors of the 20 highest-rated movies, one point per movie

plt.figure(figsize=(16, 12))

# the 20 rows with the highest imdb_score
new_df = df.sort_values(by='imdb_score', ascending=False).head(20)

# draw the points, coloured by movie title
ax = sns.pointplot(data=new_df, x='actor_1_name', y='imdb_score', hue='movie_title')

# tilt the actor names so they remain readable
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
ax.set_xlabel('actor_1_name')
ax.set_ylabel('imdb_score')
ax.set_title('Relationship between top 20 actors of movies based on the imdb rating of the movies')
plt.tight_layout()
plt.show()

In [None]:
# Count rows that are exact duplicates of an earlier row
df.duplicated().sum()    # total sum of duplicate rows

### Pre-processing

In [None]:
# Pairwise correlation heat map of the numeric columns.
# Pairs with a correlation above 0.5 carry substantially overlapping information.

numeric_df = df.select_dtypes(include=['number'])

# correlation matrix of the numeric features
corr = numeric_df.corr()

sns.set_context("notebook", font_scale=1.0, rc={"lines.linewidth": 2.5})
plt.figure(figsize=(16,12))

# hide the strict upper triangle so each pair is annotated only once
mask = np.triu(np.ones(corr.shape), k=1)

heatmap_ax = sns.heatmap(corr, mask=mask, annot=True, fmt='.2f')
x_tick_labels = heatmap_ax.set_xticklabels(heatmap_ax.get_xticklabels(), rotation=90)
y_tick_labels = heatmap_ax.set_yticklabels(heatmap_ax.get_yticklabels(), rotation=30)

In [None]:
def function1(a1, a2):
    """Return ``a1 + a2``.

    Works for scalars and, element-wise, for pandas Series / NumPy arrays.
    """
    return a1 + a2

# Combined Facebook likes of the 2nd and 3rd billed actors.
# Computed column-wise (vectorised) instead of row-by-row with df.apply(axis=1),
# which produces the same values without a Python-level loop over every row.
df['other_actors_facebook_likes'] = function1(df['actor_2_facebook_likes'], df['actor_3_facebook_likes'])

In [None]:
# Remove the columns that are no longer wanted as features
# (the two actor-like columns were folded into 'other_actors_facebook_likes')
columns_to_drop = [
    'actor_2_facebook_likes',
    'actor_3_facebook_likes',
    'num_voted_users',
    'facenumber_in_poster',
    'Profit',
    'aspect_ratio',
    'cast_total_facebook_likes',
]
df = df.drop(columns=columns_to_drop)


In [None]:
# Confirm which columns remain after the drop above
df.info()

In [None]:
### Tip 1 - Handle different kinds of data types

### One-hot encode the categorical columns in a single pass.
### Each column's first level is dropped (drop_first=True) and the dummy columns
### are prefixed with the source column name:
###   country, content_rating, language, director_name
categorical_columns = ['country', 'content_rating', 'language', 'director_name']
df2 = pd.get_dummies(data=df, columns=categorical_columns, drop_first=True)

df2.head(10)


### Handle different kinds of data - text etc., as you see fit

In [None]:
# Summarise the one-hot encoded frame df2
df2.info()

In [None]:
### Handling textual data

### plot_keywords and genres hold '|'-separated strings: split each into a list
### and record how many entries every movie has.
df3 = df.assign(
    plot_keywords=lambda frame: frame['plot_keywords'].str.split('|'),
    plot_keyword_count=lambda frame: frame['plot_keywords'].map(len),
    genres=lambda frame: frame['genres'].str.split('|'),
    genres_count=lambda frame: frame['genres'].map(len),
)

df3.head(10)

In [None]:
# Summarise df3, which now carries the plot_keyword_count and genres_count columns
df3.info()

In [None]:
# Correlation heat map of the numeric columns of df3 (including the new count features).
# Pairs with a correlation above 0.5 carry substantially overlapping information.

numeric_df2 = df3.select_dtypes(include=['number'])

# correlation matrix of the numeric features
corr = numeric_df2.corr()

sns.set_context("notebook", font_scale=1.0, rc={"lines.linewidth": 2.5})
plt.figure(figsize=(16,12))

# hide the strict upper triangle so each pair is annotated only once
mask = np.triu(np.ones(corr.shape), k=1)

heatmap_ax = sns.heatmap(corr, mask=mask, annot=True, fmt='.2f')
x_tick_labels = heatmap_ax.set_xticklabels(heatmap_ax.get_xticklabels(), rotation=90)
y_tick_labels = heatmap_ax.set_yticklabels(heatmap_ax.get_yticklabels(), rotation=30)

In [None]:
### Tip 2 - Based on above data analysis, choose all or relevant features
### Tip 3 - Feel free to extract most important features using PCA, regularisation, above correlation heatmaps etc.
### Feel free to convert the text to textual feature vectors, and use those as input too.

In [None]:
# (rows, columns) of the numeric-only frame used for modelling
numeric_df2.shape

In [None]:
# Summarise the numeric-only frame before scaling
numeric_df2.info()

In [None]:
# Tip 5- Don't forget feature scaling; check K-NN bootcamp text
# Standardise 'duration' to z-scores
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters of 'duration' ...
scaler = StandardScaler()
scaler.fit(numeric_df2[['duration']])

# ... and overwrite the column with its standardised values
duration_scaled = scaler.transform(numeric_df2[['duration']])
numeric_df2['duration'] = duration_scaled


In [None]:
# Re-check the numeric frame after 'duration' was standardised
numeric_df2.info()

In [None]:
# Count rows of the numeric frame that exactly duplicate an earlier row
numeric_df2.duplicated().sum()    # total sum of duplicate rows

## Regularisation model for feature selection

In [None]:
from sklearn.model_selection import train_test_split
# Tip 4 - pandas data frames can be directly used in train and test split creation

# Features: every numeric column except the target
x = numeric_df2.drop(columns=['imdb_score'])
# Target: the IMDb score
y = numeric_df2['imdb_score']

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=100,
    test_size=0.2,
)

In [None]:
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import r2_score, mean_squared_error

# Regularisation strengths to try, plus one coefficient vector and one
# test-set R-squared collected per strength
alpha_list = [0.001, 0.01, 0.1, 1.0, 10.0]
w, r2 = [], []

for alpha in alpha_list:
    # fit a Lasso model at this regularisation strength
    model = Lasso(alpha=alpha).fit(X_train, y_train)

    # score it on the held-out rows
    y_pred = model.predict(X_test)

    # keep the coefficients and the R-squared of this fit
    w.append(model.coef_.tolist())
    r2.append(r2_score(y_test, y_pred))

In [None]:
# One row per alpha in alpha_list; column 'w' holds that fit's Lasso coefficient list
print(pd.DataFrame({'w': w}))

In [None]:
# Use every Lasso fit as a feature selector: keep only the features whose Lasso
# coefficient is non-zero, refit an ordinary least-squares model on that subset
# and report its performance on the held-out rows.
#
# The earlier version assigned the Lasso coefficients to `model.coef_` and then
# called `model.fit`, which overwrites them, so every iteration trained the very
# same full-feature model. It also stored the score in `r2`, clobbering the list
# of Lasso R-squared values collected during the alpha sweep.
for alpha, coef in zip(alpha_list, w):
    # columns that survived the L1 penalty at this alpha
    selected_features = X_train.columns[np.asarray(coef) != 0]

    if len(selected_features) == 0:
        # Lasso shrank every coefficient to zero; there is nothing to refit
        print(f"alpha={alpha}: no features selected, skipping")
        continue

    # Fit a plain linear regression on the selected features only
    model = LinearRegression()
    model.fit(X_train[selected_features], y_train)

    # Make predictions on the test data
    y_pred = model.predict(X_test[selected_features])

    # Evaluate the model's performance
    mse = mean_squared_error(y_test, y_pred)        # Mean Squared Error
    r2_selected = r2_score(y_test, y_pred)          # R-squared

    # Print the model's performance metrics
    print(f"alpha={alpha}: {len(selected_features)} features selected")
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2_selected}")

In [None]:
from sklearn import metrics

# Root mean squared error of the most recent `y_pred` against the held-out targets
rmse_last_fit = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print(rmse_last_fit)

### Model Selection

In [None]:
### Experiment with different regression models
### https://scikit-learn.org/stable/supervised_learning.html

In [None]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

# Compare three regressors (linear regression, random forest, XGBoost) on a
# 4-component PCA projection of the numeric features.

x = numeric_df2.drop(['imdb_score'], axis=1)  # Features
y = numeric_df2['imdb_score']  # Target variable

# Split the data into training and testing sets FIRST, so that the hold-out rows
# cannot influence the PCA components (fitting PCA on all rows leaks test data).
x_train_raw, x_test_raw, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=5)

# Fit PCA on the training rows only, then apply the same projection to the test rows
pca = PCA(n_components=4)
X_train = pca.fit_transform(x_train_raw)
X_test = pca.transform(x_test_raw)

# Train a Linear Regression model
lm = LinearRegression()
lm.fit(X_train, y_train)

# Make predictions using Linear Regression
y_pred_lm = lm.predict(X_test)

# Calculate RMSE for Linear Regression
rmse_lm = np.sqrt(mean_squared_error(y_test, y_pred_lm))
print("RMSE for Linear Regression:", rmse_lm)

# Calculate R-squared for Linear Regression (printed as a percentage)
r2_lm = r2_score(y_test, y_pred_lm)
print("R-squared for Linear Regression:", r2_lm * 100)

# Train a Random Forest Regressor model
rf = RandomForestRegressor(random_state=5, max_depth=1000)
rf.fit(X_train, y_train)

# Make predictions using Random Forest Regressor
y_pred_rf = rf.predict(X_test)

# Calculate RMSE for Random Forest Regressor
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
print("RMSE for Random Forest Regressor:", rmse_rf)

# Calculate R-squared for Random Forest Regressor (printed as a percentage)
r2_rf = r2_score(y_test, y_pred_rf)
print("R-squared for Random Forest Regressor:", r2_rf * 100)

# Train an XGBoost Regressor model with 5-fold cross-validation on the training split
xgb_reg = xgb.XGBRegressor(random_state=5, max_depth=3, n_estimators=100)
scores = cross_val_score(xgb_reg, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

# The scorer returns negated MSE, so flip the sign before taking the root
# (one RMSE value per fold)
rmse_xgb = np.sqrt(-scores)
print("RMSE for XGBoost with Cross-Validation:", rmse_xgb)

# Fit the XGBoost model on the full training data
xgb_reg.fit(X_train, y_train)

# Make predictions using XGBoost
y_pred_xgb = xgb_reg.predict(X_test)

# Calculate R-squared for XGBoost (printed as a percentage)
r2_xgb = r2_score(y_test, y_pred_xgb)
print("R-squared for XGBoost:", r2_xgb * 100)

In [None]:
# Summarise the feature frame x (numeric_df2 without the imdb_score target)
x.info()

### Evaluation and saving output

In [None]:
# Creating output file for submission
#
# The submission data goes through the same feature engineering as the training
# data and is then transformed with the ALREADY-FITTED `scaler` and `pca`, so the
# model sees inputs in the space it was trained on. Re-fitting the scaler / PCA on
# the test data would produce components unrelated to the training ones.

test = pd.read_csv('/kaggle/input/imdb-prediction-by-123-of-ai-weekend-hackathon/test_data_with_inputs.csv')

def fun1(a1, a2):
    """Return ``a1 + a2`` (element-wise when given pandas Series)."""
    return a1 + a2

# Combined likes of the 2nd and 3rd billed actors, computed column-wise
test['other_actors_facebook_likes'] = fun1(test['actor_2_facebook_likes'], test['actor_3_facebook_likes'])

test.drop(['actor_2_facebook_likes', 'actor_3_facebook_likes' ,'num_voted_users', 'facenumber_in_poster', 'Profit', 'aspect_ratio', 'cast_total_facebook_likes'], axis=1, inplace=True)

### Handling textual data - plot_keywords: split on '|' and count the entries
test['plot_keywords'] = test['plot_keywords'].str.split('|')
test['plot_keyword_count'] = test['plot_keywords'].apply(len)

### Handling textual data - genres: split on '|' and count the entries
test['genres'] = test['genres'].str.split('|')
test['genres_count'] = test['genres'].apply(len)

# Keep exactly the feature columns the models were trained on, in the same order.
# A KeyError here means the submission file lacks a training feature.
numeric_test = test.select_dtypes(include=['number'])
numeric_test = numeric_test[x.columns].copy()

# Scale 'duration' with the scaler fitted on the training data (transform only)
numeric_test['duration'] = scaler.transform(numeric_test[['duration']])

numeric_test.info()

# Project onto the PCA components learned from the training data (transform only)
pca_result_test = pca.transform(numeric_test)

# Predict IMDb scores for the submission rows with the trained XGBoost model
y_pred_submission = xgb_reg.predict(pca_result_test)

# One prediction per submission row. The earlier code wrote `y_pred_rf[..., 0]`,
# i.e. a single scalar taken from the random-forest hold-out predictions, so every
# row of the submission received the same, unrelated value.
submission = pd.DataFrame({'s_no': test.s_no, 'imdb_score': y_pred_submission}).set_index('s_no')
submission.to_csv('output_submission.csv')