In [9]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Read the dataset produced by the Data Preparation stage into a DataFrame.
# The path is relative to the notebook's working directory.
data = pd.read_csv(
    filepath_or_buffer='../../../3. Data Preparation/3.3 Construct Data/merged_data_mod_scores.csv',
)

# Set up a directory to save the plots.
# plt.savefig() does not create missing parent directories, so make sure the
# target exists up front; exist_ok=True keeps re-running the cell harmless.
output_directory = 'eda_plots/'
os.makedirs(output_directory, exist_ok=True)

# Create pair plots for numeric columns (KDE curves on the diagonal).
sns.pairplot(data, diag_kind='kde')
# The grid fills the whole figure, so a title at the default height is drawn
# on top of the first row of axes; lift it just above the grid instead.
plt.suptitle('Pair Plots of Numeric Variables', y=1.02)
# bbox_inches='tight' grows the saved canvas so the raised title is not cropped.
plt.savefig(output_directory + 'pair_plots.png', bbox_inches='tight')
plt.close()

# Create a heatmap to visualize correlation between numeric variables.
# Select the numeric columns explicitly: since pandas 2.0, DataFrame.corr()
# raises on non-numeric columns instead of silently dropping them, and this
# dataset contains text columns (e.g. 'MPAA', 'Domestic Distributor').
numeric_data = data.select_dtypes(include='number')
correlation_matrix = numeric_data.corr()
plt.figure(figsize=(12, 10))
# annot=True writes each correlation coefficient inside its cell.
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.savefig(output_directory + 'correlation_heatmap.png')
plt.close()

# Histogram (20 bins, with a KDE overlay) of the 'IMDB Domestic Revenue' column,
# drawn on an explicit Figure/Axes pair and written to the output directory.
revenue_fig, revenue_ax = plt.subplots(figsize=(8, 6))
sns.histplot(data['IMDB Domestic Revenue'], bins=20, kde=True, ax=revenue_ax)
revenue_ax.set_title('Distribution of IMDB Domestic Revenue')
revenue_ax.set_xlabel('IMDB Domestic Revenue')
revenue_fig.savefig(output_directory + 'revenue_distribution.png')
# Release the figure so it does not stay open between plots.
plt.close(revenue_fig)

# Create boxplots of 'IMDB Domestic Revenue' for each categorical variable.
# One figure is produced and saved per column, named '<column>_boxplot.png'.
categorical_columns = ['Genres', 'MPAA', 'Domestic Distributor', 'Genre Name']
for column in categorical_columns:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=column, y='IMDB Domestic Revenue', data=data)
    plt.title(f'Boxplot of IMDB Domestic Revenue by {column}')
    # Right-align the rotated labels so each one ends under its own tick
    # rather than being centred on it and drifting under the neighbour.
    plt.xticks(rotation=45, ha='right')
    # Rotated category labels extend past the default canvas; a tight
    # bounding box keeps them from being cut off in the saved image.
    plt.savefig(output_directory + f'{column}_boxplot.png', bbox_inches='tight')
    plt.close()

# Create a countplot for Release Year, with bars in ascending year order.
# Drop missing years before sorting: NaN does not compare with numbers, so
# sorted() over values that include NaN yields an unreliable bar order.
release_years = sorted(data['Release Year'].dropna().unique())
plt.figure(figsize=(10, 6))
sns.countplot(x='Release Year', data=data, order=release_years)
plt.title('Count of Movies by Release Year')
plt.xticks(rotation=45)
plt.savefig(output_directory + 'release_year_countplot.png')
plt.close()

# You can add more plots based on your specific analysis needs

# Report where the plots were written (each plot was saved as it was created)
print("Plots saved in:", output_directory)


Plots saved in: eda_plots/
