In [None]:
# Cell 1: Import necessary libraries
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# NOTE(review): this silences *every* warning for the rest of the notebook,
# including pandas deprecation notices; consider narrowing it to a category.
warnings.filterwarnings('ignore')

# Location of the course CSV. The original absolute path is kept as the default
# so existing setups keep working; set the UDEMY_COURSES_CSV environment
# variable to run the notebook on another machine without editing this cell.
DATA_PATH = Path(
    os.environ.get(
        "UDEMY_COURSES_CSV",
        r"D:\Project 4\udemy_output_All_Finance__Accounting_p1_p626.csv",
    )
)

df = pd.read_csv(DATA_PATH)
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Dimensions of the raw frame as a (rows, columns) tuple.
df.shape

In [None]:
# Per-column dtype and non-null count, plus the frame's memory usage.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); by default pandas
# reports these for the numeric columns.
df.describe()

In [None]:
# Data cleaning & preprocessing: impute missing values on a copy of the raw frame.
print("Missing Values Before Cleaning:")
print(df.isnull().sum())

# Work on a copy so the raw `df` stays untouched and this cell can be re-run
# from a fresh kernel with the same result.
df_clean = df.copy()

# Numerical columns: fill gaps with the column median.
numerical_cols = ['num_subscribers', 'avg_rating', 'num_reviews', 'num_lectures', 'content_length_min']
for col in numerical_cols:
    if col in df_clean.columns:
        # Assign the filled Series back rather than calling
        # `df_clean[col].fillna(..., inplace=True)`: that form mutates a temporary
        # Series (chained assignment), which triggers a FutureWarning in pandas 2.x
        # and leaves `df_clean` unchanged once Copy-on-Write is active.
        df_clean[col] = df_clean[col].fillna(df_clean[col].median())

# Categorical columns: fill gaps with the most frequent value, falling back to
# 'Unknown' when the column has no mode (e.g. it is entirely missing).
categorical_cols = ['title', 'level', 'subject']
for col in categorical_cols:
    if col in df_clean.columns:
        modes = df_clean[col].mode()  # computed once; may hold several ties
        fill_value = modes.iloc[0] if not modes.empty else 'Unknown'
        df_clean[col] = df_clean[col].fillna(fill_value)

print("\nMissing Values After Cleaning:")
print(df_clean.isnull().sum())

print(f"\nDataset cleaned: {df_clean.shape[0]} rows, {df_clean.shape[1]} columns")

In [None]:
# Plot 1: Distribution of subscribers
# Histogram of `num_subscribers` from the cleaned frame, drawn through the
# explicit Axes API so title and labels are bound to this figure rather than to
# pyplot's implicit "current axes".
fig, ax = plt.subplots(figsize=(10, 4))
ax.hist(df_clean['num_subscribers'], bins=30, edgecolor='black', alpha=0.7)
ax.set_title('Distribution of Number of Subscribers')
ax.set_xlabel('Number of Subscribers')
ax.set_ylabel('Frequency')

# Finish the cell the same way the correlation-heatmap cell does; otherwise the
# last expression is a Text object whose repr is echoed next to the chart.
fig.tight_layout()
plt.show()


In [None]:
# Plot 2: Distribution of ratings
# Histogram of `avg_rating` from the cleaned frame, drawn through the explicit
# Axes API so title and labels are bound to this figure rather than to pyplot's
# implicit "current axes".
fig, ax = plt.subplots(figsize=(10, 4))
ax.hist(df_clean['avg_rating'], bins=20, edgecolor='black', alpha=0.7, color='orange')
ax.set_title('Distribution of Average Ratings')
ax.set_xlabel('Average Rating')
ax.set_ylabel('Frequency')

# Finish the cell the same way the correlation-heatmap cell does; otherwise the
# last expression is a Text object whose repr is echoed next to the chart.
fig.tight_layout()
plt.show()

In [None]:
# Plot 4: Price distribution
# `price_detail__amount` is not among the columns imputed in the cleaning cell,
# so it may still contain NaN. Drop those explicitly before binning instead of
# relying on the plotting library's NaN handling.
prices = df_clean['price_detail__amount'].dropna()

fig, ax = plt.subplots(figsize=(10, 4))
ax.hist(prices, bins=30, edgecolor='black', alpha=0.7, color='red')
ax.set_title('Distribution of Course Prices')
ax.set_xlabel('price_detail__amount')
ax.set_ylabel('Frequency')

# Finish the cell the same way the correlation-heatmap cell does; otherwise the
# last expression is a Text object whose repr is echoed next to the chart.
fig.tight_layout()
plt.show()

In [None]:
# Correlation heatmap over every numeric column of the cleaned frame.
# Pairwise correlations are computed first, then rendered on an explicit Axes.
numeric_df = df_clean.select_dtypes(include=[np.number])
correlation_matrix = numeric_df.corr()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,        # write each coefficient inside its cell
    cmap='coolwarm',
    center=0,          # centre the diverging colormap on zero correlation
    ax=ax,
)
ax.set_title('Correlation Heatmap')

fig.tight_layout()
plt.show()