In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def process_dataset(file_path, output_path="processed_data.csv"):
    """Load a CSV, clean it, one-hot encode it, standardise it, and save it.

    The pipeline runs these steps in order:

    1. Read ``file_path`` with ``pd.read_csv``.
    2. Print ``DataFrame.info()`` for the raw frame.
    3. Drop columns that are entirely NaN, then fill the remaining numeric
       NaNs with the column mean.
    4. Drop duplicate rows.
    5. One-hot encode with ``pd.get_dummies(drop_first=True)``.
    6. Standardise the originally-numeric columns with ``StandardScaler``.
    7. Print ``DataFrame.describe()``.
    8. Write the result to ``output_path`` without the index.

    Parameters
    ----------
    file_path : str or path-like
        Location of the input CSV.
    output_path : str or path-like, default "processed_data.csv"
        Where the processed CSV is written. The default is the file name
        that used to be hard-coded, so existing callers are unaffected.

    Returns
    -------
    pandas.DataFrame or None
        The processed frame, or ``None`` when the file could not be read
        (the error is printed rather than raised).
    """
    # Step 1: Load the dataset
    try:
        df = pd.read_csv(file_path)
    except Exception as e:
        # Best-effort contract kept from the original: report and bail out.
        print(f"Error loading dataset: {e}")
        return None
    print("Dataset loaded successfully.")

    # Step 2: Show basic information about the dataset
    print("Initial dataset information:")
    # info() writes to stdout itself and returns None, so wrapping it in
    # print() would emit a stray "None" line.
    df.info()

    # Step 3: Handle missing values
    print("\nHandling missing values...")
    # All-NaN columns must go *before* the numeric column list is taken:
    # their mean is NaN, so mean-filling cannot rescue them, and dropping
    # them afterwards would leave stale names in numeric_cols that make the
    # scaling step fail with a KeyError.
    df = df.dropna(axis=1, how='all')
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        # A Series passed to fillna is matched on column labels, so only the
        # numeric columns are touched.
        df = df.fillna(df[numeric_cols].mean())
    print("Missing values handled.")

    # Step 4: Remove duplicates
    print("\nRemoving duplicates...")
    df = df.drop_duplicates()
    print(f"Data shape after removing duplicates: {df.shape}")

    # Step 5: Encoding categorical variables (if needed)
    print("\nEncoding categorical variables...")
    # get_dummies expands every object/category column into one indicator
    # column per distinct value (minus the first).
    # NOTE(review): a text column with a distinct value on almost every row
    # therefore produces roughly one new column per row; consider dropping
    # identifier-like columns before calling this function.
    df = pd.get_dummies(df, drop_first=True)
    print("Categorical variables encoded.")

    # Step 6: Data Scaling (Optional)
    print("\nScaling numeric data...")
    if len(numeric_cols) > 0 and len(df) > 0:
        # Imported lazily so inputs with nothing to scale do not need
        # scikit-learn at all.
        from sklearn.preprocessing import StandardScaler

        scaler = StandardScaler()
        df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
        print("Numeric data scaled.")
    else:
        # StandardScaler rejects input with zero features or zero samples.
        print("No numeric data to scale.")

    # Step 7: Generate summary statistics
    print("\nSummary statistics:")
    print(df.describe())

    # Step 8: Save processed dataset
    df.to_csv(output_path, index=False)
    print(f"Processed dataset saved as {output_path}")

    return df

# Example usage: CSV passed to process_dataset. The path is relative, so it is
# resolved against the notebook's working directory; replace it with the path
# to your own dataset.
file_path = "online_courses_uses.csv"
# The call itself is made in the next cell, so re-running this cell only
# redefines the function and the path without re-processing the data.


In [31]:
# Keep the returned frame so later cells can reuse it without re-running the
# whole pipeline (which also rewrites the output CSV). The bare name on the
# last line still renders the frame as the cell output.
processed_df = process_dataset(file_path)
processed_df

Dataset loaded successfully.
Initial dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Course_ID            10000 non-null  int64  
 1   Course_Name          10000 non-null  object 
 2   Category             10000 non-null  object 
 3   Duration (hours)     10000 non-null  int64  
 4   Enrolled_Students    10000 non-null  int64  
 5   Completion_Rate (%)  10000 non-null  float64
 6   Platform             10000 non-null  object 
 7   Price ($)            10000 non-null  float64
 8   Rating (out of 5)    10000 non-null  float64
dtypes: float64(3), int64(3), object(3)
memory usage: 703.3+ KB
None

Handling missing values...
Missing values handled.

Removing duplicates...
Data shape after removing duplicates: (10000, 9)

Encoding categorical variables...
Categorical variables encoded.

Scaling numeric data...


Unnamed: 0,Course_ID,Duration (hours),Enrolled_Students,Completion_Rate (%),Price ($),Rating (out of 5),Course_Name_Course_10,Course_Name_Course_100,Course_Name_Course_1000,Course_Name_Course_10000,...,Category_Data Science,Category_Design,Category_Finance,Category_Marketing,Category_Office Tools,Category_Programming,Category_Technology,Platform_LinkedIn Learning,Platform_Udemy,Platform_edX
0,-1.731878,-1.303309,1.184451,-1.692290,-1.226796,1.419871,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
1,-1.731531,0.070845,1.199201,0.492380,0.984786,-0.286416,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
2,-1.731185,-0.120009,0.118945,-1.340858,0.310582,1.490593,False,False,False,False,...,False,False,False,False,False,False,True,True,False,False
3,-1.730838,0.528897,1.248367,-1.137855,0.188471,-0.139750,False,False,False,False,...,False,False,False,False,True,False,False,True,False,False
4,-1.730492,-0.463548,1.588318,-0.865861,-0.184120,1.612264,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1.730492,1.215974,-1.438927,-1.363094,0.899743,0.280630,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
9996,1.730838,1.559512,-0.352349,1.242877,0.810032,0.075765,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
9997,1.731185,-1.150625,-1.681949,1.048292,-0.095035,-0.678201,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
9998,1.731531,-0.349035,0.510872,-0.096780,-0.095250,-0.292581,False,False,False,False,...,False,False,False,False,False,True,False,False,False,True
