Script that generates a CSV file of random sample data for the LMS dashboard project.

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

def generate_lms_data(num_users=1000, seed=42, output_path='lms_user_data.csv'):
    """Generate synthetic LMS feature-usage data and optionally save it as CSV.

    Each user (ids starting at 1001) gets between 5 and 15 interaction rows.
    Every row holds the user id/name, a randomly chosen batch, a random date
    in March 2024, and for each of the 15 LMS features: a CTR in [0, 1], a
    duration in minutes in [1, 120], a conversion flag, the complementary
    drop-off flag, and an NPS value that is filled in roughly 10% of the time.

    Args:
        num_users: Number of distinct users to generate (default 1000).
        seed: Seed for the random generator. The same seed always yields the
            same data; pass ``None`` for a different data set on every call.
        output_path: Where to write the CSV (without the index). Pass ``None``
            to skip writing a file.

    Returns:
        pandas.DataFrame sorted by ``user_id`` and then ``date``. The index is
        not reset, so it keeps the pre-sort row positions.

    Raises:
        ValueError: If ``num_users`` is negative.
    """
    if num_users < 0:
        raise ValueError("num_users must be non-negative")

    # Every value below is drawn from the standard-library ``random`` module,
    # so that is the generator that must be seeded. Seeding only NumPy (as this
    # function used to) left the output different on every run. A private
    # Random instance also avoids disturbing the caller's global random state.
    rng = random.Random(seed)
    # Retained so callers that relied on this function seeding NumPy still work.
    np.random.seed(seed)

    # Generate base user data
    user_ids = range(1001, 1001 + num_users)
    batch_numbers = [f"B{num:03d}" for num in range(1, 21)]  # 20 batches

    # Feature names; their lower-cased form is used in the column names
    feature_names = [
        'Notification', 'LiveClass', 'Classroom', 'Curriculum',
        'ShortCourses', 'Masterclass', 'BookMentor', 'DoubtSession',
        'Assignments', 'Resources', 'Recordings', 'CodingWindow',
        'QuestionBank', 'HelpTicket', 'ReferFriend'
    ]
    feature_keys = [feature.lower() for feature in feature_names]

    # Explicit column list so that an empty result (num_users=0) still has
    # the columns needed by the sort below.
    columns = ['user_id', 'user_name', 'batch_no', 'date']
    for key in feature_keys:
        columns.extend([
            f'ctr_{key}',
            f'{key}_duration',
            f'{key}_conversion',
            f'{key}_dropoff',
            f'{key}_nps',
        ])

    # Start date and end date (one month period, both ends inclusive)
    start_date = datetime(2024, 3, 1)
    end_date = datetime(2024, 3, 31)
    span_days = (end_date - start_date).days

    all_data = []
    for user_id in user_ids:
        # Generate 5-15 interactions per user
        num_interactions = rng.randint(5, 15)

        for _ in range(num_interactions):
            # Basic user info
            batch_no = rng.choice(batch_numbers)
            day = start_date + timedelta(days=rng.randint(0, span_days))
            row = {
                'user_id': user_id,
                'user_name': f"User_{user_id}",
                'batch_no': batch_no,
                'date': day.strftime('%Y-%m-%d'),
            }

            # Generate feature usage data
            for key in feature_keys:
                # CTR (Click-Through Rate) - between 0 and 1
                row[f'ctr_{key}'] = round(rng.uniform(0, 1), 3)

                # Session duration (in minutes) - between 1 and 120 minutes
                row[f'{key}_duration'] = round(rng.uniform(1, 120), 1)

                # Conversion and drop-off (mutually exclusive)
                conversion = rng.choice([0, 1])
                row[f'{key}_conversion'] = conversion
                row[f'{key}_dropoff'] = 1 - conversion

                # NPS (mostly empty, occasionally filled): 10% chance of a value
                row[f'{key}_nps'] = rng.randint(0, 100) if rng.random() < 0.1 else None

            all_data.append(row)

    # Create DataFrame
    df = pd.DataFrame(all_data, columns=columns)

    # Sort by user_id and date (ISO date strings sort chronologically)
    df = df.sort_values(['user_id', 'date'])

    # Save to CSV unless the caller opted out
    if output_path is not None:
        df.to_csv(output_path, index=False)

    return df

# Generate the data (also writes lms_user_data.csv to the working directory)
df = generate_lms_data()

# Show the column summary and the first few rows.
# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the summary.
df.info()
print("\nFirst few rows:")
print(df.head())

<class 'pandas.core.frame.DataFrame'>
Index: 10047 entries, 2 to 10042
Data columns (total 79 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   user_id                  10047 non-null  int64  
 1   user_name                10047 non-null  object 
 2   batch_no                 10047 non-null  object 
 3   date                     10047 non-null  object 
 4   ctr_notification         10047 non-null  float64
 5   notification_duration    10047 non-null  float64
 6   notification_conversion  10047 non-null  int64  
 7   notification_dropoff     10047 non-null  int64  
 8   notification_nps         1007 non-null   float64
 9   ctr_liveclass            10047 non-null  float64
 10  liveclass_duration       10047 non-null  float64
 11  liveclass_conversion     10047 non-null  int64  
 12  liveclass_dropoff        10047 non-null  int64  
 13  liveclass_nps            996 non-null    float64
 14  ctr_classroom            10