In [1]:
import pandas as pd
import numpy as np
import random

# Dataset dimensions: how many student / enrollment rows to generate and how
# many entries of the course catalogue to expose.
n_students = 2000
n_courses = 20

# --- Names (nominal data) ---------------------------------------------------
# Pool of given names to draw from.
first_names = [
    'John', 'Jane', 'Michael', 'Emily',
    'Chris', 'Katie', 'David', 'Sarah',
    'James', 'Jessica', 'Chijioke', 'Ngozi',
    'Ifeoma', 'Emeka', 'Chinonso', 'Paschal',
    'Tunde', 'Adeola', 'Kwame', 'Amina',
    'Fatima', 'Laila', 'Sofia', 'Carlos',
    'Ahmed', 'Priya', 'Chen', 'Liu',
    'Omar', 'Maria', 'Ali', 'Fatoumata',
]

# Pool of family names to draw from.
last_names = [
    'Smith', 'Johnson', 'Williams', 'Brown',
    'Jones', 'Garcia', 'Miller', 'Davis',
    'Rodriguez', 'Martinez', 'Okafor', 'Nwosu',
    'Adams', 'Okwuosa', 'Chukwu', 'Adetokunbo',
    'Nguyen', 'Chen', 'Kim', 'Patel',
    'Uzoegwu', 'Ali', 'Santos', 'Kumar',
    'Osei', 'Cheng', 'Silva', 'Müller',
]

# One "<first> <last>" string per student; the first name is drawn before the
# last name on every iteration.
name_data = [
    ' '.join((random.choice(first_names), random.choice(last_names)))
    for _ in range(n_students)
]

# --- Domicile and region (nominal data) -------------------------------------
countries = [
    'UK',
    'India',
    'China',
    'Nigeria',
    'United States',
    'Germany',
    'Brazil',
    'Japan',
    'Canada',
    'Australia',
]
domicile_data = np.random.choice(countries, size=n_students)

# Regions that may be assigned to a student, keyed by country of domicile.
regions = {
    'UK': ['London', 'Yorkshire', 'Edinburgh'],
    'India': ['Delhi', 'Maharashtra', 'Tamil Nadu'],
    'China': ['Beijing', 'Shanghai'],
    'Nigeria': ['Lagos', 'Abuja'],
    'United States': ['California', 'New York'],
    'Germany': ['Berlin', 'Bavaria'],
    'Brazil': ['São Paulo', 'Rio de Janeiro'],
    'Japan': ['Tokyo', 'Osaka'],
    'Canada': ['Ontario', 'Quebec'],
    'Australia': ['New South Wales', 'Victoria'],
}

# Pick a region consistent with each student's country; a country missing from
# the mapping falls back to the single option 'Unknown'.
region_data = [
    random.choice(regions[country] if country in regions else ['Unknown'])
    for country in domicile_data
]

# --- Courses (categorical labels with no inherent ranking) ------------------
courses = [
    'Business Administration', 'Marketing',
    'Finance', 'Economics',
    'Electrical Engineering', 'Mechanical Engineering',
    'Data Science', 'Artificial Intelligence',
    'Cybersecurity', 'Computer Science',
    'Public Health', 'Nursing',
    'Psychology', 'Law',
    'History', 'Sociology',
    'Political Science', 'International Relations',
    'Philosophy', 'Graphic Design',
]

# One course name per row. The tables built further down draw Course_ID values
# separately and do not read this array.
course_data = np.random.choice(courses, size=n_students)

# --- Numeric data: GPA and age ----------------------------------------------
# GPA: uniform draw between 2.0 and 4.0, rounded to two decimals.
gpa_data = np.random.uniform(low=2.0, high=4.0, size=n_students).round(2)
# Age: integers from 22 to 44 (randint excludes the upper bound).
age_data = np.random.randint(low=22, high=45, size=n_students)

# --- Gender and admission status (nominal data) -----------------------------
gender_data = np.random.choice(['Male', 'Female'], size=n_students)
admission_status = np.random.choice(['Conditional', 'Unconditional'], size=n_students)

# --- Binary admission flag: 1 with probability 0.75, 0 with probability 0.25 -
admitted_data = np.random.choice([1, 0], size=n_students, p=[0.75, 0.25])

# --- Inject missing GPA values ----------------------------------------------
# Blank out a fixed number of distinct positions (sampled without replacement).
n_missing_gpa = 50
missing_indices = np.random.choice(len(gpa_data), size=n_missing_gpa, replace=False)
gpa_data[missing_indices] = np.nan

# --- Assemble the three tables ----------------------------------------------
# Students: one row per student, IDs run consecutively from 1001.
students_df = pd.DataFrame(dict(
    Student_ID=np.arange(n_students) + 1001,
    Name=name_data,
    Domicile=domicile_data,
    Region=region_data,
    Gender=gender_data,
    Age=age_data,
    Admission_Status=admission_status,
))

# Courses: IDs start at 1; the level alternates, starting with 'Undergraduate'
# for the first course.
courses_df = pd.DataFrame(dict(
    Course_ID=np.arange(n_courses) + 1,
    Course_Name=courses[:n_courses],
    Course_Level=[('Undergraduate', 'Postgraduate')[i % 2] for i in range(n_courses)],
))

# Enrollments: student and course IDs are drawn independently and with
# replacement, so a student can appear in several rows or in none.
enrollments_df = pd.DataFrame(dict(
    Enrollment_ID=np.arange(n_students) + 1,
    Student_ID=np.random.choice(students_df['Student_ID'], size=n_students),
    Course_ID=np.random.choice(courses_df['Course_ID'], size=n_students),
    GPA=gpa_data,
    Admitted=admitted_data,
))

# --- Persist to CSV (row index omitted) -------------------------------------
students_df.to_csv(path_or_buf='students.csv', index=False)
courses_df.to_csv(path_or_buf='courses.csv', index=False)
enrollments_df.to_csv(path_or_buf='enrollments.csv', index=False)

print(" successfully Done: 'students.csv', 'courses.csv', 'enrollments.csv'")


 successfully Done: 'students.csv', 'courses.csv', 'enrollments.csv'
