In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the two source tables: alumni grades and alumni professions
GRADES_CSV = 'dataset/Sample Nilai Alumni Prodi Informatika_Ver1.csv'
PROFESSIONS_CSV = 'dataset/Sample Profesi Pekerjaan Alumni Prodi Informatika Universitas Gunadarma_Ver1.csv'

data_nilai = pd.read_csv(GRADES_CSV)
data_profesi = pd.read_csv(PROFESSIONS_CSV)

In [None]:
# Preview the first ten rows of the grades table
data_nilai.head(10)

In [None]:
# Preview the first ten rows of the professions table
data_profesi.head(10)

In [5]:
# Grades table: discard the 'KDMK' and 'JENIS' columns, which are not used downstream
nilai_alumni_cleaned = data_nilai.drop(['KDMK', 'JENIS'], axis=1)

# Professions table: discard the 'Nama Lengkap' column
data_profesi_cleaned = data_profesi.drop('Nama Lengkap', axis=1)

In [6]:
# Join professions and grades on the shared 'NPM' key (pd.merge defaults to an inner join).
# No one-hot encoding happens here; the subject names are pivoted into columns in a later cell.
merged_data = pd.merge(data_profesi_cleaned, nilai_alumni_cleaned, on='NPM')

In [None]:
# Preview the first ten rows of the merged table
merged_data.head(10)

In [None]:
# Count how many rows carry each distinct value of the 'NILAI' (grade) column
grade_counts = merged_data['NILAI'].value_counts()
grade_counts


In [9]:
# Copy the subject names ('NAMA MK') into a new 'INDEX' column (no dtype change is made);
# 'INDEX' is used as the column key of the pivot in the next cell
merged_data['INDEX'] = merged_data['NAMA MK']

In [None]:

# Reshape the long table (one row per student/subject) into a wide one:
#   - rows:    one per (NPM, IPK, Klasifikasi Profesi) combination
#   - columns: one per subject name taken from 'INDEX'
#   - values:  the 'NILAI' grade; aggfunc='first' keeps the first grade when a
#              student/subject pair occurs more than once
# Subjects without a grade for a student come out as NaN and are replaced with 0.
student_keys = ['NPM', 'IPK', 'Klasifikasi Profesi']

pivoted_grades = (
    merged_data
    .pivot_table(index=student_keys, columns='INDEX', values='NILAI', aggfunc='first')
    .reset_index()
    .fillna(0)
)

pivoted_grades.head(10)

In [None]:
# Replace NaN values with 0 and bind the result to a new name.
# The previous cell already calls fillna(0) on pivoted_grades, so when the cells run in
# order this mainly produces a separate copy that the later cells modify.
pivoted_grades_filled = pivoted_grades.fillna(0)

# Show the first ten rows of the filled table
pivoted_grades_filled.head(10)

In [None]:
# Map letter grades to grade points
grade_mapping = {'A': 4, 'B': 3, 'C': 2, 'D': 1}

# Everything except the identifier/target columns is a subject-grade column.
# The target column is the single column 'Klasifikasi Profesi', so it has to be
# excluded under that exact name; listing 'Klasifikasi' and 'Profesi' separately
# matched nothing and let the mapping run over the target labels as well.
non_grade_columns = ['NPM', 'IPK', 'Klasifikasi Profesi']
grade_columns = pivoted_grades_filled.columns.difference(non_grade_columns)

# Values that are not keys of grade_mapping (including the 0 placeholder for
# "subject not taken") are left unchanged by replace()
pivoted_grades_filled[grade_columns] = pivoted_grades_filled[grade_columns].replace(grade_mapping)

# Display the updated dataframe
pivoted_grades_filled.head(10)

In [None]:
# Draw one student id at random (no seed is set, so each run picks a new one)
random_npm = pivoted_grades_filled['NPM'].sample().iloc[0]
print(f"Randomly selected NPM: {random_npm}")

# Select every row that belongs to that student and display it
is_selected = pivoted_grades_filled['NPM'] == random_npm
random_student_data = pivoted_grades_filled.loc[is_selected]
random_student_data

In [None]:
# Missing grades were replaced with 0 earlier in the notebook, so isnull() is
# False everywhere and the heatmap came out blank. Flag the 0 placeholders in
# the subject columns instead (True = no grade recorded for that subject).
subject_grades = pivoted_grades_filled.drop(
    columns=['NPM', 'IPK', 'Klasifikasi Profesi'], errors='ignore'
)
mask = subject_grades.eq(0)

# Set up the matplotlib figure
plt.figure(figsize=(20, 10))

# Dark cells mark missing (0) grades; one row per student, one column per subject
sns.heatmap(mask, cmap='binary', cbar=False, yticklabels=False)

plt.title('Missing Values Heatmap')
plt.show()

## Data Exploration

In [None]:
# Step 1: Summary statistics (describe() reports on the numeric columns by default)
basic_stats = pivoted_grades_filled.describe()

# Step 2: Number of rows per profession class in 'Klasifikasi Profesi'.
# These are the labels present in the dataset, not model predictions.
profession_distribution = pivoted_grades_filled['Klasifikasi Profesi'].value_counts()

basic_stats

In [None]:
# Display the per-class row counts computed in the previous cell
profession_distribution

In [None]:
# Count missing values per column
missing_values = pivoted_grades_filled.isnull().sum()

# Class counts of the target column 'Klasifikasi Profesi'
# (the same computation as profession_distribution in the earlier cell)
profesi_distribution = pivoted_grades_filled['Klasifikasi Profesi'].value_counts()

# Display both results together as a tuple
missing_values, profesi_distribution


## Modelling

In [None]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Encode the target variable 'Klasifikasi Profesi' as integer class ids
label_encoder = LabelEncoder()
pivoted_grades_filled['Klasifikasi Profesi Encoded'] = label_encoder.fit_transform(
    pivoted_grades_filled['Klasifikasi Profesi']
)

# Feature columns are the subject grades: everything except the student id, IPK,
# the raw target and its encoded copy. Selecting by name instead of by position
# (columns[3:-1]) keeps the selection correct if the column order ever changes.
non_feature_cols = ['NPM', 'IPK', 'Klasifikasi Profesi', 'Klasifikasi Profesi Encoded']
numeric_cols = pivoted_grades_filled.columns.drop(non_feature_cols)

# Scale every feature into [-1, 1]. The generators defined further down end in a
# tanh layer, whose output is confined to that interval; z-score standardisation
# produced values outside it that the generator could never reproduce.
scaler = MinMaxScaler(feature_range=(-1, 1))
pivoted_grades_filled[numeric_cols] = scaler.fit_transform(pivoted_grades_filled[numeric_cols])

# Separate the features (X) and target (y)
X = pivoted_grades_filled[numeric_cols]
y = pivoted_grades_filled['Klasifikasi Profesi Encoded']

# Display the first few rows after scaling and encoding
pivoted_grades_filled.head()


In [None]:
import tensorflow as tf
from tensorflow.keras import layers

# Length of the random noise vector fed to the generator
latent_dim = 100
# Width of one data row: generator output size and discriminator input size
input_dim = X.shape[1]  # Number of features in the dataset

# Generator: latent noise -> synthetic feature row
def build_generator():
    """Return an MLP mapping a `latent_dim` noise vector to `input_dim` tanh outputs."""
    return tf.keras.Sequential([
        layers.Dense(128, input_dim=latent_dim, activation='relu'),
        layers.Dense(256, activation='relu'),
        layers.Dense(input_dim, activation='tanh'),
    ])

# Discriminator (critic): feature row -> single unbounded score
def build_discriminator():
    """Return an MLP mapping an `input_dim` row to one linear (no activation) output."""
    return tf.keras.Sequential([
        layers.Dense(256, input_dim=input_dim, activation='relu'),
        layers.Dense(128, activation='relu'),
        layers.Dense(1),
    ])

# Instantiate both networks (a later cell in this notebook builds them again before training)
generator = build_generator()
discriminator = build_discriminator()

# Wasserstein loss: mean of label * critic score
def wasserstein_loss(y_true, y_pred):
    """Return the batch mean of `y_true * y_pred` (labels are +1 / -1)."""
    signed_scores = y_true * y_pred
    return tf.reduce_mean(signed_scores)

# Compile the discriminator on its own with RMSprop (learning rate 5e-5) and the Wasserstein loss
discriminator.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.00005), loss=wasserstein_loss)

# Stack generator -> discriminator into one model that maps noise to a critic score.
# The discriminator is flagged non-trainable before the stacked model is compiled, so the
# intent is that gan_model only updates the generator.
# NOTE(review): whether the already-compiled discriminator keeps training after the flag
# flips depends on the Keras version - confirm.
discriminator.trainable = False
gan_input = layers.Input(shape=(latent_dim,))
generated_data = generator(gan_input)
gan_output = discriminator(generated_data)
gan_model = tf.keras.Model(gan_input, gan_output)
gan_model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.00005), loss=wasserstein_loss)

# Print the layer/parameter summary of the stacked model
gan_model.summary()


In [None]:
import tensorflow as tf
# tf.test.is_gpu_available() is deprecated; query the physical device list instead.
# Evaluates to True when at least one GPU is visible to TensorFlow.
len(tf.config.list_physical_devices('GPU')) > 0

In [22]:
# The argument is a device *type* ('GPU'); '/GPU:0' is a device string, matches no
# device type and always produced an empty list, so memory growth was never enabled.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Let TensorFlow allocate GPU memory on demand instead of reserving it all up front
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Memory growth must be set before the GPUs are initialised; report and continue
        print(e)


In [None]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np

# Length of the random noise vector fed to the generator
latent_dim = 100
# Width of one data row: generator output size and discriminator input size
input_dim = X.shape[1]  # Number of features in the dataset

# Generator: latent noise -> synthetic feature row
def build_generator():
    """Return an MLP mapping a `latent_dim` noise vector to `input_dim` tanh outputs."""
    generator_layers = [
        layers.Dense(128, input_dim=latent_dim, activation='relu'),
        layers.Dense(256, activation='relu'),
        layers.Dense(input_dim, activation='tanh'),
    ]
    return tf.keras.Sequential(generator_layers)

# Discriminator (critic): feature row -> single unbounded score
def build_discriminator():
    """Return an MLP mapping an `input_dim` row to one linear (no activation) output."""
    critic_layers = [
        layers.Dense(256, input_dim=input_dim, activation='relu'),
        layers.Dense(128, activation='relu'),
        layers.Dense(1),
    ]
    return tf.keras.Sequential(critic_layers)

# Instantiate fresh, untrained networks; these replace any models of the same name built earlier
generator = build_generator()
discriminator = build_discriminator()

# Wasserstein loss: mean of label * critic score
def wasserstein_loss(y_true, y_pred):
    """Return the batch mean of `y_true * y_pred` (labels are +1 / -1)."""
    signed_scores = y_true * y_pred
    return tf.reduce_mean(signed_scores)

# Compile the discriminator on its own with RMSprop (learning rate 5e-5) and the Wasserstein loss
discriminator.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.00005), loss=wasserstein_loss)

# Stack generator -> discriminator into one model that maps noise to a critic score.
# The discriminator is flagged non-trainable before the stacked model is compiled, so the
# intent is that gan_model only updates the generator.
# NOTE(review): whether the already-compiled discriminator keeps training after the flag
# flips depends on the Keras version - confirm.
discriminator.trainable = False
gan_input = layers.Input(shape=(latent_dim,))
generated_data = generator(gan_input)
gan_output = discriminator(generated_data)
gan_model = tf.keras.Model(gan_input, gan_output)
gan_model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.00005), loss=wasserstein_loss)

# WGAN training loop with concise output and no progress bars
def train_gan(epochs, batch_size, n_critic=5, clip_value=0.01):
    """Train the weight-clipped WGAN assembled in this cell.

    Operates on the notebook-level ``generator``, ``discriminator`` (critic),
    ``gan_model``, feature frame ``X`` and ``latent_dim``.

    Args:
        epochs: Number of generator updates to run.
        batch_size: Number of real rows and of generated rows per critic batch.
        n_critic: Critic updates performed before each generator update.
        clip_value: After every critic update all critic weights are clipped to
            [-clip_value, clip_value]. The Wasserstein loss is only meaningful
            for a Lipschitz-bounded critic; without a constraint the critic
            scores (and both losses) can grow without limit.

    Returns:
        dict with two lists of per-epoch floats: ``'critic_loss'`` (mean over
        the ``n_critic`` updates) and ``'generator_loss'``.
    """
    history = {'critic_loss': [], 'generator_loss': []}
    # sample() without replacement raises if the frame has fewer rows than the batch
    needs_replacement = len(X) < batch_size
    # Critic targets: +1 for generated rows, -1 for real rows (matches the concatenation order)
    critic_labels = np.concatenate([np.ones((batch_size, 1)), -np.ones((batch_size, 1))])
    # Generator target: make the critic score generated rows like real ones
    misleading_targets = -np.ones((batch_size, 1))

    with tf.device('/device:GPU:0'):  # Ensures the operations are on the GPU
        for epoch in range(epochs):
            d_losses = []

            # Update the critic (discriminator)
            for _ in range(n_critic):
                random_latent_vectors = np.random.normal(size=(batch_size, latent_dim))
                # Call the model directly: predict() is meant for large inputs and
                # adds substantial per-call overhead inside a tight training loop
                generated_samples = generator(random_latent_vectors, training=False).numpy()
                real_samples = X.sample(batch_size, replace=needs_replacement).values
                combined_samples = np.concatenate([generated_samples, real_samples])
                d_losses.append(discriminator.train_on_batch(combined_samples, critic_labels))

                # Enforce the Lipschitz constraint by clipping every critic weight
                for layer in discriminator.layers:
                    layer.set_weights(
                        [np.clip(w, -clip_value, clip_value) for w in layer.get_weights()]
                    )

            # Update the generator (adversarial network)
            random_latent_vectors = np.random.normal(size=(batch_size, latent_dim))
            a_loss = gan_model.train_on_batch(random_latent_vectors, misleading_targets)

            critic_loss = float(np.mean(d_losses))
            adversarial_loss = float(np.mean(a_loss))
            history['critic_loss'].append(critic_loss)
            history['generator_loss'].append(adversarial_loss)

            # Print concise output for the current epoch
            if epoch % 100 == 0 or epoch == epochs - 1:
                print(f"Epoch {epoch}/{epochs} - Discriminator Loss: {critic_loss:.4f} - Adversarial Loss: {adversarial_loss:.4f}")

    return history

# Train the GAN; the returned loss curves are kept so they can be plotted later
gan_history = train_gan(epochs=10000, batch_size=32)


## Model Evaluation

In [None]:
# Draw one batch of synthetic rows and an equally sized batch of real rows
latent_vectors = np.random.normal(size=(32, latent_dim))
generated_samples = generator.predict(latent_vectors)
real_samples = X.sample(32).values

# Per-feature mean and standard deviation of each batch
real_mean, real_std = real_samples.mean(axis=0), real_samples.std(axis=0)
generated_mean, generated_std = generated_samples.mean(axis=0), generated_samples.std(axis=0)

# Average absolute gap across features (smaller = closer first/second moments)
mean_gap = np.abs(real_mean - generated_mean).mean()
std_gap = np.abs(real_std - generated_std).mean()

print("Mean difference between real and generated data:", mean_gap)
print("Std difference between real and generated data:", std_gap)


## Evaluate by Classifying Generated Data

In [None]:
# Train a simple classifier on real data
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seeded so the reported scores are reproducible, like the split above
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Held-out real rows have true labels, so a classification report is meaningful here
y_pred_real = clf.predict(X_test)
print("Performance on real data:")
print(classification_report(y_test, y_pred_real))

# Generated rows come from an unconditional generator and carry no labels, so
# scoring them against y_test compared unrelated rows. Compare the predicted
# class mix of the generated rows with the real class mix instead.
generated_samples = generator.predict(np.random.normal(size=(X_test.shape[0], latent_dim)))
# Keep the training column names so the classifier sees the same feature layout
generated_frame = pd.DataFrame(generated_samples, columns=X.columns)
y_pred_generated = clf.predict(generated_frame)

class_share = pd.DataFrame({
    'real_test': pd.Series(y_test).value_counts(normalize=True),
    'generated': pd.Series(y_pred_generated).value_counts(normalize=True),
}).fillna(0.0).sort_index()
print("Predicted class share on generated data vs. real test data:")
print(class_share)


In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import seaborn as sns
import numpy as np

# Loss-curve plot; expects one loss value per epoch for each network
def plot_loss(generator_loss, discriminator_loss):
    """Draw the generator and discriminator loss sequences on one figure and show it."""
    curves = (
        (generator_loss, 'Generator Loss'),
        (discriminator_loss, 'Discriminator Loss'),
    )
    plt.figure(figsize=(10, 5))
    for values, curve_label in curves:
        plt.plot(values, label=curve_label)
    plt.title('Generator and Discriminator Loss During Training')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

# 1. Plot loss curves
# NOTE(review): both arrays below are np.random.randn placeholders, so the figure drawn here
# is random noise and does not show real training losses. Substitute the losses recorded
# during training before interpreting this plot.
generator_loss = np.random.randn(1000)  # replace with actual generator loss values
discriminator_loss = np.random.randn(1000)  # replace with actual discriminator loss values
plot_loss(generator_loss, discriminator_loss)

# 2. Real vs Generated Data Distribution
def plot_distribution(real_data, generated_data, feature_name):
    """Overlay kernel-density estimates of one feature for real and generated data.

    Args:
        real_data: 1-D values of the feature taken from the real dataset.
        generated_data: 1-D values of the same feature taken from generated samples.
        feature_name: Label inserted into the plot title.
    """
    plt.figure(figsize=(10, 5))
    # `fill=True` replaces the deprecated `shade=True` keyword of seaborn.kdeplot
    sns.kdeplot(real_data, label='Real', fill=True)
    sns.kdeplot(generated_data, label='Generated', fill=True)
    plt.title(f'Real vs Generated Data Distribution for {feature_name}')
    plt.xlabel('Feature Value')
    plt.ylabel('Density')
    plt.legend()
    plt.show()

# Draw up to 1000 real rows and the same number of generated rows.
# sample() without replacement raises ValueError when asked for more rows than
# the frame holds, so the count is capped at the dataset size.
n_plot = min(1000, len(X))
real_samples = X.sample(n_plot).values  # real data
latent_vectors = np.random.normal(size=(n_plot, latent_dim))
generated_samples = generator.predict(latent_vectors)  # generated data

# Plot for a selected feature (here: the first feature column)
plot_distribution(real_samples[:, 0], generated_samples[:, 0], 'Feature 1')

# 3. t-SNE Visualization
def plot_tsne(real_data, generated_data):
    """Embed real and generated rows into one shared 2-D t-SNE space and scatter them.

    Both sets are embedded with a single fit. t-SNE coordinates from separate
    fits have no common frame of reference, so fitting each set on its own
    (as before) produced two unrelated clouds that cannot be compared.

    Args:
        real_data: 2-D array-like of real rows.
        generated_data: 2-D array-like of generated rows with the same width.
    """
    real_data = np.asarray(real_data)
    generated_data = np.asarray(generated_data)
    combined = np.concatenate([real_data, generated_data], axis=0)

    # t-SNE requires perplexity to be smaller than the number of samples
    perplexity = min(30, len(combined) - 1)
    # NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` -
    # confirm against the installed version
    tsne = TSNE(n_components=2, perplexity=perplexity, n_iter=300)
    embedding = tsne.fit_transform(combined)
    real_tsne = embedding[:len(real_data)]
    generated_tsne = embedding[len(real_data):]

    plt.figure(figsize=(10, 5))
    plt.scatter(real_tsne[:, 0], real_tsne[:, 1], label='Real Data', alpha=0.6)
    plt.scatter(generated_tsne[:, 0], generated_tsne[:, 1], label='Generated Data', alpha=0.6)
    plt.title('t-SNE Visualization of Real and Generated Data')
    plt.xlabel('Dimension 1')
    plt.ylabel('Dimension 2')
    plt.legend()
    plt.show()

# Apply t-SNE on (at most) the first 1000 real and generated samples
plot_tsne(real_samples[:1000], generated_samples[:1000])


## Tuning Model with WGAN-GP Implementation

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np

# Generator for the WGAN-GP run: wider MLP with batch normalisation after each hidden layer
def build_generator():
    """Return an MLP mapping a `latent_dim` noise vector to `input_dim` tanh outputs."""
    stack = [layers.Dense(256, input_dim=latent_dim, activation='relu'), layers.BatchNormalization()]
    # Remaining hidden blocks: Dense(relu) followed by BatchNormalization
    for width in (512, 1024):
        stack.append(layers.Dense(width, activation='relu'))
        stack.append(layers.BatchNormalization())
    stack.append(layers.Dense(input_dim, activation='tanh'))
    return tf.keras.Sequential(stack)

def build_discriminator():
    """Return the critic: Dense/LeakyReLU(0.2) blocks of width 1024, 512, 256, then one linear output."""
    stack = [layers.Dense(1024, input_dim=input_dim), layers.LeakyReLU(0.2)]
    for width in (512, 256):
        stack.append(layers.Dense(width))
        stack.append(layers.LeakyReLU(0.2))
    stack.append(layers.Dense(1))
    return tf.keras.Sequential(stack)

# Wasserstein loss: mean of label * critic score
def wasserstein_loss(y_true, y_pred):
    """Return the batch mean of `y_true * y_pred` (labels are +1 / -1)."""
    signed_scores = y_true * y_pred
    return tf.reduce_mean(signed_scores)

# Gradient Penalty
def gradient_penalty(real_samples, generated_samples, batch_size, critic):
    """Compute the WGAN-GP gradient penalty on random real/generated interpolates.

    Args:
        real_samples: Batch of real rows (array or tensor), shape (batch_size, features).
        generated_samples: Batch of generated rows with the same shape.
        batch_size: Number of rows in each batch; one mixing weight is drawn per row.
        critic: Callable model that scores a batch of rows.

    Returns:
        Scalar tensor: mean over the batch of (||grad critic(x_hat)||_2 - 1)^2.
    """
    # Bring both inputs to one dtype; real rows typically arrive as float64 NumPy
    # arrays while generated rows are float32
    real = tf.cast(real_samples, tf.float32)
    fake = tf.cast(generated_samples, tf.float32)

    # One mixing weight per row, broadcast across the feature axis
    alpha = tf.random.uniform([batch_size, 1], 0.0, 1.0)
    interpolated = alpha * real + (1.0 - alpha) * fake

    with tf.GradientTape() as tape:
        # `interpolated` is a plain tensor, so it must be watched explicitly
        tape.watch(interpolated)
        prediction = critic(interpolated)

    gradients = tape.gradient(prediction, interpolated)
    # The small epsilon keeps sqrt differentiable when a gradient row is exactly zero
    gradient_l2_norm = tf.sqrt(tf.reduce_sum(tf.square(gradients), axis=[1]) + 1e-12)
    return tf.reduce_mean((gradient_l2_norm - 1.0) ** 2)

# Model Training Loop (with GPU acceleration)
def train_wgan_gp(generator, discriminator, epochs, batch_size, latent_dim, X_train, lambda_gp=10, n_critic=5):
    """Train a generator/critic pair with the WGAN-GP objective.

    Args:
        generator: Keras model mapping latent vectors to feature rows.
        discriminator: Keras critic mapping feature rows to a scalar score.
        epochs: Number of generator updates.
        batch_size: Rows per real batch and per generated batch.
        latent_dim: Length of the noise vector fed to the generator.
        X_train: DataFrame of real feature rows to sample from.
        lambda_gp: Weight of the gradient-penalty term in the critic loss.
        n_critic: Critic updates performed before each generator update.

    Returns:
        dict with two lists of per-epoch floats: ``'critic_loss'`` (value of the
        last critic update in the epoch) and ``'generator_loss'``.
    """
    # Create optimizers
    generator_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.5)
    discriminator_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.5)

    history = {'critic_loss': [], 'generator_loss': []}
    # sample() without replacement raises if the frame has fewer rows than the batch
    needs_replacement = len(X_train) < batch_size

    for epoch in range(epochs):
        for _ in range(n_critic):  # Critic (Discriminator) updates more frequently than Generator
            real_samples = X_train.sample(batch_size, replace=needs_replacement).values.astype('float32')
            latent_vectors = np.random.normal(size=(batch_size, latent_dim)).astype('float32')

            with tf.device('/GPU:0'):  # Ensure operations run on GPU
                # Direct call instead of predict(): predict() printed a progress bar on
                # every one of the epochs * n_critic calls and is slow for single batches.
                # Created outside the tape, so no gradient flows back into the generator.
                generated_samples = generator(latent_vectors, training=True)

                # Train the critic
                with tf.GradientTape() as tape:
                    real_pred = discriminator(real_samples, training=True)
                    generated_pred = discriminator(generated_samples, training=True)

                    gp = gradient_penalty(real_samples, generated_samples, batch_size, discriminator)
                    critic_loss = tf.reduce_mean(generated_pred) - tf.reduce_mean(real_pred) + lambda_gp * gp

                grads = tape.gradient(critic_loss, discriminator.trainable_weights)
                discriminator_optimizer.apply_gradients(zip(grads, discriminator.trainable_weights))

        # Train the generator
        latent_vectors = np.random.normal(size=(batch_size, latent_dim)).astype('float32')
        with tf.device('/GPU:0'):
            with tf.GradientTape() as tape:
                # training=True so the BatchNormalization layers use batch statistics and
                # update their moving averages; the default call mode is inference
                generated_samples = generator(latent_vectors, training=True)
                generated_pred = discriminator(generated_samples, training=True)
                generator_loss = -tf.reduce_mean(generated_pred)

            grads = tape.gradient(generator_loss, generator.trainable_weights)
            generator_optimizer.apply_gradients(zip(grads, generator.trainable_weights))

        history['critic_loss'].append(float(critic_loss))
        history['generator_loss'].append(float(generator_loss))

        # Logging the progress
        if epoch % 100 == 0:
            print(f"Epoch {epoch}/{epochs}, Critic Loss: {critic_loss.numpy()}, Generator Loss: {generator_loss.numpy()}")

    return history

# Set parameters
latent_dim = 100  # Dimension of the latent space
input_dim = X_train.shape[1]  # Number of input features
epochs = 10000
batch_size = 64

# Build generator and discriminator
generator = build_generator()
discriminator = build_discriminator()

# Start training WGAN-GP; the returned loss curves are kept so they can be plotted later
wgan_gp_history = train_wgan_gp(generator, discriminator, epochs, batch_size, latent_dim, X_train)


## Eval WGAN-GP

In [None]:
import matplotlib.pyplot as plt

def plot_loss(generator_loss, critic_loss):
    """Draw the generator and critic loss sequences on one gridded figure and show it."""
    curves = (
        (generator_loss, 'Generator Loss'),
        (critic_loss, 'Critic Loss'),
    )
    plt.figure(figsize=(10, 6))
    for values, curve_label in curves:
        plt.plot(values, label=curve_label)
    plt.title('Generator and Critic Loss During Training')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)
    plt.show()

# Example lists of losses over epochs
# NOTE(review): the two lists below are hard-coded illustrative numbers, not values recorded
# during training, so the resulting figure does not describe the trained model. Substitute
# the recorded losses before interpreting this plot.
generator_loss = [0.5, 0.3, 0.2, 0.15, 0.12]  # replace with actual loss values
critic_loss = [-0.7, -0.5, -0.35, -0.3, -0.25]  # replace with actual loss values

plot_loss(generator_loss, critic_loss)


In [None]:
import seaborn as sns
import numpy as np

# Plot real vs generated distributions of a single feature
def plot_feature_distribution(real_data, generated_data, feature_name):
    """Overlay kernel-density estimates of one feature for real and generated data.

    Args:
        real_data: 1-D values of the feature taken from the real dataset.
        generated_data: 1-D values of the same feature taken from generated samples.
        feature_name: Label inserted into the plot title.
    """
    plt.figure(figsize=(10, 6))
    # `fill=True` replaces the deprecated `shade=True` keyword of seaborn.kdeplot
    sns.kdeplot(real_data, label='Real Data', fill=True)
    sns.kdeplot(generated_data, label='Generated Data', fill=True)
    plt.title(f'Distribution of {feature_name}: Real vs Generated')
    plt.xlabel('Feature Value')
    plt.ylabel('Density')
    plt.legend()
    plt.show()

# Draw up to 1000 real rows and the same number of generated rows.
# sample() without replacement raises ValueError when asked for more rows than
# the frame holds, so the count is capped at the training-set size.
n_plot = min(1000, len(X_train))
real_samples = X_train.sample(n_plot).values[:, 0]  # First feature of real samples
latent_vectors = np.random.normal(size=(n_plot, latent_dim))
generated_samples = generator.predict(latent_vectors)
generated_samples_feature = generated_samples[:, 0]  # First feature of generated samples

plot_feature_distribution(real_samples, generated_samples_feature, "Feature 1")


In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

def plot_tsne(real_data, generated_data, title='t-SNE Visualization'):
    """Embed real and generated rows into one shared 2-D t-SNE space and scatter them.

    Both sets are embedded with a single fit. t-SNE coordinates from separate
    fits have no common frame of reference, so fitting each set on its own
    (as before) produced two unrelated clouds that cannot be compared.

    Args:
        real_data: 2-D array-like of real rows.
        generated_data: 2-D array-like of generated rows with the same width.
        title: Figure title.
    """
    real_data = np.asarray(real_data)
    generated_data = np.asarray(generated_data)
    combined = np.concatenate([real_data, generated_data], axis=0)

    # t-SNE requires perplexity to be smaller than the number of samples
    perplexity = min(30, len(combined) - 1)
    # NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` -
    # confirm against the installed version
    tsne = TSNE(n_components=2, perplexity=perplexity, n_iter=300)
    embedding = tsne.fit_transform(combined)
    real_tsne = embedding[:len(real_data)]
    generated_tsne = embedding[len(real_data):]

    plt.figure(figsize=(10, 6))
    plt.scatter(real_tsne[:, 0], real_tsne[:, 1], label='Real Data', alpha=0.5)
    plt.scatter(generated_tsne[:, 0], generated_tsne[:, 1], label='Generated Data', alpha=0.5)
    plt.title(title)
    plt.legend()
    plt.show()

# Run t-SNE on up to 1000 real rows (capped at the training-set size, since sample()
# without replacement cannot return more rows than exist) and as many generated rows
n_tsne = min(1000, len(X_train))
real_samples_tsne = X_train.sample(n_tsne).values  # Sample real data
generated_samples_tsne = generator.predict(np.random.normal(size=(n_tsne, latent_dim)))

plot_tsne(real_samples_tsne, generated_samples_tsne, title="t-SNE: Real vs Generated Data")


In [None]:
def plot_generated_samples(generator, latent_dim, n_samples=1000, title="Generated Samples"):
    """Scatter the first two output features of `n_samples` freshly generated rows.

    Args:
        generator: Model whose `predict` maps latent vectors to feature rows.
        latent_dim: Length of each latent vector.
        n_samples: Number of rows to generate and plot.
        title: Figure title.
    """
    noise = np.random.normal(size=(n_samples, latent_dim))
    synthetic = generator.predict(noise)
    # Only the first two feature columns are shown
    feature_1, feature_2 = synthetic[:, 0], synthetic[:, 1]

    plt.figure(figsize=(10, 6))
    plt.scatter(feature_1, feature_2, alpha=0.6, color='blue')
    plt.title(title)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.grid(True)
    plt.show()

# Visualise what the tuned generator currently produces
plot_generated_samples(generator, latent_dim, title="Generated Samples After Tuning")


In [None]:
def plot_generated_samples_over_epochs(generator, latent_dim, n_epochs, n_samples=1000):
    """Draw one scatter plot of generated samples per 100-step value in range(0, n_epochs).

    NOTE(review): no training happens inside this function and the same `generator`
    object is used on every iteration, so the figures differ only by the random
    latent vectors drawn. The 'Epoch N' labels therefore do not correspond to the
    generator's state at training epoch N; showing real progress would require
    generator snapshots saved during training.

    Args:
        generator: Model whose `predict` maps latent vectors to feature rows.
        latent_dim: Length of each latent vector.
        n_epochs: Upper bound of the label range; one figure per multiple of 100 below it.
        n_samples: Number of rows generated for each figure.
    """
    for epoch in range(0, n_epochs, 100):  # Plot every 100 epochs
        latent_vectors = np.random.normal(size=(n_samples, latent_dim))
        generated_samples = generator.predict(latent_vectors)
        
        # Scatter the first two feature columns of this draw
        plt.figure(figsize=(10, 6))
        plt.scatter(generated_samples[:, 0], generated_samples[:, 1], alpha=0.6, label=f'Epoch {epoch}')
        plt.title(f'Generated Samples at Epoch {epoch}')
        plt.xlabel('Feature 1')
        plt.ylabel('Feature 2')
        plt.legend()
        plt.show()

# Produces ten figures (labels 0, 100, ..., 900) from the already-trained generator
plot_generated_samples_over_epochs(generator, latent_dim, n_epochs=1000)
