In [95]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, LeakyReLU, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
np.random.seed(42)
tf.random.set_seed(42)


In [96]:
class GAN:
    def __init__(self, latent_dim, data_dim):
        self.latent_dim = latent_dim
        self.data_dim = data_dim

        # Build and compile the discriminator
        self.discriminator = self.build_discriminator()
        self.discriminator.compile(loss='binary_crossentropy',
                                   optimizer=Adam(learning_rate=0.0002, beta_1=0.5),
                                   metrics=['accuracy'])

        # Build the generator
        self.generator = self.build_generator()

        # The generator takes noise as input and generates data
        z = Input(shape=(self.latent_dim,))
        generated_data = self.generator(z)

        # For the combined model, we will only train the generator
        self.discriminator.trainable = False

        # The discriminator takes generated data as input and determines validity
        validity = self.discriminator(generated_data)

        # The combined model (stacked generator and discriminator)
        self.combined = Model(z, validity)
        self.combined.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.0001, beta_1=0.5))

        # Reset the discriminator's trainable attribute to True after compiling the combined model
        self.discriminator.trainable = True

        # Debugging: Check trainable weights
        print("Discriminator trainable weights:", self.discriminator.trainable_weights)
        print("Generator trainable weights:", self.generator.trainable_weights)
        print("Combined model trainable weights:", self.combined.trainable_weights)
    def build_generator(self):
        model = Sequential()
        model.add(Input(shape=(self.latent_dim,)))
        model.add(Dense(256))
        model.add(LeakyReLU(negative_slope=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(self.data_dim, activation='sigmoid'))
        model.summary()

        noise = Input(shape=(self.latent_dim,))
        data = model(noise)

        return Model(noise, data)

    def build_discriminator(self):
        model = Sequential()
        model.add(Input(shape=(self.data_dim,)))
        model.add(Dense(256))
        model.add(LeakyReLU(negative_slope=0.2))
        model.add(Dense(1, activation='sigmoid'))
        model.summary()

        data = Input(shape=(self.data_dim,))
        validity = model(data)

        return Model(data, validity)

    def train(self, data, epochs, batch_size):
        valid = np.ones((batch_size, 1))
        fake = np.zeros((batch_size, 1))

        for epoch in range(epochs):
            # Train Discriminator
            idx = np.random.randint(0, data.shape[0], batch_size)
            real_data = data[idx]  # Remove .todense() as data is already a dense array

            noise = np.random.normal(0, 1, (batch_size, self.latent_dim))
            generated_data = self.generator.predict(noise)

            d_loss_real = self.discriminator.train_on_batch(real_data, valid)
            d_loss_fake = self.discriminator.train_on_batch(generated_data, fake)
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            # Train Generator
            noise = np.random.normal(0, 1, (batch_size, self.latent_dim))
            g_loss = self.combined.train_on_batch(noise, valid)

            # Print the losses
            print(f"{epoch} [D loss: {d_loss}] [G loss: {g_loss}]")

In [97]:
# Load CSV files and merge them
ads_data_path = r'C:\Users\lucas\Downloads\GES Hackathon\decrypted_file\train\train_data_ads.csv'  # Replace with actual file path
feeds_data_path = r'C:\Users\lucas\Downloads\GES Hackathon\decrypted_file\train\train_data_feeds.csv'  # Replace with actual file path

ads_data = pd.read_csv(ads_data_path)
feeds_data = pd.read_csv(feeds_data_path)

In [98]:
# Fractions of data used as loading both of them in full raises a memory allocation error due to large size
sample_ads_data = ads_data.sample(frac=.01, random_state=42)
sample_feeds_data = feeds_data.sample(frac=.01, random_state=42)

In [99]:
click = pd.merge(sample_ads_data, sample_feeds_data, left_on='user_id', right_on='u_userId')

In [100]:
click.head()

Unnamed: 0,log_id,label_x,user_id,age,gender,residence,city,city_rank,series_dev,series_group,...,e_ch,e_m,e_po,e_pl,e_rn,e_section,e_et,label_y,cillabel,pro
0,389752,0,192266,2,2,46,354,2,11,8,...,19,320,2,214,2,0,202206080822,-1,-1,0
1,389752,0,192266,2,2,46,354,2,11,8,...,19,320,9,214,6,0,202206032022,-1,-1,0
2,237151,0,231175,6,4,33,319,3,27,2,...,19,1185,16,1830,1,0,202206051726,-1,-1,0
3,963804,0,203833,8,2,30,113,5,31,3,...,19,565,10,1396,57,0,202206031220,-1,-1,0
4,376817,0,115573,2,2,27,162,5,30,3,...,19,327,7,2073,2,0,202206070922,1,-1,100


In [101]:
size = 5  # Every list seperated by ^ is at max 5 long 

def normalize_list_length(lst, max_length):
    if len(lst) > max_length:  # If the list is longer than the fixed length
        return lst[:max_length]  # Truncate the list
    else:  # If the list is shorter than the fixed length
        return lst + ['0'] * (max_length - len(lst))

def split_list_column(df, column_name, max_length):
    # Ensure the column contains lists before processing
    if df[column_name].apply(lambda x: isinstance(x, list)).all():
        # Create a DataFrame from the lists with appropriate column names
        split_columns = pd.DataFrame(df[column_name].tolist(),
                                     columns=[f'{column_name}_{i+1}' for i in range(max_length)])
        # Drop the original column and join the new columns
        df = df.drop(columns=[column_name]).join(split_columns)
    return df


for col in click.columns: # This creates necessary columns 
    if click[col].dtype == object:  
        max_length = click[col].apply(lambda x: x.count('^') + 1 if isinstance(x, str) else 1).max() 
        click[col] = click[col].apply(lambda x: x.split('^') if isinstance(x, str) else x)
        click[col] = click[col].apply(lambda x: normalize_list_length(x, max_length) if isinstance(x, list) else x)
        click = split_list_column(click, col, max_length)

click = click.drop(columns=['i_entities', 'i_docId_1', 'i_s_sourceId_1']) # Drop strings for now
click.head()


Unnamed: 0,log_id,label_x,user_id,age,gender,residence,city,city_rank,series_dev,series_group,...,u_newsCatInterestsST_y_1,u_newsCatInterestsST_y_2,u_newsCatInterestsST_y_3,u_newsCatInterestsST_y_4,u_newsCatInterestsST_y_5,u_click_ca2_news_1,u_click_ca2_news_2,u_click_ca2_news_3,u_click_ca2_news_4,u_click_ca2_news_5
0,389752,0,192266,2,2,46,354,2,11,8,...,85,65,152,116,177,85,152,65,116,25
1,389752,0,192266,2,2,46,354,2,11,8,...,173,50,123,0,0,173,50,123,80,114
2,237151,0,231175,6,4,33,319,3,27,2,...,112,109,44,153,157,112,17,109,78,153
3,963804,0,203833,8,2,30,113,5,31,3,...,168,167,65,0,0,168,167,219,65,109
4,376817,0,115573,2,2,27,162,5,30,3,...,199,25,171,65,104,65,104,131,199,66


In [102]:
for column in click.columns: # Convert to integer
    if click[column].dtype == object:
        click[column] = click[column].astype(int)

In [117]:
scaler = MinMaxScaler(feature_range=(-1, 1))  # Use MinMaxScaler for normalization

# Normalize the data
click_array = scaler.fit_transform(click.values)

# Define GAN parameters
latent_dim = 100
data_dim = click_array.shape[1]

# Instantiate and train the GAN
gan = GAN(latent_dim, data_dim)
gan.train(click_array, epochs=200, batch_size=128)  # Choose the amount of runs 

# Generate synthetic data
num_samples = 10000  # Adjust the number of samples as needed
noise = np.random.normal(0, 1, (num_samples, latent_dim))
synthetic_data = gan.generator.predict(noise)

# Inverse transform the synthetic data
synthetic_data = scaler.inverse_transform(synthetic_data)

# Handle NaN and infinite values before rounding
synthetic_data = np.nan_to_num(synthetic_data, nan=0.0, posinf=0.0, neginf=0.0)

# Clip the data to a reasonable range, e.g., [0, max_value]
max_value = click.max().max()  # Get max of numeric columns
synthetic_data = np.clip(synthetic_data, 0, max_value)

# Round the data and convert to integers
synthetic_data = np.round(synthetic_data).astype(int)

# Convert synthetic data back to a DataFrame
synthetic_click = pd.DataFrame(synthetic_data, columns=click.columns)

# Debug: Check the shape and head of the generated data
print(f"Shape of generated data: {synthetic_click.shape}")
synthetic_click.head()

Discriminator trainable weights: [<KerasVariable shape=(104, 256), dtype=float32, path=sequential_18/dense_36/kernel>, <KerasVariable shape=(256,), dtype=float32, path=sequential_18/dense_36/bias>, <KerasVariable shape=(256, 1), dtype=float32, path=sequential_18/dense_37/kernel>, <KerasVariable shape=(1,), dtype=float32, path=sequential_18/dense_37/bias>]
Generator trainable weights: [<KerasVariable shape=(100, 256), dtype=float32, path=sequential_19/dense_38/kernel>, <KerasVariable shape=(256,), dtype=float32, path=sequential_19/dense_38/bias>, <KerasVariable shape=(256,), dtype=float32, path=sequential_19/batch_normalization_9/gamma>, <KerasVariable shape=(256,), dtype=float32, path=sequential_19/batch_normalization_9/beta>, <KerasVariable shape=(256, 104), dtype=float32, path=sequential_19/dense_39/kernel>, <KerasVariable shape=(104,), dtype=float32, path=sequential_19/dense_39/bias>]
Combined model trainable weights: [<KerasVariable shape=(100, 256), dtype=float32, path=sequential_

  synthetic_data = np.round(synthetic_data).astype(int)


Unnamed: 0,log_id,label_x,user_id,age,gender,residence,city,city_rank,series_dev,series_group,...,u_newsCatInterestsST_y_1,u_newsCatInterestsST_y_2,u_newsCatInterestsST_y_3,u_newsCatInterestsST_y_4,u_newsCatInterestsST_y_5,u_click_ca2_news_1,u_click_ca2_news_2,u_click_ca2_news_3,u_click_ca2_news_4,u_click_ca2_news_5
0,958738,1,202930,6,4,32,276,4,28,6,...,118,172,156,147,193,187,176,163,117,207
1,799086,1,198039,6,3,38,325,4,29,8,...,163,196,184,195,215,129,143,159,163,147
2,1049208,1,255956,7,3,32,422,5,30,7,...,136,143,156,198,201,120,163,119,197,204
3,801919,1,250234,9,3,35,283,5,31,7,...,129,181,182,214,144,174,149,189,182,141
4,1073890,1,276091,7,3,32,409,4,26,6,...,174,133,192,152,182,118,210,182,152,131


In [118]:
click.describe()

Unnamed: 0,log_id,label_x,user_id,age,gender,residence,city,city_rank,series_dev,series_group,...,u_newsCatInterestsST_y_1,u_newsCatInterestsST_y_2,u_newsCatInterestsST_y_3,u_newsCatInterestsST_y_4,u_newsCatInterestsST_y_5,u_click_ca2_news_1,u_click_ca2_news_2,u_click_ca2_news_3,u_click_ca2_news_4,u_click_ca2_news_5
count,73104.0,73104.0,73104.0,73104.0,73104.0,73104.0,73104.0,73104.0,73104.0,73104.0,...,73104.0,73104.0,73104.0,73104.0,73104.0,73104.0,73104.0,73104.0,73104.0,73104.0
mean,545695.3,0.008632,193422.610185,5.47271,2.375602,26.585248,268.858352,3.404643,24.455666,4.586042,...,122.068765,110.355753,105.502339,103.364385,102.512735,114.086001,111.556358,109.599653,108.906298,107.884822
std,318106.1,0.092505,54052.26847,2.173391,0.735787,9.272219,98.773152,1.284477,7.760142,1.942284,...,65.173354,63.849116,62.077208,62.371114,61.594837,65.250141,64.075987,63.348185,64.481685,64.559538
min,4.0,0.0,100022.0,2.0,2.0,11.0,101.0,2.0,11.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,268872.2,0.0,145387.0,3.0,2.0,20.0,175.0,2.0,16.0,3.0,...,78.0,65.0,62.0,57.0,57.0,65.0,63.0,62.0,57.0,56.0
50%,542515.5,0.0,192789.0,6.0,2.0,26.0,297.0,3.0,27.0,5.0,...,112.0,98.0,98.0,98.0,98.0,108.0,104.0,100.0,104.0,104.0
75%,817633.0,0.0,240004.0,7.0,2.0,33.0,342.0,5.0,31.0,6.0,...,171.0,168.0,157.0,155.0,152.0,171.0,168.0,168.0,168.0,168.0
max,1176443.0,1.0,287127.0,9.0,4.0,46.0,441.0,5.0,37.0,8.0,...,220.0,220.0,220.0,220.0,220.0,220.0,220.0,220.0,220.0,220.0


In [119]:
synthetic_click.describe()

Unnamed: 0,log_id,label_x,user_id,age,gender,residence,city,city_rank,series_dev,series_group,...,u_newsCatInterestsST_y_1,u_newsCatInterestsST_y_2,u_newsCatInterestsST_y_3,u_newsCatInterestsST_y_4,u_newsCatInterestsST_y_5,u_click_ca2_news_1,u_click_ca2_news_2,u_click_ca2_news_3,u_click_ca2_news_4,u_click_ca2_news_5
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,878516.9,1.0,239951.4498,7.153,3.4521,36.6278,353.5839,4.2758,30.4099,6.4938,...,164.2001,162.0064,165.799,164.3321,164.4378,162.9362,163.0698,165.9927,164.465,165.661
std,158324.0,0.0,25696.971946,0.940999,0.497725,4.2744,47.132731,0.446939,3.587,0.952393,...,29.549037,30.316093,29.746447,29.952371,31.65908,27.403216,30.20883,28.093849,29.335341,30.306731
min,589234.0,1.0,193793.0,6.0,3.0,29.0,272.0,4.0,24.0,5.0,...,110.0,110.0,110.0,110.0,110.0,111.0,111.0,111.0,110.0,110.0
25%,741951.5,1.0,217763.75,6.0,3.0,33.0,312.0,4.0,27.0,6.0,...,139.0,135.0,140.0,138.0,136.0,140.0,136.0,143.0,139.0,139.0
50%,876798.5,1.0,240142.0,7.0,3.0,36.0,352.0,4.0,30.0,6.0,...,164.0,160.0,167.0,164.0,164.0,162.0,162.0,166.0,164.0,166.0
75%,1015505.0,1.0,262100.25,8.0,4.0,40.0,394.0,5.0,34.0,7.0,...,190.0,188.0,191.0,190.0,193.0,185.0,189.0,190.0,190.0,193.0
max,1173133.0,1.0,286694.0,9.0,4.0,46.0,441.0,5.0,37.0,8.0,...,220.0,220.0,220.0,220.0,220.0,219.0,220.0,219.0,220.0,219.0


In [113]:
click.to_csv(r'C:\Users\lucas\Downloads\GES Hackathon\decrypted_file\train\real_click.csv', index=False)
synthetic_click.to_csv(r'C:\Users\lucas\Downloads\GES Hackathon\decrypted_file\train\synthetic_click.csv', index=False)