In [1]:
# prompt: use csv file from gdrive and convert to dataframe
#import pandas as pd

#from google.colab import drive
#drive.mount('/content/gdrive')

import pandas as pd
df = pd.read_csv('online_gaming_behavior_dataset.csv')


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40034 entries, 0 to 40033
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PlayerID                   40034 non-null  int64  
 1   Age                        40034 non-null  int64  
 2   Gender                     40034 non-null  object 
 3   Location                   40034 non-null  object 
 4   GameGenre                  40034 non-null  object 
 5   PlayTimeHours              40034 non-null  float64
 6   InGamePurchases            40034 non-null  int64  
 7   GameDifficulty             40034 non-null  object 
 8   SessionsPerWeek            40034 non-null  int64  
 9   AvgSessionDurationMinutes  40034 non-null  int64  
 10  PlayerLevel                40034 non-null  int64  
 11  AchievementsUnlocked       40034 non-null  int64  
 12  EngagementLevel            40034 non-null  object 
dtypes: float64(1), int64(7), object(5)
memory usag

In [3]:
df.nunique()

PlayerID                     40034
Age                             35
Gender                           2
Location                         4
GameGenre                        5
PlayTimeHours                40034
InGamePurchases                  2
GameDifficulty                   3
SessionsPerWeek                 20
AvgSessionDurationMinutes      170
PlayerLevel                     99
AchievementsUnlocked            50
EngagementLevel                  3
dtype: int64

In [4]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PlayerID,40034.0,29016.5,11556.964675,9000.0,19008.25,29016.5,39024.75,49033.0
Age,40034.0,31.992531,10.043227,15.0,23.0,32.0,41.0,49.0
PlayTimeHours,40034.0,12.024365,6.914638,0.000115,6.067501,12.008002,17.963831,23.999592
InGamePurchases,40034.0,0.200854,0.400644,0.0,0.0,0.0,0.0,1.0
SessionsPerWeek,40034.0,9.471774,5.763667,0.0,4.0,9.0,14.0,19.0
AvgSessionDurationMinutes,40034.0,94.792252,49.011375,10.0,52.0,95.0,137.0,179.0
PlayerLevel,40034.0,49.655568,28.588379,1.0,25.0,49.0,74.0,99.0
AchievementsUnlocked,40034.0,24.526477,14.430726,0.0,12.0,25.0,37.0,49.0


In [5]:
import pandas as pd
import numpy as np

def identify_outliers(df, multiplier=1.5):
    numeric_df = df.select_dtypes(include=['number'])

    Q1 = numeric_df.quantile(0.25)
    Q3 = numeric_df.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - (multiplier * IQR)
    upper_bound = Q3 + (multiplier * IQR)

    return ((numeric_df < lower_bound) | (numeric_df > upper_bound)).sum()

def count_outliers(df):
    return identify_outliers(df).sum()


In [6]:
df.select_dtypes(include=['number']).corr()

Unnamed: 0,PlayerID,Age,PlayTimeHours,InGamePurchases,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked
PlayerID,1.0,-0.003044,0.000923,0.002321,-0.005944,-0.001801,-0.001769,0.00319
Age,-0.003044,1.0,0.002462,-0.000186,0.008777,-0.002269,0.001353,-0.0011
PlayTimeHours,0.000923,0.002462,1.0,-0.006067,-0.003655,-0.001925,-0.005152,0.003913
InGamePurchases,0.002321,-0.000186,-0.006067,1.0,0.005132,-0.003059,0.006524,9.8e-05
SessionsPerWeek,-0.005944,0.008777,-0.003655,0.005132,1.0,-0.00062,0.003257,0.003187
AvgSessionDurationMinutes,-0.001801,-0.002269,-0.001925,-0.003059,-0.00062,1.0,0.001368,-0.002227
PlayerLevel,-0.001769,0.001353,-0.005152,0.006524,0.003257,0.001368,1.0,0.006343
AchievementsUnlocked,0.00319,-0.0011,0.003913,9.8e-05,0.003187,-0.002227,0.006343,1.0


In [7]:
from scipy.stats import chisquare

# Assuming 'df' is your DataFrame
df_c = df.select_dtypes(include=['object']).apply(lambda x: pd.factorize(x)[0] + 1)

result = pd.DataFrame([
    chisquare(df_c[col].values)[0] for col in df_c.columns
], index=df_c.columns, columns=['chi_square_statistic'])

print(result)

                 chi_square_statistic
Gender                    6864.155929
Location                 12900.819203
GameGenre                26694.030780
GameDifficulty           10325.598330
EngagementLevel          15587.980963


# Dividing the Original Data

In [3]:
from sklearn.utils import shuffle
df = shuffle(df)

In [4]:
Seed =df[0:int(df.shape[0]/2)]
Hold_out =df[int(df.shape[0]/2):]

# Synthetic Data with GAN



In [1]:
pip install ctgan

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [11]:
from ctgan import CTGAN

Making sure the data types are accepted input

In [12]:
for i in df.columns:
  if df[i].dtype=='object':
    df[i] = df[i].astype('category')
  elif df[i].dtype=='int64':
    df[i] = df[i].astype('float64')

In [1]:
# Step 1: Identify categorical columns
categorical_columns = Seed.select_dtypes(include=['object', 'category']).columns.tolist()

# Step 2: Initialize the CTGAN model
ctgan = CTGAN()

# Step 3: Fit the CTGAN model to the data
ctgan.fit(Seed, discrete_columns=categorical_columns)

# Step 4: Generate synthetic data
synthetic_data = ctgan.sample(len(Seed))

NameError: name 'Seed' is not defined

In [None]:
synthetic_data_GAN = pd.DataFrame(synthetic_data)

In [None]:
synthetic_data_GAN.to_csv('Synthetic_Data_GAN.cw')

# Synthetic Data With Transformer

In [None]:
pip install realtabformer

Defaulting to user installation because normal site-packages is not writeable


In [None]:
from realtabformer import REaLTabFormer

In [None]:
import torch
import subprocess

def get_cuda_version():
    try:
        output = subprocess.check_output(['nvidia-smi', '--query-gpu=driver_version,cuda_version', '--format=csv,noheader'])
        output = output.decode('utf-8').strip().split(',')
        return output[0], output[1]
    except:
        return "N/A", "N/A"

def main():
    print("Checking CUDA and PyTorch compatibility...")
    
    # Check CUDA version
    driver_version, cuda_version = get_cuda_version()
    print(f"NVIDIA Driver Version: {driver_version}")
    print(f"CUDA Version: {cuda_version}")

    # Check PyTorch version
    print(f"PyTorch Version: {torch.__version__}")
    print(f"PyTorch CUDA Version: {torch.version.cuda}")

    # Check CUDA availability
    cuda_available = torch.cuda.is_available()
    print(f"CUDA Available: {cuda_available}")

    if cuda_available:
        print(f"Current CUDA Device: {torch.cuda.get_device_name(0)}")
    else:
        print("CUDA is not available. Please check your installation.")

    print("\nNOTE: Ensure that the CUDA version reported by PyTorch is compatible with your installed CUDA version.")
    print("If they don't match, you may need to reinstall PyTorch with the correct CUDA version.")

if __name__ == "__main__":
    main()

In [None]:


# Load your dataset
# Preprocess the data
# Ensure to remove or anonymize any unique identifiers or sensitive information
# that you don't want to be modeled.

# Initialize the REaLTabFormer model
rtf_model = REaLTabFormer(
    model_type="tabular",
    gradient_accumulation_steps=1,
    logging_steps=500,
    batch_size=32            # Frequency of logging
    # Use GPU if available
)

# Train the model
rtf_model.fit(Seed)

# Generate synthetic data
synthetic_data = rtf_model.sample(n_samples=len(Seed))

# Save or use the synthetic data
synthetic_data.to_csv("synthetic_data_TF_new.csv", index=False)