In [344]:
import pandas as pd
import numpy as np
from faker import Faker
import matplotlib.pyplot as plt

In [345]:


def generate_fake_data(num_customers, max_repeats):
  fake = Faker()
  data = []

  for _ in range(num_customers):
    customer_id = fake.random_int(min=1000, max=9999)
    first_name = fake.first_name()
    last_name = fake.last_name()
    country = fake.country()
    age = fake.random_int(min=18, max=80)
    gender = fake.random_element(elements=('Male', 'Female'))
    income_level = fake.random_int(min=20000, max=150000)

    for _ in range(fake.random_int(min=1, max=max_repeats)):
      data.append([customer_id, first_name, last_name, country, age, gender, income_level])

  df = pd.DataFrame(data, columns=['customer_id', 'first_name', 'last_name', 'country', 'age', 'gender', 'income_level'])
  return df

# Example usage:
num_customers = 5000
max_repeats = 15
fake_data = generate_fake_data(num_customers, max_repeats)
fake_data = fake_data[:5000]
print(fake_data)


      customer_id first_name last_name    country  age  gender  income_level
0            5802   Kimberly    Campos      Kenya   37  Female         52701
1            5802   Kimberly    Campos      Kenya   37  Female         52701
2            5802   Kimberly    Campos      Kenya   37  Female         52701
3            5802   Kimberly    Campos      Kenya   37  Female         52701
4            5802   Kimberly    Campos      Kenya   37  Female         52701
...           ...        ...       ...        ...  ...     ...           ...
4995         2187      Tracy    Kelley  Sri Lanka   22    Male        145999
4996         2187      Tracy    Kelley  Sri Lanka   22    Male        145999
4997         2187      Tracy    Kelley  Sri Lanka   22    Male        145999
4998         2187      Tracy    Kelley  Sri Lanka   22    Male        145999
4999         2187      Tracy    Kelley  Sri Lanka   22    Male        145999

[5000 rows x 7 columns]


In [346]:

def shuffle_data(df):
    # Create a unique identifier for each customer
    df['customer_group'] = df.groupby(['customer_id', 'first_name', 'last_name', 'country', 'age', 'gender', 'income_level']).ngroup()

    # Shuffle the DataFrame
    df = df.sample(frac=1)

    # Drop the temporary customer_group column
    df.drop(columns=['customer_group'], inplace=True)

    return df


In [347]:
fake_data_1 = shuffle_data(fake_data)

In [348]:
def generate_customer_data(num_rows):
  fake = Faker()
  
  data = {
      'Winning_percentage':np.random.randint(5,80,size=num_rows),
      'Days_Since_Last_Bet':np.random.randint(1,40,size=num_rows)
  }
  df = pd.DataFrame(data)
  return df
num_rows = 5000
df1 = generate_customer_data(num_rows)
print(df1.head())

   Winning_percentage  Days_Since_Last_Bet
0                  45                   29
1                  21                   25
2                  63                   36
3                   9                   27
4                  77                   23


In [349]:
df = pd.concat([fake_data_1,df1],axis=1)

In [350]:
df.head()

Unnamed: 0,customer_id,first_name,last_name,country,age,gender,income_level,Winning_percentage,Days_Since_Last_Bet
2957,9732,Angel,Nelson,Nauru,41,Female,122531,34,9
4116,9304,Craig,Obrien,Lithuania,27,Female,55152,38,37
1904,7999,Rachael,Hinton,China,61,Female,65940,42,37
4897,2543,Amy,Perez,Algeria,78,Female,112684,58,20
3303,2021,Linda,Stevens,Tokelau,69,Male,51871,70,35


In [298]:


def generate_right_skewed_integers(num_samples, min_val, max_val, scale_factor=10):
  data = np.random.exponential(scale=scale_factor, size=num_samples)
  data = data / data.max() * (max_val - min_val) + min_val
  data = np.rint(data).astype(int)
  return data

In [299]:
num_samples = 5000
min_val = 100
max_val = 100000
df["Total_Amount_wagered"] = generate_right_skewed_integers(num_samples, min_val, max_val, scale_factor=5)
print(df["Total_Amount_wagered"])


958     23332
4587     9468
4061     2466
1255    24609
212      6083
        ...  
1456     2426
1411    25851
1112     2006
4566     1747
2035     2007
Name: Total_Amount_wagered, Length: 5000, dtype: int32


In [199]:
num_samples = 5000
min_val = 10
max_val = 1000
df["Average_Bet_Amount"] = generate_right_skewed_integers(num_samples, min_val, max_val, scale_factor=5)
print(df["Average_Bet_Amount"])

3227     12
2948    272
2042    103
211     178
3667     20
       ... 
4416     28
1237    171
4052    130
1465     39
981      96
Name: Average_Bet_Amount, Length: 5000, dtype: int32


In [351]:
data = pd.read_csv("my_data_3.csv")

In [352]:
data["Bet_frequency"].value_counts()

Bet_frequency
Monthly         1589
Weekly          1504
Daily           1009
Occasionally     682
No               216
Name: count, dtype: int64

In [353]:
import pandas as pd
import numpy as np

def generate_random_number(feature_value):
  mapping = {
      'Daily': np.random.randint(300, 355),
      'Weekly': np.random.randint(45, 55),
      'Monthly': np.random.randint(7, 15),
      'Occasionally': np.random.randint(5, 10),
      'No': np.random.randint(3,9)
  }
  return mapping.get(feature_value, np.random.randint(0, 1)) 

# Apply the function to create a new column
df['Active_Days'] = data['Bet_frequency'].apply(generate_random_number)

print(df['Active_Days'] )


2957    318
4116      8
1904     11
4897    305
3303      7
       ... 
2635    322
69        7
1647     46
3762      8
3269     12
Name: Active_Days, Length: 5000, dtype: int64


In [354]:
df.head()

Unnamed: 0,customer_id,first_name,last_name,country,age,gender,income_level,Winning_percentage,Days_Since_Last_Bet,Active_Days
2957,9732,Angel,Nelson,Nauru,41,Female,122531,34,9,318
4116,9304,Craig,Obrien,Lithuania,27,Female,55152,38,37,8
1904,7999,Rachael,Hinton,China,61,Female,65940,42,37,11
4897,2543,Amy,Perez,Algeria,78,Female,112684,58,20,305
3303,2021,Linda,Stevens,Tokelau,69,Male,51871,70,35,7


In [355]:
df.columns

Index(['customer_id', 'first_name', 'last_name', 'country', 'age', 'gender',
       'income_level', 'Winning_percentage', 'Days_Since_Last_Bet',
       'Active_Days'],
      dtype='object')

Generating correlation with 2 features

In [356]:

import pandas as pd
import numpy as np

def generate_correlated_feature_2(data, feature1, feature2, correlation_coeff1, correlation_coeff2, noise_level):

  # Extract features as NumPy arrays
  x1 = data[feature1].values
  x2 = data[feature2].values

  # Standardize features
  x1_std = (x1 - np.mean(x1)) / np.std(x1)
  x2_std = (x2 - np.mean(x2)) / np.std(x2)

  # Create correlation matrix
  corr_matrix = np.array([[1, correlation_coeff1], [correlation_coeff1, 1]])

  # Cholesky decomposition
  L = np.linalg.cholesky(corr_matrix)

  # Generate correlated random data
  correlated_data = np.random.randn(len(data), 2) @ L

  # Scale correlated data
  scaled_correlated_data = correlated_data * np.array([np.std(x1), np.std(x2)]) + np.array([np.mean(x1), np.mean(x2)])

  # Create the new feature
  new_feature = scaled_correlated_data[:, 0] + noise_level * np.random.randn(len(data))

  return pd.Series(new_feature, index=data.index)



Genrating correlation with 3 features

In [357]:
import numpy as np

def generate_correlated_feature(data, feature1, feature2, feature3, correlation_coeff1, correlation_coeff2, correlation_coeff3, noise_level):
    std1 = np.std(data[feature1])
    std2 = np.std(data[feature2])
    std3 = np.std(data[feature3])
    
    cov12 = correlation_coeff1 * std1 * std2
    cov13 = correlation_coeff1 * std1 * std3
    cov23 = correlation_coeff2 * std2 * std3
    corr_matrix = np.corrcoef(data[[feature1, feature2, feature3]], rowvar=False)
    var_noise = 1 - correlation_coeff1**2 - correlation_coeff2**2 - correlation_coeff3**2 \
               - 2 * (cov12 * corr_matrix[0, 1] + cov13 * corr_matrix[0, 2] + cov23 * corr_matrix[1, 2])
    var_noise = abs(var_noise)
    sigma_noise = np.sqrt(var_noise)
    
    new_feature = correlation_coeff1 * (data[feature1] - np.mean(data[feature1])) / std1 + \
                  correlation_coeff2 * (data[feature2] - np.mean(data[feature2])) / std2 + \
                  correlation_coeff3 * (data[feature3] - np.mean(data[feature3])) / std3 + \
                  sigma_noise * np.random.randn(len(data))
    return new_feature



In [358]:
df.columns

Index(['customer_id', 'first_name', 'last_name', 'country', 'age', 'gender',
       'income_level', 'Winning_percentage', 'Days_Since_Last_Bet',
       'Active_Days'],
      dtype='object')

In [363]:
df["Total_Number_of_Bets"]= generate_correlated_feature(df,"income_level","Active_Days","Days_Since_Last_Bet",0.7,0.8,-0.8, 0.9)

In [364]:
df["Total_Amount_Wagered"] = generate_correlated_feature_2(df,'income_level','Total_Number_of_Bets',0.7,0.8,0.9)


In [365]:
df[["Total_Number_of_Bets","Total_Amount_Wagered"]].corr()

Unnamed: 0,Total_Number_of_Bets,Total_Amount_Wagered
Total_Number_of_Bets,1.0,-0.019307
Total_Amount_Wagered,-0.019307,1.0


In [324]:
df["Average_Bet_Amount"] = generate_correlated_feature_2(df,'Total_Amount_Wagered','Total_Number_of_Bets',0.4,0.3,0.9)

In [325]:
df["Number_of_Bonuses_Received"] = generate_correlated_feature_2(df,'Total_Amount_Wagered','Average_Bet_Amount',0.4,0.4,0.9)

In [326]:
df["Amount_of_Bonuses_Received"] = generate_correlated_feature_2(df,'Average_Bet_Amount','Number_of_Bonuses_Received',0.4,0.5,0.9)

In [327]:
df["Revenue_from_Bonuses"] = generate_correlated_feature_2(df,'Average_Bet_Amount','Amount_of_Bonuses_Received',0.4,0.5,0.9)

In [328]:
df["Increase_in_bets_after_bonus"] = generate_correlated_feature_2(df,'Revenue_from_Bonuses','Amount_of_Bonuses_Received',0.4,0.5,0.9)

In [329]:
df["Increase_in_wagering_after_bonus"] = generate_correlated_feature_2(df,'Increase_in_bets_after_bonus','Number_of_Bonuses_Received',0.4,0.5,0.9)

In [330]:
df["bonus_1"] = generate_correlated_feature_2(df,"income_level","Winning_percentage",.7,.6,.9)

In [241]:
df.columns

Index(['customer_id', 'first_name', 'last_name', 'country', 'age', 'gender',
       'income_level', 'Winning_percentage', 'Days_Since_Last_Bet',
       'Active_Days', 'Total_Number_of_Bets', 'Total_Amount_Wagered',
       'Average_Bet_Amount', 'Number_of_Bonuses_Received',
       'Amount_of_Bonuses_Received', 'Revenue_from_Bonuses',
       'Increase_in_bets_after_bonus', 'Increase_in_wagering_after_bonus',
       'bonus_1'],
      dtype='object')

In [331]:
df["bonus_2"] = generate_correlated_feature_2(df,"bonus_1","Days_Since_Last_Bet",.7,.6,.9)

In [332]:
df["bonus_3"] = generate_correlated_feature_2(df,"bonus_2","Active_Days",.7,.8,.9)

In [333]:
df["bonus_4"] = generate_correlated_feature_2(df,"bonus_3","Total_Number_of_Bets",.7,.6,.9)

In [334]:
df["bonus_5"] = generate_correlated_feature_2(df,"bonus_4","Total_Amount_Wagered",.7,.8,.9)

In [335]:
df["bonus_6"] = generate_correlated_feature_2(df,"bonus_5","Average_Bet_Amount",.7,.8,.9)

In [336]:
df["bonus_7"] = generate_correlated_feature_2(df,"bonus_6","Number_of_Bonuses_Received",.7,.8,.9)

In [337]:
df["bonus_8"] = generate_correlated_feature_2(df,"bonus_7","Amount_of_Bonuses_Received",.7,.8,.9)

In [338]:
df["bonus_9"] = generate_correlated_feature_2(df,"bonus_8","Revenue_from_Bonuses",.7,.8,.9)

In [339]:
df["bonus_10"] = generate_correlated_feature_2(df,"bonus_9","Increase_in_bets_after_bonus",.7,.8,.9)

In [340]:
df["bonus_11"] = generate_correlated_feature_2(df,"bonus_10","Increase_in_wagering_after_bonus",.7,.8,.9)

In [341]:
df.columns

Index(['customer_id', 'first_name', 'last_name', 'country', 'age', 'gender',
       'income_level', 'Winning_percentage', 'Days_Since_Last_Bet',
       'Active_Days', 'Total_Number_of_Bets', 'Total_Amount_Wagered',
       'Average_Bet_Amount', 'Number_of_Bonuses_Received',
       'Amount_of_Bonuses_Received', 'Revenue_from_Bonuses',
       'Increase_in_bets_after_bonus', 'Increase_in_wagering_after_bonus',
       'bonus_1', 'bonus_2', 'bonus_3', 'bonus_4', 'bonus_5', 'bonus_6',
       'bonus_7', 'bonus_8', 'bonus_9', 'bonus_10', 'bonus_11'],
      dtype='object')

In [342]:
df_2 = df[['income_level', 'Winning_percentage', 'Days_Since_Last_Bet',
       'Active_Days', 'Total_Number_of_Bets', 'Total_Amount_Wagered',
       'Average_Bet_Amount', 'Number_of_Bonuses_Received',
       'Amount_of_Bonuses_Received', 'Revenue_from_Bonuses',
       'Increase_in_bets_after_bonus', 'Increase_in_wagering_after_bonus',
       'bonus_1', 'bonus_2', 'bonus_3', 'bonus_4', 'bonus_5', 'bonus_6',
       'bonus_7', 'bonus_8', 'bonus_9', 'bonus_10', 'bonus_11']]

In [343]:
df_2.corr()

Unnamed: 0,income_level,Winning_percentage,Days_Since_Last_Bet,Active_Days,Total_Number_of_Bets,Total_Amount_Wagered,Average_Bet_Amount,Number_of_Bonuses_Received,Amount_of_Bonuses_Received,Revenue_from_Bonuses,...,bonus_2,bonus_3,bonus_4,bonus_5,bonus_6,bonus_7,bonus_8,bonus_9,bonus_10,bonus_11
income_level,1.0,-0.006477,0.008944,-0.005223,0.004382,0.009988,0.022416,-0.006134,0.003729,-0.038314,...,0.009429,0.005596,-0.003487,-0.004183,0.003642,0.001951,0.002652,-0.008461,0.000819,-0.001533
Winning_percentage,-0.006477,1.0,-0.005999,0.013035,0.008414,-0.025391,0.016,0.00458,-0.007019,0.030052,...,-0.00184,0.038102,0.001343,-0.019818,0.008291,0.010796,-0.02455,-0.002446,0.000883,0.01078
Days_Since_Last_Bet,0.008944,-0.005999,1.0,-0.001854,-0.009093,-0.001036,0.000646,0.000926,-0.000876,-0.004529,...,-0.007052,-0.005137,0.004986,0.010753,-0.006011,0.018146,0.017988,0.006943,0.017849,-0.002647
Active_Days,-0.005223,0.013035,-0.001854,1.0,0.006804,-0.023326,0.000296,0.003047,0.008585,-0.009072,...,-0.00221,0.021387,0.017985,0.010353,-0.029824,-0.004805,0.00476,-0.000541,0.001218,0.00428
Total_Number_of_Bets,0.004382,0.008414,-0.009093,0.006804,1.0,-0.021593,0.000808,-0.017762,0.002621,0.025977,...,-0.005076,-0.017395,-0.001482,0.011081,0.010632,0.01433,-0.016833,0.013372,0.012475,0.031712
Total_Amount_Wagered,0.009988,-0.025391,-0.001036,-0.023326,-0.021593,1.0,0.003263,0.004987,0.015856,-0.006305,...,0.001224,-0.005786,-0.006532,-0.003935,0.009125,-0.007814,0.008931,-0.01459,-0.001716,-0.00497
Average_Bet_Amount,0.022416,0.016,0.000646,0.000296,0.000808,0.003263,1.0,-0.016503,0.005827,0.015454,...,-0.008956,-0.003633,-0.026382,-0.02375,0.033536,0.007884,0.000593,0.00462,-0.023115,0.0057
Number_of_Bonuses_Received,-0.006134,0.00458,0.000926,0.003047,-0.017762,0.004987,-0.016503,1.0,0.013027,-0.008745,...,0.01018,0.001468,0.001073,0.000946,0.004224,-0.02154,0.025277,-0.010811,1.6e-05,-0.00251
Amount_of_Bonuses_Received,0.003729,-0.007019,-0.000876,0.008585,0.002621,0.015856,0.005827,0.013027,1.0,0.024405,...,0.00509,0.013123,0.002777,-0.012541,0.005397,0.011542,0.007587,0.003757,0.018099,-0.015324
Revenue_from_Bonuses,-0.038314,0.030052,-0.004529,-0.009072,0.025977,-0.006305,0.015454,-0.008745,0.024405,1.0,...,0.007384,-0.012001,0.020188,0.005574,-0.006436,0.012427,0.012329,0.000504,-0.000112,0.010392


In [260]:
df["Average_Bet_Amount"] = min_max_scaling(df["Average_Bet_Amount"], 10, 1000)
df["Average_Bet_Amount"] = round(df["Average_Bet_Amount"])

In [261]:
df["Total_Amount_Wagered"] = min_max_scaling(df["Total_Amount_Wagered"], 100, 100000)
df["Total_Amount_Wagered"] = round(df["Total_Amount_Wagered"])

In [262]:
df["Number_of_Bonuses_Received"] = min_max_scaling(df["Number_of_Bonuses_Received"], 1, 100)
df["Number_of_Bonuses_Received"] = round(df["Number_of_Bonuses_Received"])

In [263]:
df["Amount_of_Bonuses_Received"] = min_max_scaling(df["Amount_of_Bonuses_Received"], 10, 1000)
df["Amount_of_Bonuses_Received"] = round(df["Amount_of_Bonuses_Received"])

In [264]:
df["Revenue_from_Bonuses"] = min_max_scaling(df["Revenue_from_Bonuses"], 10, 5000)
df["Revenue_from_Bonuses"] = round(df["Revenue_from_Bonuses"])

In [265]:
df["Increase_in_bets_after_bonus"] = min_max_scaling(df["Increase_in_bets_after_bonus"], 30, 500)
df["Increase_in_bets_after_bonus"] = round(df["Increase_in_bets_after_bonus"])

In [266]:
df["Increase_in_wagering_after_bonus"] = min_max_scaling(df["Increase_in_wagering_after_bonus"], 100, 50000)
df["Increase_in_wagering_after_bonus"] = round(df["Increase_in_wagering_after_bonus"])

In [268]:
df["bonus_11"] = min_max_scaling(df["bonus_11"], 10, 500)
df["bonus_11"] = round(df["bonus_11"])

In [253]:
import numpy as np

def min_max_scaling(feature, new_min, new_max):
  min_val = np.min(feature)
  max_val = np.max(feature)
  scaled_feature = (feature - min_val) / (max_val - min_val) * (new_max - new_min) + new_min
  return scaled_feature



In [269]:
df["Total_Number_of_Bets"] = min_max_scaling(df["Total_Number_of_Bets"], 1, 50)
df["Total_Number_of_Bets"] = round(df["Total_Number_of_Bets"])

In [280]:
df["Should_Receive_Bonus"] = [1 if i > 263 else 0 for i in df["bonus_11"]]

In [281]:
df["Should_Receive_Bonus"].value_counts()

Should_Receive_Bonus
1    2874
0    2126
Name: count, dtype: int64

In [279]:
df["bonus_12"].value_counts()

bonus_12
1    2874
0    2126
Name: count, dtype: int64

In [282]:
df.head()

Unnamed: 0,customer_id,first_name,last_name,country,age,gender,income_level,Winning_percentage,Days_Since_Last_Bet,Active_Days,...,bonus_4,bonus_5,bonus_6,bonus_7,bonus_8,bonus_9,bonus_10,bonus_11,bonus_12,Should_Receive_Bonus
593,4368,Kevin,Carney,Portugal,34,Female,103037,48,8,52,...,114949.307582,64652.918969,53980.248687,115745.574792,7274.451711,127626.330806,81558.933952,271.0,1,1
2736,6093,Robert,Archer,Senegal,19,Female,104061,44,38,9,...,96801.465646,61013.142461,58664.189058,54306.458457,200832.548362,172364.258916,-50527.324489,259.0,0,0
1800,4970,John,Brooks,Guam,30,Male,125464,68,6,52,...,117202.753007,90638.520708,91604.834709,173577.028655,153145.678223,111177.037042,90262.972486,179.0,0,0
4313,9033,Laura,Jones,Saint Vincent and the Grenadines,35,Male,100702,59,13,333,...,129758.42452,79807.020131,176126.469995,7242.171362,136636.084807,61609.200967,59814.811206,221.0,0,0
2654,9144,Devin,Archer,Macao,21,Female,145579,47,36,9,...,29464.662088,-14439.523208,71564.798586,242973.488933,58941.887779,112287.459037,48980.326151,377.0,1,1


In [285]:
df.columns

Index(['customer_id', 'first_name', 'last_name', 'country', 'age', 'gender',
       'income_level', 'Winning_percentage', 'Days_Since_Last_Bet',
       'Active_Days', 'Total_Number_of_Bets', 'Total_Amount_Wagered',
       'Average_Bet_Amount', 'Number_of_Bonuses_Received',
       'Amount_of_Bonuses_Received', 'Revenue_from_Bonuses',
       'Increase_in_bets_after_bonus', 'Increase_in_wagering_after_bonus',
       'bonus_1', 'bonus_2', 'bonus_3', 'bonus_4', 'bonus_5', 'bonus_6',
       'bonus_7', 'bonus_8', 'bonus_9', 'bonus_10', 'bonus_11', 'bonus_12',
       'Should_Receive_Bonus'],
      dtype='object')

In [289]:
df_1 = df[['income_level', 'Winning_percentage', 'Days_Since_Last_Bet',
       'Active_Days', 'Total_Number_of_Bets', 'Total_Amount_Wagered',
       'Average_Bet_Amount', 'Number_of_Bonuses_Received',
       'Amount_of_Bonuses_Received', 'Revenue_from_Bonuses',
       'Increase_in_bets_after_bonus', 'Increase_in_wagering_after_bonus',"bonus_9",'Should_Receive_Bonus']]

In [290]:
df_1.corr()

Unnamed: 0,income_level,Winning_percentage,Days_Since_Last_Bet,Active_Days,Total_Number_of_Bets,Total_Amount_Wagered,Average_Bet_Amount,Number_of_Bonuses_Received,Amount_of_Bonuses_Received,Revenue_from_Bonuses,Increase_in_bets_after_bonus,Increase_in_wagering_after_bonus,bonus_9,Should_Receive_Bonus
income_level,1.0,-0.018131,0.005164,-0.004691,-0.015886,-0.002175,-0.015634,-0.004961,-0.019064,-0.025982,-0.013763,-0.017245,0.027458,0.009155
Winning_percentage,-0.018131,1.0,-0.004752,0.011731,0.000853,0.002476,0.020975,0.003078,-0.016736,-0.009707,-0.015868,-0.000798,0.038475,0.003728
Days_Since_Last_Bet,0.005164,-0.004752,1.0,0.005175,-0.016779,0.013049,0.018901,-0.013783,0.008406,0.017307,0.015064,-0.000581,0.005813,0.02553
Active_Days,-0.004691,0.011731,0.005175,1.0,0.023409,0.021611,0.006172,0.015508,0.006811,-0.001979,0.021937,0.011885,0.007175,0.012917
Total_Number_of_Bets,-0.015886,0.000853,-0.016779,0.023409,1.0,-0.014065,0.010291,0.011282,0.000598,-0.018547,0.011191,0.007211,0.002453,0.014331
Total_Amount_Wagered,-0.002175,0.002476,0.013049,0.021611,-0.014065,1.0,-0.013976,0.008699,-0.011438,-0.007429,-0.000485,-0.004079,-0.000744,0.002111
Average_Bet_Amount,-0.015634,0.020975,0.018901,0.006172,0.010291,-0.013976,1.0,0.003569,-0.015422,0.024526,0.01279,0.000744,-0.003061,-0.005346
Number_of_Bonuses_Received,-0.004961,0.003078,-0.013783,0.015508,0.011282,0.008699,0.003569,1.0,0.006618,0.009843,-0.009653,-0.002,-0.013708,-0.023959
Amount_of_Bonuses_Received,-0.019064,-0.016736,0.008406,0.006811,0.000598,-0.011438,-0.015422,0.006618,1.0,0.006226,-0.007658,-0.023519,0.018883,0.012763
Revenue_from_Bonuses,-0.025982,-0.009707,0.017307,-0.001979,-0.018547,-0.007429,0.024526,0.009843,0.006226,1.0,-0.000855,0.003909,-0.025052,0.022974


In [97]:
project_data_final = "project_data_final.csv"

In [99]:
df.to_csv(project_data_final,index=False)

In [98]:
df.head()

Unnamed: 0,Customer_id,First_name,Last_name,Country,Gender,Age,Winning_percentage,Days_Since_Last_Bet,Income_Level_(k),Total_Amount_wagered,...,Active_Days,Total_Number_of_Bets,Total_Amount_Wagered,Number_of_Bonuses_Received,Amount_of_Bonuses_Received,Revenue_from_Bonuses,Increase_in_bets_after_bonus,Increase_in_wagering_after_bonus,Should_Receive_Bonus,withdraw_frequency
0,606778,Tammy,Robinson,Mexico,Female,69,76,3,45124,4340,...,49,32.0,61621.0,54.0,719.0,4312.0,217.0,20241.0,0,Weekly
1,882088,Timothy,Smith,Argentina,Male,41,70,7,36256,30174,...,50,24.0,14323.0,39.0,402.0,4291.0,332.0,16955.0,0,Weekly
2,823104,Mary,Mckinney,Mexico,Female,51,20,5,31652,7685,...,54,20.0,58136.0,46.0,307.0,2858.0,281.0,25409.0,0,Weekly
3,243455,Sarah,White,India,Female,38,77,19,29271,844,...,7,32.0,38810.0,37.0,423.0,2534.0,273.0,21905.0,1,Monthly
4,485861,Edward,Stephenson,Brazil,Male,51,16,13,44007,3476,...,48,29.0,54909.0,48.0,460.0,2200.0,360.0,26404.0,1,Weekly
