### Predict overall risk score

In [11]:
# Import libraries
import pandas as pd
from sklearn.linear_model import BayesianRidge, LogisticRegression
from sklearn import svm
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, matthews_corrcoef
from sklearn.naive_bayes import GaussianNB
from sklearn.inspection import permutation_importance

In [2]:
# Read data
data = pd.read_csv("final_data_wide.csv", index_col = 0 )

In [49]:
# Only gamblers with full data
data_noNA = data.dropna()
y= data_noNA["risk_class"]
x = data_noNA.iloc[:,3:23]
y.value_counts()

High risk        136
Non high risk     67
Name: risk_class, dtype: int64

In [46]:
# Only take the last 3 months
data_last3months = data[data['6Ahab_s'].notna()]
y= data_last3months["risk_class"]
x = data_last3months.iloc[:,18:23]
y.value_counts()

High risk        217
Non high risk    174
Name: risk_class, dtype: int64

In [10]:
# Random Forrest classifier
clf_forest = RandomForestClassifier()
print(cross_val_score(clf_forest, x, y, cv = 10, scoring='balanced_accuracy'))
print(sum(cross_val_score(clf_forest, x, y, cv = 10, scoring='balanced_accuracy'))/10)

[0.66414141 0.62834225 0.61229947 0.5013369  0.56951872 0.69385027
 0.57352941 0.62698413 0.5        0.56746032]
0.6136066547831254


In [15]:
# Gaussian Naive Bayes classifier
naivebayes  = GaussianNB()
print(cross_val_score(naivebayes, x, y, cv = 10, scoring='balanced_accuracy'))
print(sum(cross_val_score(naivebayes, x, y, cv = 10, scoring='balanced_accuracy'))/10)

[0.78282828 0.52406417 0.59224599 0.5855615  0.5040107  0.64438503
 0.56951872 0.53174603 0.57936508 0.5952381 ]
0.5908963585434173


In [16]:
# Logistic regression
log_reg = LogisticRegression()
print(cross_val_score(log_reg, x, y, cv = 10, scoring='balanced_accuracy'))
print(sum(cross_val_score(log_reg, x, y, cv = 10, scoring='balanced_accuracy'))/10)


[0.73232323 0.54679144 0.61497326 0.53342246 0.45855615 0.68983957
 0.59224599 0.50396825 0.57936508 0.57142857]
0.5822914014090484


In [17]:
# Suport vector classifier
svc = svm.SVC()
print(cross_val_score(svc, x, y, cv = 10, scoring='balanced_accuracy'))
print(sum(cross_val_score(svc, x, y, cv = 10, scoring='balanced_accuracy'))/10)

[0.73232323 0.52406417 0.56283422 0.55614973 0.54010695 0.63770053
 0.64438503 0.52380952 0.5515873  0.56746032]
0.5840421016891605


In [51]:
clf_predictions = cross_val_predict(clf_forest, x, y, cv = 10)
naivebayes_predictions = cross_val_predict(naivebayes, x, y, cv = 10)
log_prediction = cross_val_predict(log_reg, x, y, cv = 10)
svm_predictions = cross_val_predict(svc, x, y, cv = 10)

In [47]:
# Phi coefficient
print(matthews_corrcoef(y, clf_predictions))
print(matthews_corrcoef(y, naivebayes_predictions))
print(matthews_corrcoef(y, log_prediction))
print(matthews_corrcoef(y, svm_predictions))

0.21842785544339377
0.19669644592632904
0.1794373570092452
0.1842165286706904


### Predict specific risk factors 

In [54]:
# Load risk factors
expert_labels = pd.read_csv("Data/Thesis_expert_ratings.csv", index_col = 0 )

In [55]:
# Merge with data 
all_dat = pd.merge(data, expert_labels, left_on='ID', right_on= "PlayerID")

In [56]:
# Only take the last 3 months
alldat_last3months = all_dat[all_dat['6Ahab_s'].notna()]
x = alldat_last3months.iloc[:,18:23]

In [14]:
# Calculate balanced accuracy
clf_forest = RandomForestClassifier()

for i in range(25, 39):
    y = alldat_last3months.iloc[:,i]
    print(alldat_last3months.columns[i])
    res_list=cross_val_score(clf_forest, x, y, cv = 10, scoring='balanced_accuracy')
    print(res_list)
    print(sum(res_list)/10)

Loss chasing
[0.48290598 0.56944444 0.59259259 0.59615385 0.5        0.46153846
 0.57692308 0.57692308 0.65384615 0.40384615]
0.541417378917379
Fluctuating wagers
[0.48290598 0.77777778 0.67592593 0.61574074 0.71759259 0.51388889
 0.63425926 0.70833333 0.71296296 0.69230769]
0.6531695156695156
Loss of control
[0.56666667 0.6        0.53103448 0.49827586 0.43103448 0.58275862
 0.49827586 0.46551724 0.47777778 0.52777778]
0.5179118773946361
Number of days played
[0.5        0.5        0.5        0.5        0.5        0.48648649
 0.5        0.48611111 0.5        0.5       ]
0.4972597597597598
Overall consumption
[0.51666667 0.56551724 0.51551724 0.46551724 0.46551724 0.48275862
 0.49827586 0.56551724 0.53103448 0.49827586]
0.5104597701149426
Time of day
[0.49333333 0.59857143 0.55416667 0.425      0.33333333 0.5375
 0.61666667 0.57083333 0.52083333 0.61666667]
0.5266904761904762
Time consumption
[0.53535354 0.54679144 0.44919786 0.58730159 0.55555556 0.51190476
 0.50793651 0.68253968 0.55

In [59]:
# Calculate phi coefficient

clf_forest = RandomForestClassifier()

for i in range(25, 39):
    y = alldat_last3months.iloc[:,i]
    print(alldat_last3months.columns[i])
    preds=cross_val_predict(clf_forest, x, y, cv = 10)
    print(matthews_corrcoef(y, preds))

Loss chasing
0.07634938539044127
Fluctuating wagers
0.3111752786887468
Loss of control
0.06671960795565031
Number of days played
-0.022981280637945606
Overall consumption
-0.0329733641411196
Time of day
0.10636539385830512
Time consumption
0.09239188325199153
Repeated loss of winnings
0.03355406133735526
Tolerance
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
0.0
Number of deposits
0.1890890402328506
Cancelled withdrawals
0.10643331449149458
Repeated deposits on loss
0.12482678195312291
One day problem gambler
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
0.0
Runs of gambling
-0.02682339566637788
