In [182]:
import pandas as pd
import numpy as np
from os import listdir
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import precision_recall_fscore_support
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

In [17]:
rootPath = '../../DATASET/RESULTS/features/clustering/'
game_types = ['single/', 'merge/']
days = ['day1/', 'day2/', 'day3/']

In [79]:
clsesHead = ["cls_gender", "cls_age2", "cls_age3", "cls_edu_highschool", "cls_edu_bachelor",
                 "cls_edu_graduate", "cls_edu_all", "cls_prof_cs", "cls_prof_business", "cls_prof_all"]

featuresHead = ["time", "time_first_touch", "time_first_action", "time_between_touches_mean", "time_between_touches_std",
     "time_between_touches_min", "time_between_touches_max", "speed_touch_mean", "speed_touch_std", "speed_touch_min",
     "speed_touch_max", "acc_touch_mean", "acc_touch_std", "acc_touch_min", "acc_touch_max", "real_distance_to_distance_mean",
     "real_distance_to_distance_std", "real_distance_to_distance_min", "real_distance_to_distance_max",
     "speed_move_mean", "speed_move_std", "speed_move_min", "speed_move_max", "acc_move_mean", "acc_move_std",
     "acc_move_min",	"acc_move_max", "move_real_distance_to_distance_mean", "move_real_distance_to_distance_std",
     "move_real_distance_to_distance_min", "move_real_distance_to_distance_max", "distance_click_object_center_mean",
     "distance_click_object_center_std", "distance_click_object_center_min", "distance_click_object_center_max",
     "distance_drop_target_center_mean", "distance_drop_target_center_std", "distance_drop_target_center_min",
     "distance_drop_target_center_max", "total_distance", "dif_time_stamp_mean", "dif_time_stamp_std",
     "dif_time_stamp_min", "dif_time_stamp_max", "move_silence_mean", "move_silence_std", "move_silence_min",
     "move_silence_max", "drag_silence_mean", "drag_silence_std", "drag_silence_min", "drag_silence_max",
     "pause_and_drag_mean", "pause_and_drag_std", "pause_and_drag_min", "pause_and_drag_max", "pause_and_drop_mean",
     "pause_and_drop_std", "pause_and_drop_min", "pause_and_drop_max", "angles_mean", "angles_std", "angles_min", "angles_max"]


        

In [184]:
# listing the users from day 1
daySelect = days[0]
game = game_types[1]

working_dir = rootPath + daySelect + game
files_day1 = set([f for f in listdir(working_dir)])

In [193]:
def prepareFeatures(inRoot, set_of_files):
    features = pd.DataFrame(columns=featuresHead)
    lbls = pd.DataFrame(columns=clsesHead)
    
    cnt = 0
    
    for f in set_of_files:
        cnt = cnt + 1
        cur_file = inRoot + f
        temp_df = pd.read_csv(cur_file)
        
        # select lbls only
        temp_lbls = temp_df.loc[:, list(clsesHead)]

        # dropping columns with classes
        temp_df.drop(clsesHead, axis=1, inplace=True)

        features = features.append(temp_df)
        lbls = lbls.append(temp_lbls)
#         print(features.shape)
#         print(lbls.shape)
#         print()
    
    return features, lbls

In [194]:
def predict(X_train, X_test, y_train, y_test):
    # Re-sampling training data
    sm = SMOTE(random_state=42)
    print('original dataset %s' %Counter(y_train))
    X_train_res, y_train_res = sm.fit_sample(X_train, y_train)
    print('resampled dataset %s' %Counter(y_train_res))
    y_train_res = y_train_res.flatten()
    
    # Feature normalization/scaling
    sc = StandardScaler()
    sc.fit(X_train_res)
    train_norm = sc.transform(X_train_res)
    test_norm = sc.transform(X_test)
    
    # dimensionality reduction
    optimal_dimen = 35
    pca = PCA(n_components = optimal_dimen)
    pca.fit(train_norm)
    train = pca.transform(train_norm)
    test = pca.transform(test_norm)
    
    # build, fit and test the model
#     rf = RandomForestClassifier(n_estimators = 400,
#                                 max_depth = 40,
#                                 max_features = 'log2',
#                                 max_leaf_nodes = 170,
#                                 min_samples_split = 0.30,
#                                 bootstrap = False)
    rf = SVC(C = 1, kernel = 'linear', probability = True)
    rf.fit(train, y_train_res)
    rf_predict = rf.predict(test)
    return rf_predict
    

In [195]:
# listing users from day 2
daySelect = days[1]
game = game_types[1]

working_dir = rootPath + daySelect + game
print(working_dir)
files_day2 = set([f for f in listdir(working_dir)])
print('Day 2 Size: %d' %len(files_day2))
print('Before removal of file. Day 1 size: %d' % len(files_day1))
print()



../../DATASET/RESULTS/features/clustering/day2/merge/
Day 2 Size: 62
Before removal of file. Day 1 size: 104



In [196]:
y_true = np.empty((0, 1), int)
y_pred = np.empty((0, 1), int)

cnt = 1;
print('after removal')
for f in files_day2:
    if f in files_day1:
        print()
        print(working_dir + f)
        test_features, test_lbls = prepareFeatures(working_dir, {f}) 
        
        files_day1.remove(f)
        print('%d. day 1 size: %d' % (cnt, len(files_day1)))
        
        train_features, train_lbls = prepareFeatures(rootPath + days[0] + game, files_day1)
        
        temp_y = test_lbls.cls_age2.values
        temp_pred = predictUsingRF(train_features, test_features, train_lbls.cls_age2.values, test_lbls.cls_age2.values)
        
        
        y_true = np.append(y_true, temp_y)
        y_pred = np.append(y_pred, temp_pred)
        
        print('Test Dataset %s' %Counter(y_true))
    
        cnt = cnt + 1
        files_day1.add(f)
    else:
        print('Not in day 1: ' + f)

after removal

../../DATASET/RESULTS/features/clustering/day2/merge/A2EOOF9D135HQ1.csv
1. day 1 size: 103
original dataset Counter({1.0: 1727, 2.0: 1061})
resampled dataset Counter({2.0: 1727, 1.0: 1727})
Test Dataset Counter({2.0: 18})

../../DATASET/RESULTS/features/clustering/day2/merge/A2FIANUBDARA16.csv
2. day 1 size: 103
original dataset Counter({1.0: 1727, 2.0: 1085})
resampled dataset Counter({2.0: 1727, 1.0: 1727})
Test Dataset Counter({2.0: 20})

../../DATASET/RESULTS/features/clustering/day2/merge/A21HSQ4CW72MNM.csv
3. day 1 size: 103
original dataset Counter({1.0: 1727, 2.0: 1061})
resampled dataset Counter({2.0: 1727, 1.0: 1727})
Test Dataset Counter({2.0: 38})

../../DATASET/RESULTS/features/clustering/day2/merge/A3AA5G6HENO6VJ.csv
4. day 1 size: 103
original dataset Counter({1.0: 1699, 2.0: 1091})
resampled dataset Counter({2.0: 1699, 1.0: 1699})
Test Dataset Counter({2.0: 38, 1.0: 16})

../../DATASET/RESULTS/features/clustering/day2/merge/A26RO8GGTQAXGG.csv
5. day 1 siz

In [192]:
print(precision_recall_fscore_support(y_true, y_pred, average='weighted'))
confusion_matrix(y_true, y_pred)


(0.5389712154603039, 0.5367114788004137, 0.5377863098311711, None)


array([[363, 229],
       [219, 156]], dtype=int64)

In [None]:
df = pd.read_csv('../../DATASET/features_all.csv')
columnHeads = list(df.columns.values)
featureHeads = list(df.columns.values[1:64])
clsHeads = list(df.columns.values[64:])

X_raw = df.loc[:, columnHeads]
y = df.cls_gender.values
y[y == 2] = 0

In [6]:
sm = SMOTE(random_state=42)
print('original dataset %s' %Counter(y))
X_res, y_res = sm.fit_sample(X_raw, y)
print('resampled dataset %s' %Counter(y_res))
y_res = y_res.flatten()

original dataset Counter({1: 5054, 0: 3566})
resampled dataset Counter({0: 5054, 1: 5054})


In [7]:
train_raw, test_raw, train_labels, test_labels = train_test_split(X_res, y_res, 
                                                          stratify = y_res,
                                                          test_size = 0.3, 
                                                          random_state = 42)

In [8]:
sc = StandardScaler()
sc.fit(train_raw)
train = sc.transform(train_raw)
test = sc.transform(test_raw)

In [9]:
# Set the parameters by cross-validation
param_grid = [{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
 {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}]


In [13]:
score = 'roc_auc'
print("# Tuning hyper-parameters for %s" % score)
print()

clf = GridSearchCV(SVC(probability = True), param_grid, cv=10,
                   scoring='%s' % score)
clf.fit(train, train_labels)

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()


# Tuning hyper-parameters for roc_auc

Best parameters set found on development set:

{'C': 1, 'kernel': 'linear'}

Grid scores on development set:

1.000 (+/-0.000) for {'C': 1, 'kernel': 'linear'}
1.000 (+/-0.000) for {'C': 10, 'kernel': 'linear'}
1.000 (+/-0.000) for {'C': 100, 'kernel': 'linear'}
1.000 (+/-0.000) for {'C': 1000, 'kernel': 'linear'}
1.000 (+/-0.000) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
1.000 (+/-0.000) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
1.000 (+/-0.000) for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
1.000 (+/-0.000) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
1.000 (+/-0.000) for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
1.000 (+/-0.000) for {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
1.000 (+/-0.000) for {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
1.000 (+/-0.000) for {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evalu

In [11]:
y_true, y_pred = test_labels, clf.predict(test)
print(classification_report(y_true, y_pred))
print()

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      1517
          1       1.00      1.00      1.00      1516

avg / total       1.00      1.00      1.00      3033


