In [19]:
import os
import numpy as np
import pandas as pd
import re

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer

from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

from sklearn.utils import class_weight

import warnings

In [20]:
class TruthfulnessCostSensitiveEstimator:

    def __init__(self):
        print('Starting Truthfulness Estimator COST-SENSITIVE Version 1.3')
        self.user_responses = self.load_survey_data()
        
        self.user_responses.drop(columns=['Prolific ID'], inplace=True)
        
        self.reordered_user_responses = self.customise_data()

        self.transformed_user_responses = self.discrete_transform()

        # self.classification('binary')
        self.classification('multi-class')
        
    def load_survey_data(self):
        df = pd.read_csv('All_Responses_Removed.csv')

        return df

    
    def customise_data(self):
        data = self.user_responses
        data.columns = data.columns.str.replace(r'[\s\n\t ]+', '-')
        data.columns = data.columns.str.replace(r'[a-d]-', '-')

        demographics_data = data.iloc[:, :8]
        demographics_data = demographics_data.reindex(sorted(demographics_data.columns), axis=1)
        question_data = data.reindex(sorted(data.columns[8:]), axis=1)
        reordered_user_responses = pd.concat([demographics_data, question_data], axis=1)

        return reordered_user_responses
    

    def discrete_transform(self):
        data = self.reordered_user_responses
        data.loc[data['Age'] <= 17, 'Age'] = 0
        data.loc[(data['Age'] > 17) & (data['Age'] <= 24), 'Age'] = 1
        data.loc[(data['Age'] > 24) & (data['Age'] <= 34), 'Age'] = 2
        data.loc[(data['Age'] > 34) & (data['Age'] <= 44), 'Age'] = 3
        data.loc[(data['Age'] > 44) & (data['Age'] <= 54), 'Age'] = 4
        data.loc[(data['Age'] > 54) & (data['Age'] <= 64), 'Age'] = 5
        data.loc[data['Age'] > 64, 'Age'] = 6

        data.loc[data['Online-Presence'] <= 5, 'Online-Presence'] = 0
        data.loc[(data['Online-Presence'] > 5) &
                (data['Online-Presence'] <= 10), 'Online-Presence'] = 1
        data.loc[(data['Online-Presence'] > 10) &
                (data['Online-Presence'] <= 15), 'Online-Presence'] = 2
        data.loc[(data['Online-Presence'] > 15) &
                (data['Online-Presence'] <= 20), 'Online-Presence'] = 3
        data.loc[(data['Online-Presence'] > 20) &
                (data['Online-Presence'] <= 25), 'Online-Presence'] = 4

        return data


    def impute_data(self, data):
        """Fill NaN entries using a mean-strategy ``SimpleImputer``.

        A new imputer is fitted on ``data`` itself on every call.

        Parameters
        ----------
        data : pandas.DataFrame
            Frame that may contain ``np.nan`` entries.

        Returns
        -------
        pandas.DataFrame
            The imputed values wrapped in a frame that reuses the column
            labels and index of ``data``.
        """
        mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
        filled_values = mean_imputer.fit_transform(data)

        return pd.DataFrame(filled_values, columns=data.columns, index=data.index)


    def data_transformation(self, data):
        """Standardise ``data`` and then apply a power transform.

        Both steps use freshly created scikit-learn objects with default
        settings, fitted on the data passed in.

        Parameters
        ----------
        data : pandas.DataFrame
            Frame to transform.

        Returns
        -------
        tuple
            ``(scaler, transformer, transformed_data)``: the fitted
            ``StandardScaler``, the fitted ``PowerTransformer`` and the
            transformed values as a frame with the labels of ``data``.
        """
        def as_frame(values):
            # Transformer output is re-wrapped with the input's labels.
            return pd.DataFrame(values, columns=data.columns, index=data.index)

        scaler = StandardScaler()
        standard_data = as_frame(scaler.fit_transform(data))

        transformer = PowerTransformer()
        transformed_data = as_frame(transformer.fit_transform(standard_data))

        return scaler, transformer, transformed_data


    def data_inverse_transformation(self, scaler_object, transformer_object, data):
        inverse_transformed_data = pd.DataFrame(transformer_object.inverse_transform(data),
                                                columns=data.columns, index=data.index)
        inverse_scaled_data = pd.DataFrame(scaler_object.inverse_transform(inverse_transformed_data),
                                        columns=data.columns, index=data.index)

        return inverse_scaled_data
    
    def classification(self, clf_type):
        """Collect per-question class statistics and balanced class weights.

        For each of the 50 questions, the demographic columns plus that
        question's four columns are split 70/30 (stratified on the question's
        '-Truthfulness' label, ``random_state=42``) and balanced class weights
        are computed on the training part. The collected tables are printed;
        nothing is returned. Estimator fitting is currently disabled (see the
        commented-out pipeline inside the loop).

        The label recoding is done on a copy, so
        ``self.transformed_user_responses`` is left untouched and the method
        can be called repeatedly, with either ``clf_type``.

        Parameters
        ----------
        clf_type : {'binary', 'multi-class'}
            'binary': the label is True where the answer equals 7, otherwise
            False; the training split is passed through ``self.impute_data``.
            'multi-class': answers 1-3 become 0, 4-6 become 1 and 7 becomes 2;
            rows with a missing value in any selected column are dropped.

        Raises
        ------
        ValueError
            If ``clf_type`` is not one of the two supported values.
        """
        if clf_type not in ('binary', 'multi-class'):
            raise ValueError(
                "clf_type must be 'binary' or 'multi-class', got %r" % (clf_type,))

        class_weights = {}
        class_count = {}
        class_count_ratio = {}
        multi_class_labels = [0.0, 1.0, 2.0]

        # Every 4th column starting at position 10 is recoded as a label.
        # NOTE(review): this assumes the layout produced by customise_data
        # (8 demographic columns, then 4 alphabetically sorted columns per
        # question, '-Truthfulness' being the third) -- confirm against the CSV.
        label_columns = slice(10, 207, 4)

        # Copy first: recoding the shared frame in place would make a second
        # call recode already-recoded labels (e.g. class 1 -> 0).
        data = self.transformed_user_responses.copy()
        if clf_type == 'binary':
            data.iloc[:, label_columns] = (data.iloc[:, label_columns] == 7.0)
        else:
            data.iloc[:, label_columns] = data.iloc[:, label_columns].replace([1.0, 2.0, 3.0], np.float64(0))
            data.iloc[:, label_columns] = data.iloc[:, label_columns].replace([4.0, 5.0, 6.0], np.float64(1))
            data.iloc[:, label_columns] = data.iloc[:, label_columns].replace([7.0], np.float64(2))

        demographics_column_indexes = ['Age', 'Gender', 'IUIPC-Awareness', 'IUIPC-Collection', 'IUIPC-Control',
                                       'Online-Presence', 'Personal-Stability', 'Reciprocity']

        for question_number in range(1, 51):
            question_prefix = str(question_number).zfill(2)
            question = 'Q' + question_prefix
            question_label = question_prefix + '-Truthfulness'

            # Demographics plus this question's own four columns.
            relevant_indexes = demographics_column_indexes + [
                question_prefix + '-Effort',
                question_prefix + '-Relevance',
                question_prefix + '-Uncomfortable',
                question_label]

            if clf_type == 'binary':
                # test_data_question is held out but not used yet.
                train_data_question, test_data_question = train_test_split(
                    data[relevant_indexes], stratify=data[question_label],
                    test_size=0.3, random_state=42)
                train_x_question = self.impute_data(train_data_question.copy())
            else:
                cleaned_user_responses = data.loc[:, relevant_indexes].dropna()
                train_data_question, test_data_question = train_test_split(
                    cleaned_user_responses,
                    stratify=cleaned_user_responses[question_label],
                    test_size=0.3, random_state=42)
                train_x_question = train_data_question.copy()

                # Reindex so that a class missing from the training split is
                # reported as 0 instead of raising IndexError.
                label_sizes = (train_x_question.groupby(by=question_label).size()
                               .reindex(multi_class_labels, fill_value=0)
                               .tolist())
                class_count[question] = label_sizes
                class_count_ratio[question] = [
                    round((size / len(train_data_question)) * 100, 2)
                    for size in label_sizes]

            # `classes` and `y` are keyword-only in current scikit-learn.
            present_classes = np.unique(train_x_question[question_label])
            computed_weights = class_weight.compute_class_weight(
                class_weight='balanced', classes=present_classes,
                y=train_x_question[question_label]).tolist()
            weight_by_class = dict(zip(present_classes.tolist(), computed_weights))

            # A class that does not occur in the training split gets NaN.
            if clf_type == 'binary':
                class_weights[question] = {0: weight_by_class.get(0, np.nan),
                                           1: weight_by_class.get(1, np.nan)}
            else:
                class_weights[question] = [weight_by_class.get(label, np.nan)
                                           for label in multi_class_labels]

            # --- Disabled training pipeline (kept for reference) -------------
            # Re-enabling it requires `important_features = {}` and
            # `question_estimator = {}` to be initialised before the loop.
            #
            # train_y_question = train_x_question.loc[:, question_label]
            # train_x_question.drop(columns=question_label, inplace=True)
            #
            # train_scaler_question, train_transformer_question, transformed_train_x_question = \
            #     self.data_transformation(train_x_question)
            #
            # rf = RandomForestClassifier(n_estimators=300, random_state=42)
            # rf.fit(transformed_train_x_question, train_y_question)
            #
            # importances = rf.feature_importances_
            # std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)
            # indices = np.argsort(importances)[::-1]
            #
            # important_features[question] = indices[0:3].tolist()
            #
            # featured_train_data = transformed_train_x_question.iloc[:, important_features[question]]
            # try:
            #     best_estimator, best_estimator_params = self.find_clf_parameters(featured_train_data,
            #     train_y_question, class_weights[question], clf_type)
            # except ValueError as e:
            #     print("Value Error: " + str(e))
            #     print('Record skipped.')
            #     continue
            #
            # question_estimator[question] = [best_estimator, best_estimator_params]

        if clf_type == 'multi-class':
            # print(pd.DataFrame.from_dict(class_count_ratio, orient='index', columns=['Class-0', 'Class-1', 'Class-2']))
            print(pd.DataFrame.from_dict(class_count, orient='index', columns=['Class-0', 'Class-1', 'Class-2']))
            weight_rows = class_weights
            weight_columns = ['Weight-0', 'Weight-1', 'Weight-2']
        else:
            # Class counts are only collected for the multi-class variant.
            # The binary weights are stored as {0: w0, 1: w1}; flatten them to
            # rows so they line up with the two weight columns.
            weight_rows = {name: [weights[0], weights[1]]
                           for name, weights in class_weights.items()}
            weight_columns = ['Weight-0', 'Weight-1']
        print(pd.DataFrame.from_dict(weight_rows, orient='index', columns=weight_columns))

        # --- Disabled result export (kept for reference) ---------------------
        # if not os.path.isdir(RESULTS_DIR):
        #     os.makedirs(RESULTS_DIR)
        #
        # filename = os.path.join(
        #     RESULTS_DIR, clf_type + '_cost_sensitive_estimator_parameters_1.3.txt')
        #
        # with open(filename, 'w') as f:
        #     print(question_estimator, file=f)

In [21]:
# Instantiating the class runs the whole pipeline as a side effect: __init__
# loads the CSV, preprocesses it and calls classification('multi-class'),
# which prints its summary tables.
cost_sensitive_estimator = TruthfulnessCostSensitiveEstimator()

Starting Truthfulness Estimator COST-SENSITIVE Version 1.3
     Class-0  Class-1  Class-2
Q01    23.50    17.05    59.45
Q02     1.23     2.88    95.88
Q03    15.06    10.88    74.06
Q04    67.65     8.82    23.53
Q05     2.37     6.16    91.47
Q06     6.01    13.30    80.69
Q07    22.22    15.74    62.04
Q08    45.19    20.08    34.73
Q09    16.44    17.81    65.75
Q10    23.31    16.53    60.17
Q11    33.64    18.22    48.13
Q12    66.10    11.86    22.03
Q13    41.20    17.60    41.20
Q14    46.41    15.19    38.40
Q15    54.42     9.77    35.81
Q16     1.21     5.26    93.52
Q17     1.72    24.46    73.82
Q18     1.67    10.46    87.87
Q19     3.35     7.95    88.70
Q20     1.84     8.29    89.86
Q21     3.60    17.57    78.83
Q22     6.61    29.75    63.64
Q23     1.38    10.09    88.53
Q24     7.92    19.17    72.92
Q25     1.68    11.34    86.97
Q26    14.35    35.19    50.46
Q27     2.93    20.50    76.57
Q28     0.41    10.33    89.26
Q29     2.78    15.28    81.94
Q30    33.0