In [1]:
# Add the parent directory to the module search path.
# NOTE(review): presumably needed so that project modules such as
# `volunteer_report` (imported further down) can be found — confirm.
import sys
sys.path.append("../")


In [30]:
# Directory that holds the volunteers' log files. DataAnalysis builds its
# paths by plain string concatenation, so the trailing "/" is required.
log_path = "../log_files/"
# Glob pattern for the log files.
# NOTE(review): this variable is not passed to DataAnalysis in the cells
# below; the class falls back to its own "*.csv" default.
files_extension = "*.csv"

In [33]:
from os import mkdir
from glob import glob
from pandas import DataFrame
from datetime import datetime
from volunteer_report import VolunteerResultsReport

class DataAnalysis:
    """Aggregate per-volunteer log results into one table and export it.

    On construction the instance:

    1. globs the log files found under ``log_path``;
    2. derives one volunteer id per log file from the file name;
    3. builds one flat record per volunteer through ``VolunteerResultsReport``
       and stores all records as a pandas ``DataFrame`` in ``volunteers_data``;
    4. tries to create the output directory ``log_path + save_dir``.

    Paths are assembled by plain string concatenation, so ``log_path`` and
    ``save_dir`` are expected to end with a path separator.

    Args:
        log_path: Directory containing the volunteers' log files.
        save_dir: Sub-directory (relative to ``log_path``) for exported files.
        find_string: Marker inside a log file name; the (up to) four
            characters right before it are parsed as the volunteer id.
        log_file_extension: Glob pattern appended to ``log_path`` to list
            the log files.
    """

    def __init__(self, log_path, save_dir="datasets/", find_string="_user", log_file_extension="*.csv"):
        self.log_path = log_path
        self.save_dir = save_dir
        self.save_path = self.log_path + self.save_dir
        self.find_string = find_string
        self.log_file_extension = log_file_extension
        self.volunteers_log_file_list = self.get_volunteers_log_file_list()
        self.volunteer_id_list = self.get_volunteers_id_list()
        self.volunteers_data = self.get_volunteers_data()
        # Paths of the most recent exports; set by the save_* methods.
        self.last_volunteers_file_name_csv = None
        self.last_volunteers_file_name_excel = None
        self.save_path_already_existed = self.create_save_dir()

    def create_save_dir(self):
        """Create ``self.save_path`` and report whether it already existed.

        Returns:
            True only when ``mkdir`` raised ``FileExistsError``. Any other
            ``OSError`` is printed and False is returned, so False does not
            guarantee that the directory was created.
        """
        save_path_already_existed = False
        try:
            mkdir(path=self.save_path)
        except FileExistsError as path_already_existed_error:
            save_path_already_existed = True
            print(path_already_existed_error)
            print("The save_path directory already exists. No need to creat it.")
        except OSError as error:
            print(error)

        return save_path_already_existed

    def get_volunteers_log_file_list(self):
        """Return the sorted list of paths matching ``log_path + log_file_extension``."""
        search_pattern = self.log_path + self.log_file_extension
        return sorted(glob(search_pattern))

    def get_volunteers_id_list(self):
        """Extract the volunteer id from every log file name.

        The id is the (up to) four characters that immediately precede
        ``find_string`` in the *base name* of the file. Only the base name is
        searched so that a directory whose name happens to contain the marker
        cannot be mistaken for it.

        Files whose name does not contain the marker, or whose id part is not
        an integer, are reported with ``print`` and skipped instead of
        aborting the whole load with an unrelated-looking ``int()`` error.

        Returns:
            List of integer volunteer ids, in the order of
            ``volunteers_log_file_list``.
        """
        # Local import: the file only brings ``mkdir`` in from ``os``.
        from os.path import basename

        volunteer_id_list = []
        for file_name in self.volunteers_log_file_list:
            base_name = basename(file_name)
            position = base_name.find(self.find_string)
            if position < 0:
                # str.find returns -1 when the marker is absent; slicing with
                # it would silently pick unrelated characters.
                print(
                    "Skipping '{}': marker '{}' not found in the file name.".format(
                        file_name, self.find_string
                    )
                )
                continue

            # Clamp at 0 so a short prefix cannot turn into a negative index.
            id_text = base_name[max(position - 4, 0):position]
            try:
                volunteer_id = int(id_text)
            except ValueError:
                print(
                    "Skipping '{}': '{}' is not a valid volunteer id.".format(
                        file_name, id_text
                    )
                )
                continue
            volunteer_id_list.append(volunteer_id)
        return volunteer_id_list

    def _get_report(self, volunteer_id, volunteer_report=None):
        """Return ``volunteer_report`` if given, otherwise build a new one."""
        if volunteer_report is None:
            volunteer_report = VolunteerResultsReport(volunteer_id, self.log_path)
        return volunteer_report

    def get_volunteer_time_record(self, volunteer_id, volunteer_report=None):
        """Return ``{"volunteer_id": id}`` merged with the report's ``duration_dict``.

        Args:
            volunteer_id: Id of the volunteer.
            volunteer_report: Optional already-built report to reuse; a new
                ``VolunteerResultsReport`` is created when omitted.
        """
        volunteer_report = self._get_report(volunteer_id, volunteer_report)
        return {"volunteer_id": volunteer_id, **volunteer_report.duration_dict}

    def get_volunteer_overall_result_record(self, volunteer_id, volunteer_report=None):
        """Return ``{"volunteer_id": id}`` merged with the report's ``overall_result_dict``.

        Args:
            volunteer_id: Id of the volunteer.
            volunteer_report: Optional already-built report to reuse; a new
                ``VolunteerResultsReport`` is created when omitted.
        """
        volunteer_report = self._get_report(volunteer_id, volunteer_report)
        return {"volunteer_id": volunteer_id, **volunteer_report.overall_result_dict}

    def get_volunteer_result_per_class_record(self, volunteer_id, volunteer_report=None):
        """Flatten the report's per-class results into a single record.

        For every entry of ``result_per_class_records`` three columns are
        produced, each prefixed with the entry's ``image_class`` value:
        ``<class>_questions_right_answer_count`` (from ``is_right_answer``),
        ``<class>_questions_total_count`` (from ``count``) and
        ``<class>_questions_right_percentage`` (from ``right_perc``).

        Args:
            volunteer_id: Id of the volunteer.
            volunteer_report: Optional already-built report to reuse; a new
                ``VolunteerResultsReport`` is created when omitted.
        """
        volunteer_report = self._get_report(volunteer_id, volunteer_report)

        volunteer_result_per_class_record = {"volunteer_id": volunteer_id}
        for record in volunteer_report.result_per_class_records:
            prefix = record["image_class"] + "_"
            volunteer_result_per_class_record[prefix + "questions_right_answer_count"] = record["is_right_answer"]
            volunteer_result_per_class_record[prefix + "questions_total_count"] = record["count"]
            volunteer_result_per_class_record[prefix + "questions_right_percentage"] = record["right_perc"]

        return volunteer_result_per_class_record

    def get_volunteer_record(self, volunteer_id):
        """Return the full flat record (overall, time, per-class) of one volunteer.

        The report is built once and shared by the three partial getters, so
        the volunteer's log is not processed three times.
        """
        volunteer_report = self._get_report(volunteer_id)

        volunteer_record = {"volunteer_id": volunteer_id}
        # Merge order matters for the resulting column order.
        volunteer_record.update(self.get_volunteer_overall_result_record(volunteer_id, volunteer_report))
        volunteer_record.update(self.get_volunteer_time_record(volunteer_id, volunteer_report))
        volunteer_record.update(self.get_volunteer_result_per_class_record(volunteer_id, volunteer_report))

        return volunteer_record

    def get_volunteers_data(self):
        """Build a ``DataFrame`` with one row per id in ``volunteer_id_list``."""
        record_list = [
            self.get_volunteer_record(volunteer_id)
            for volunteer_id in self.volunteer_id_list
        ]
        # A list of dicts is handled directly by the DataFrame constructor.
        return DataFrame(record_list)

    def get_volunteers_data_file_name(
            self,
            file_format="excel",
            file_path=None,
            file_name="volunteers_data",
            suffix_time_format="%Y_%m_%d_%H_%M_%S"
        ):
        """Compose ``<file_path><file_name>_<timestamp><extension>``.

        Any argument passed as ``None`` falls back to its default
        (``file_path`` falls back to ``self.save_path``).

        Args:
            file_format: ``"excel"`` (``.xlsx``) or ``"csv"`` (``.csv``).
            file_path: Directory prefix, concatenated as-is.
            file_name: Base name of the file, without extension.
            suffix_time_format: ``strftime`` format applied to the current
                local time to build the suffix.

        Raises:
            KeyError: If ``file_format`` is not a supported format.
        """
        file_format_extension_map = {
            "excel": ".xlsx",
            "csv": ".csv",
        }

        if file_format is None:
            file_format = "excel"

        if file_path is None:
            file_path = self.save_path

        if file_name is None:
            file_name = "volunteers_data"

        if suffix_time_format is None:
            suffix_time_format = "%Y_%m_%d_%H_%M_%S"

        suffix = datetime.now().strftime(suffix_time_format)
        extension = file_format_extension_map[file_format]

        return file_path + file_name + "_" + suffix + extension

    def save_volunteers_csv(
            self,
            file_path=None,
            file_name=None,
            suffix_time_format=None
        ):
        """Write ``volunteers_data`` to a time-stamped CSV file.

        The arguments are forwarded to ``get_volunteers_data_file_name``.
        The resulting path is stored in ``last_volunteers_file_name_csv``.

        Returns:
            The path of the file that was written.
        """
        csv_file_name = self.get_volunteers_data_file_name(
            file_format="csv",
            file_path=file_path,
            file_name=file_name,
            suffix_time_format=suffix_time_format
        )

        self.volunteers_data.to_csv(csv_file_name, index_label="dataset_index")

        self.last_volunteers_file_name_csv = csv_file_name

        return csv_file_name

    def save_volunteers_excel(
            self,
            file_path=None,
            file_name=None,
            suffix_time_format=None
        ):
        """Write ``volunteers_data`` to a time-stamped Excel file.

        The arguments are forwarded to ``get_volunteers_data_file_name``.
        The resulting path is stored in ``last_volunteers_file_name_excel``.

        Returns:
            The path of the file that was written.
        """
        excel_file_name = self.get_volunteers_data_file_name(
            file_format="excel",
            file_path=file_path,
            file_name=file_name,
            suffix_time_format=suffix_time_format
        )

        self.volunteers_data.to_excel(excel_file_name, index_label="dataset_index")

        self.last_volunteers_file_name_excel = excel_file_name

        return excel_file_name

In [34]:
# Instantiating DataAnalysis immediately lists the log files, builds the
# volunteers table and tries to create the save directory (see __init__).
da = DataAnalysis(log_path)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_answers['is_right_answer'] = mask_right.to_numpy()


[Errno 17] File exists: '../log_files/datasets/'
The save_path directory already exists. No need to creat it.


In [26]:
# Inspect the directory where exported datasets are written.
da.save_path

'../log_files/datasets/'

In [29]:
# Build a candidate path under the save directory by plain concatenation.
da.save_path+"adriano"

'../log_files/datasets/adriano'

In [28]:
# `os` itself is never imported in the cells above (only `mkdir` is pulled
# from it), so import it here to avoid a NameError on a fresh kernel.
import os

# Check whether the candidate sub-directory exists on disk.
os.path.isdir(da.save_path+"adriano")

False

In [20]:
# Rebuild the volunteers table from the log files and display it. The table
# built at construction time is also available as `da.volunteers_data`.
da.get_volunteers_data()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_answers['is_right_answer'] = mask_right.to_numpy()


Unnamed: 0,volunteer_id,images_count,right_answers_count,wrong_answers_count,overall_result_ratio,overall_result_percentage,timestamp_begin,timestamp_end,timestamp_duration,days,...,catch_stimuli_questions_right_percentage,negative_questions_right_answer_count,negative_questions_total_count,negative_questions_right_percentage,neutral_questions_right_answer_count,neutral_questions_total_count,neutral_questions_right_percentage,positive_questions_right_answer_count,positive_questions_total_count,positive_questions_right_percentage
0,1,11,8,3,0.727273,72.727273,2021-10-11 21:18:14.375571,2021-10-11 21:21:52.012878,0 days 00:03:37.637307,0,...,100.0,1,2,50.0,1,2,50.0,1,2,50.0
1,2,11,7,4,0.636364,63.636364,2021-10-11 21:27:53.624455,2021-10-11 21:34:01.558865,0 days 00:06:07.934410,0,...,100.0,1,2,50.0,0,2,0.0,1,2,50.0
2,3,11,9,2,0.818182,81.818182,2021-10-11 21:59:05.144515,2021-10-11 21:59:43.635730,0 days 00:00:38.491215,0,...,100.0,2,2,100.0,0,2,0.0,2,2,100.0
3,4,11,11,0,1.0,100.0,2021-10-11 22:00:42.498477,2021-10-11 22:01:22.825477,0 days 00:00:40.327000,0,...,100.0,2,2,100.0,2,2,100.0,2,2,100.0
4,6,11,11,0,1.0,100.0,2021-10-11 22:17:32.165523,2021-10-11 22:18:14.263110,0 days 00:00:42.097587,0,...,100.0,2,2,100.0,2,2,100.0,2,2,100.0
5,7,11,7,4,0.636364,63.636364,2021-10-11 22:39:57.451973,2021-10-11 22:47:30.953750,0 days 00:07:33.501777,0,...,100.0,0,2,0.0,2,2,100.0,0,2,0.0
6,8,11,5,6,0.454545,45.454545,2021-10-11 22:57:59.219116,2021-10-11 23:01:09.496942,0 days 00:03:10.277826,0,...,100.0,0,2,0.0,0,2,0.0,0,2,0.0
7,12,22,9,13,0.409091,40.909091,2021-10-12 01:01:56.475574,2021-10-12 01:02:30.559938,0 days 00:00:34.084364,0,...,30.0,2,4,50.0,2,4,50.0,2,4,50.0
8,13,11,8,3,0.727273,72.727273,2021-10-12 01:15:01.004832,2021-10-12 01:15:39.392578,0 days 00:00:38.387746,0,...,100.0,1,2,50.0,1,2,50.0,1,2,50.0
9,14,11,10,1,0.909091,90.909091,2021-10-12 01:24:11.450376,2021-10-12 01:24:49.892329,0 days 00:00:38.441953,0,...,100.0,1,2,50.0,2,2,100.0,2,2,100.0
