In [30]:
import os
import sys
import collections
import pickle
import json

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import Normalizer
from scipy import stats
import statistics

In [31]:
# Notebook-wide accumulators that the helper functions below read and mutate.
# Word tables for the training and test splits; columns are added by later cells.
sensory_words_traindf, sensory_words_testdf = pd.DataFrame(), pd.DataFrame()
# Maps a cluster word (or a combined word) to its embedding vector.
words_embedding_dict = dict()

In [32]:
def get_assigned_words(seq_clusters, cluster_words, axis, flag_train=False):
    """Translate cluster ids into cluster words and store them as column ``axis``.

    Parameters
    ----------
    seq_clusters : array-like of int
        Cluster id of every sample; any shape, it is flattened before use.
    cluster_words : sequence
        ``cluster_words[i]`` is the word assigned to cluster id ``i``.
    axis : str
        Name of the column written to the global word table.
    flag_train : bool, default False
        When True the words go to ``sensory_words_traindf`` and are also
        returned; otherwise they go to ``sensory_words_testdf``.

    Returns
    -------
    pandas.DataFrame or None
        One-column frame (``'cluster_word'``) for the training split, ``None``
        for the test split.

    Raises
    ------
    ValueError
        If an id has no corresponding entry in ``cluster_words``.
    """
    cluster_ids = np.asarray(seq_clusters).reshape(-1)
    # dtype=object keeps the words as given instead of a fixed-width string type.
    vocabulary = np.asarray(cluster_words, dtype=object)

    # Fail loudly on ids without a word; negative ids would otherwise wrap around.
    if cluster_ids.size and (cluster_ids.min() < 0 or cluster_ids.max() >= len(vocabulary)):
        raise ValueError(
            f'cluster ids must lie in [0, {len(vocabulary)}); '
            f'got range [{cluster_ids.min()}, {cluster_ids.max()}]')

    # A single vectorised lookup replaces one np.where pass per cluster.
    assigned_words = vocabulary[cluster_ids]

    if flag_train:
        sensory_words_traindf[axis] = assigned_words
        return pd.DataFrame(data=assigned_words, columns=['cluster_word'])

    sensory_words_testdf[axis] = assigned_words
    return None


def clustering(statistic_train_df, statistic_test_df, axis, cluster_cnts, cluster_words):
    """Cluster one channel with k-means and return one centroid row per word.

    The model is fitted on the normalised training samples only. Both splits
    are then labelled with it, and the resulting words are stored in the global
    word tables through ``get_assigned_words`` (column ``axis``).

    Parameters
    ----------
    statistic_train_df, statistic_test_df : array-like
        Per-sample feature vectors of the channel (2-D, samples in rows).
    axis : str
        Channel name, used as the column name in the global word tables.
    cluster_cnts : int
        Number of k-means clusters.
    cluster_words : sequence of str
        Word of every cluster id, in id order.

    Returns
    -------
    pandas.DataFrame
        Column ``'cluster_word'`` followed by ``dim_0 .. dim_{d-1}`` holding
        the centroid coordinates, one row per cluster in id order.
    """
    # NOTE(review): Normalizer is assumed stateless (per-row scaling), so the
    # separate fit on the test split should not leak information - confirm.
    train_features = Normalizer().fit_transform(np.array(statistic_train_df))
    test_features = Normalizer().fit_transform(np.array(statistic_test_df))

    model = KMeans(n_clusters=cluster_cnts,
                   random_state=234).fit(train_features)

    # Column vectors, the same shape the ids had when they came out of a DataFrame.
    train_ids = model.predict(train_features).reshape(-1, 1)
    test_ids = model.predict(test_features).reshape(-1, 1)
    get_assigned_words(train_ids, cluster_words, axis, flag_train=True)
    get_assigned_words(test_ids, cluster_words, axis)

    # Build the word -> centroid table straight from the fitted centres.
    # Expanding the centres to one row per training sample and de-duplicating
    # afterwards was wasteful and dropped clusters without a training sample,
    # which made the later ``values[0]`` lookup fail.
    word_column = pd.DataFrame(
        data=list(cluster_words)[:cluster_cnts], columns=['cluster_word'])
    centroid_columns = pd.DataFrame(
        model.cluster_centers_,
        columns=[f'dim_{val}' for val in range(train_features.shape[1])])

    return pd.concat([word_column, centroid_columns], axis=1)


def generate_cluster_names(sequence_names, cluster_cnt=100):
    """Build the cluster vocabulary of every channel.

    Returns a dict that maps each entry of ``sequence_names`` to the list
    ``['<name>_0', '<name>_1', ..., '<name>_<cluster_cnt - 1>']``.
    """
    return {
        channel: [channel + '_' + str(cluster_id)
                  for cluster_id in range(cluster_cnt)]
        for channel in sequence_names
    }


def cluster_word_sort(axis_clusters, cluster_names):
    """Select the rows labelled ``cluster_names`` and drop the label column.

    ``axis_clusters`` must have a ``'cluster_word'`` column; every column after
    the first one is returned for the matching rows.
    """
    is_requested_word = axis_clusters['cluster_word'] == cluster_names
    matching_rows = axis_clusters[is_requested_word]
    # The first column is the word itself; the remaining ones are kept.
    return matching_rows.iloc[:, 1:]


def perform_clustering(statistics_train, statistics_test, channels, cluster_cnts, words_generation_flag=False):
    """Cluster every channel, build the word embeddings and write all outputs.

    For each channel the features are clustered (``clustering``), and the
    centroid of every cluster word is stored in ``words_embedding_dict``.
    Afterwards the stop-word list is written. Then either the combined words
    plus a JSON embedding file (``words_generation_flag`` true) or a plain
    centroid text file is produced. Finally the train and test documents are
    written from every column of the global word tables after the first two.

    Parameters
    ----------
    statistics_train, statistics_test : sequences
        Per-channel feature collections, aligned with ``channels``.
    channels : sequence of str
        Channel names.
    cluster_cnts : int
        Number of clusters (and words) per channel.
    words_generation_flag : bool, default False
        Whether to generate the combined inter-channel words.
    """
    vocabulary = generate_cluster_names(channels, cluster_cnts)
    centroid_statistic = []

    for train_stats, test_stats, channel in zip(statistics_train, statistics_test, channels):
        channel_words = vocabulary[channel]
        channel_clusters = clustering(
            train_stats, test_stats, channel, cluster_cnts, channel_words)

        for word in channel_words:
            word_centroid = cluster_word_sort(channel_clusters, word)
            centroid_statistic.append(word_centroid)
            # The first matching row is the embedding of the word.
            words_embedding_dict[word] = word_centroid.values[0].tolist()

    # stop words generation
    stop_words_generation(channels)

    if not words_generation_flag:
        embeddings_filepath = os.getcwd() + \
            '/../data/sub_sequence_output/word_embeddings_from_clusters.txt'
        pd.concat(centroid_statistic).to_csv(
            embeddings_filepath, index=False, header=False)
    else:
        # new inter-channel words for the training split, then for the test split
        new_words_generation(channels, flag_train=True)
        new_words_generation(channels)
        embeddings_filepath = os.getcwd() + \
            '/../data/sub_sequence_output/word_embeddings_from_clusters.json'
        with open(embeddings_filepath, 'w') as fp:
            json.dump(words_embedding_dict, fp)

    # writing train documents, then test documents, to text files
    write_clustering_output(sensory_words_traindf.columns[2:], flag_train=True)
    write_clustering_output(sensory_words_testdf.columns[2:])

    print(f'Finished generate_subsequences_uci_har  : {cluster_cnts} ')

In [33]:
def stop_words_generation(channels):
    """Collect the words that occur in more than three activities and pickle them.

    For every channel the training word table is grouped by word and the number
    of distinct ``activityID`` values per word is counted. Words seen in more
    than three activities are appended to the stop-word list, which is written
    to ``<cwd>/../data/stopwords.pkl``.
    """
    stop_word_list = []

    for channel in channels:
        activities_per_word = (
            sensory_words_traindf[[channel, 'activityID']]
            .groupby(channel)['activityID']
            .nunique()
        )
        occurs_in_many_activities = activities_per_word > 3
        stop_word_list.extend(
            activities_per_word[occurs_in_many_activities].keys().tolist())

    stopwords_path = os.getcwd() + '/../data/stopwords.pkl'
    with open(stopwords_path, 'wb') as f:
        pickle.dump(stop_word_list, f)


def _write_subject_documents(words_df, channels, path_prefix):
    """Join the word columns into documents and write one text file per subject."""
    # _combine individual words as documents
    words_df['final_sub_sequence'] = words_df[channels].apply(
        lambda row: ' '.join(row.values.astype(str)), axis=1)

    # _save the combined values to text files
    for subject in words_df['subject_id'].unique():
        subject_rows = words_df.loc[(words_df['subject_id'] == subject)]
        # Only the activity of the subject's first row ends up in the file name.
        activity = subject_rows['activityID'].values[0]
        output_filepath = os.getcwd() + path_prefix + \
            str(subject) + '_' + str(activity) + '.txt'
        subject_rows[['final_sub_sequence']].to_csv(
            output_filepath, sep='\t', index=False, header=False)


def write_clustering_output(channels, flag_train=False):
    """Write the word documents of the training or the test split.

    Adds a ``'final_sub_sequence'`` column to the selected global word table
    (the space-joined values of ``channels``) and writes it, one file per
    subject, below ``<cwd>/../data/documents/train`` or ``.../test``.
    """
    if flag_train:
        _write_subject_documents(
            sensory_words_traindf, channels,
            '/../data/documents/train/activity_subseq_')
    else:
        _write_subject_documents(
            sensory_words_testdf, channels,
            '/../data/documents/test/activity_subseq_')


def form_words4(row, flag_train=False):
    """Concatenate the four words of ``row`` into one combined word.

    Parameters
    ----------
    row : pandas.Series
        The four single-channel words to combine, in order.
    flag_train : bool, default False
        When True, the embedding of the combined word is also stored in
        ``words_embedding_dict`` as the element-wise sum of the four component
        embeddings.

    Returns
    -------
    str
        The four words joined without a separator.
    """
    words = row.values
    if flag_train:
        # Look up all four components. The fourth vector previously re-used
        # the third word, so the fourth word never contributed to the sum.
        first, second, third, fourth = (
            words_embedding_dict[words[position]] for position in range(4))
        words_embedding_dict[words[0] + words[1] + words[2] + words[3]] = [
            v1 + v2 + v3 + v4
            for v1, v2, v3, v4 in zip(first, second, third, fourth)
        ]

    return ''.join(words.astype(str))


def form_words3(row, flag_train=False):
    """Concatenate the three words of ``row`` into one combined word.

    With ``flag_train`` set, the element-wise sum of the three component
    embeddings is stored in ``words_embedding_dict`` under the combined word.
    The joined string is returned in either case.
    """
    words = row.values
    if flag_train:
        first, second, third = (
            words_embedding_dict[words[position]] for position in range(3))
        combined_word = words[0] + words[1] + words[2]
        summed_embedding = []
        for a, b, c in zip(first, second, third):
            summed_embedding.append(a + b + c)
        words_embedding_dict[combined_word] = summed_embedding

    return ''.join(words.astype(str))

def new_words_generation(channels, flag_train=False):
    """Add six combined-word columns to the training or the test word table.

    Each combination lists four channel columns. Their words are merged by
    ``form_words4`` into a new column named after the concatenated channel
    names. ``channels`` is not used by the active code.
    """
    word_combinations = [
        ['X1', 'Y1', 'Y2', 'Z2'],
        ['X1', 'Z1', 'X2', 'Y2'],
        ['Y1', 'Z1', 'X2', 'Z2'],
        ['X1', 'Y1', 'X2', 'Z2'],
        ['X1', 'Z1', 'Y2', 'Z2'],
        ['Y1', 'Z1', 'X2', 'Y2'],
    ]
    words_df = sensory_words_traindf if flag_train else sensory_words_testdf

    # _adding four word combinations
    for combination in word_combinations:
        combined_column = ''.join(combination)
        # Embeddings are only recorded for the training split (flag_train).
        words_df[combined_column] = words_df[combination].apply(
            lambda row: form_words4(row, flag_train), axis=1)

    # _adding three word combinations
    # for acc_axis in channels[:3]:
    #     temp = []
    #     for gyro_axis in channels[3:]:

    #         if acc_axis[0] != gyro_axis[0]:

    #             temp.append(gyro_axis)
    #     if flag_train:
    #         sensory_words_traindf[acc_axis + temp[0] + temp[1]] = sensory_words_traindf[[acc_axis, temp[0], temp[1]]].apply(lambda row: form_words3(row, flag_train), axis=1)
    #     else:
    #         sensory_words_testdf[acc_axis + temp[0] + temp[1]] = sensory_words_testdf[[acc_axis, temp[0], temp[1]]].apply(lambda row: form_words3(row), axis=1)


def load_train_test_data(input_file_path, col_names):
    """Read a header-less CSV file and cast the two id columns to int.

    ``col_names`` supplies the column names and must contain ``'subject_id'``
    and ``'activityID'``.
    """
    id_dtypes = {'subject_id': int, 'activityID': int}
    return pd.read_csv(input_file_path, names=col_names).astype(id_dtypes)


def window_sampling(main_df, window_length, window_overlap, col_names=None):
    """Cut a recording into fixed-length windows that belong to one subject.

    Windows start every ``window_overlap`` rows. A window is kept only when all
    of its rows share the same subject id. The activity stored for a window is
    the activity of its first row, so a window spanning an activity change
    within one subject is still kept.

    Parameters
    ----------
    main_df : pandas.DataFrame
        Recording with one row per time step.
    window_length : int
        Number of consecutive rows per window.
    window_overlap : int
        Step between the start rows of consecutive windows. Despite its name it
        is used as a stride, so a value equal to ``window_length`` gives
        non-overlapping windows.
    col_names : sequence of str, optional
        Column order: subject column, activity column, then the signal
        channels. Defaults to the columns of ``main_df``, so the function no
        longer depends on a notebook-level ``col_names`` variable.

    Returns
    -------
    subject_activity_seq : numpy.ndarray
        Shape ``(2, n_windows)``; row 0 holds subject ids, row 1 activity ids.
    np_sequences : numpy.ndarray
        Shape ``(n_channels, n_windows, window_length)`` with the windowed
        signal values.

    Raises
    ------
    ValueError
        If ``window_overlap`` is not positive (the scan could never advance).
    """
    if window_overlap <= 0:
        raise ValueError('window_overlap must be a positive step size')
    if col_names is None:
        col_names = list(main_df.columns)

    # Convert every column once instead of slicing the frame for each window.
    subjects = main_df[col_names[0]].tolist()
    activities = main_df[col_names[1]].tolist()
    channel_values = [main_df[name].tolist() for name in col_names[2:]]

    window_subjects, window_activities = [], []
    channel_windows = [[] for _ in channel_values]

    last_start = len(subjects) - window_length
    for start in range(0, last_start + 1, window_overlap):
        stop = start + window_length
        # Skip windows that straddle two subjects.
        if len(set(subjects[start:stop])) != 1:
            continue

        window_subjects.append(subjects[start])
        window_activities.append(activities[start])
        for windows, values in zip(channel_windows, channel_values):
            windows.append(values[start:stop])

    # _converting into numpy arrays
    subject_activity_seq = np.asarray([window_subjects, window_activities])
    np_sequences = np.asarray(channel_windows)

    return subject_activity_seq, np_sequences


def feature_sum(vec_list):
    """Return the element-wise sum of the vectors in ``vec_list`` as a list.

    Boolean inputs are counted (``True`` counts as 1). The previous in-place
    ``+=`` on a boolean array computed a logical OR instead of a sum, which
    silently turned pooled one-hot counts into flags whenever the one-hot
    encoding was boolean.

    Raises
    ------
    IndexError
        If ``vec_list`` is empty.
    """
    vectors = np.asarray(vec_list)
    if len(vectors) == 0:
        raise IndexError('feature_sum needs at least one vector')
    if vectors.dtype == bool:
        vectors = vectors.astype(int)

    # Reducing along axis 0 adds the vectors and leaves the input untouched.
    return vectors.sum(axis=0).tolist()


def onehotenocding_pooling(subject_activity_seq, subsequences, pooling_size=3):
    """Sum the feature vectors of consecutive, non-overlapping groups of windows.

    Groups of ``pooling_size`` consecutive windows are formed. A group is kept
    only when all of its windows belong to the same subject, and its activity
    is the activity of the first window in the group.

    Parameters
    ----------
    subject_activity_seq : numpy.ndarray
        Shape ``(2, n_windows)``; row 0 holds subject ids, row 1 activity ids.
    subsequences : sequence
        One entry per channel, each indexable by window and yielding that
        window's feature vector. The channel count is taken from this argument
        rather than from a notebook-level ``col_names`` variable.
    pooling_size : int, default 3
        Number of windows summed into one pooled vector.

    Returns
    -------
    subject_activity_seq : numpy.ndarray
        Shape ``(2, n_groups)`` with the subject and activity of every group.
    np_sequences : numpy.ndarray
        Pooled feature vectors, indexed by channel and then by group.

    Raises
    ------
    ValueError
        If ``pooling_size`` is not positive (the scan could never advance).
    """
    if pooling_size <= 0:
        raise ValueError('pooling_size must be a positive number of windows')

    subjects = subject_activity_seq[0]
    activities = subject_activity_seq[1]
    num_windows = subject_activity_seq.shape[1]

    pooled_subjects, pooled_activities = [], []
    pooled_channels = [[] for _ in range(len(subsequences))]

    for start in range(0, num_windows - pooling_size + 1, pooling_size):
        stop = start + pooling_size
        group_subjects = subjects[start:stop].tolist()
        # Skip groups that mix windows of different subjects.
        if len(set(group_subjects)) != 1:
            continue

        pooled_subjects.append(group_subjects[0])
        pooled_activities.append(activities[start:stop].tolist()[0])
        for channel_pool, channel_features in zip(pooled_channels, subsequences):
            channel_pool.append(feature_sum(channel_features[start:stop]))

    # _converting into numpy arrays
    np_sequences = np.asarray(pooled_channels)
    subject_activity_seq = np.asarray([pooled_subjects, pooled_activities])

    return subject_activity_seq, np_sequences


def get_kmeans_clusters(sub_sequence_train, sub_sequence_test, feature_dim):
    """Cluster the windows with k-means and one-hot encode the cluster ids.

    The model is fitted on the training windows only and then used to label
    both splits.

    Parameters
    ----------
    sub_sequence_train, sub_sequence_test : array-like
        Windows of one channel, one window per row.
    feature_dim : int
        Number of clusters, which is also the width of every one-hot vector.

    Returns
    -------
    tuple of list
        One-hot vectors (lists of ``feature_dim`` ints) for the training and
        the test windows. Position ``i`` of a vector is 1 when the window was
        assigned to cluster ``i``.
    """
    model = KMeans(n_clusters=feature_dim,
                   random_state=5).fit(sub_sequence_train)

    # Row i of the identity matrix is the one-hot vector of cluster i. Indexing
    # it always yields ``feature_dim`` columns in cluster-id order. Encoding
    # each split separately with get_dummies dropped the columns of clusters a
    # split never hit, which misaligned train and test features.
    one_hot_rows = np.eye(feature_dim, dtype=int)
    one_hot_features_train = one_hot_rows[model.predict(sub_sequence_train)]
    one_hot_features_test = one_hot_rows[model.predict(sub_sequence_test)]

    return one_hot_features_train.tolist(), one_hot_features_test.tolist()

In [34]:
# --- Pipeline configuration -------------------------------------------------
# feature_dim: number of first-stage k-means clusters, i.e. the width of the
#              one-hot vector produced per window (see get_kmeans_clusters).
# pooling_size: number of consecutive windows summed into one pooled vector.
# cluster_cnts: number of second-stage clusters (words) per channel.
# window_length / window_overlap: rows per window and step between window
#              starts; window_sampling uses the "overlap" as a stride, so equal
#              values give non-overlapping windows.
feature_dim = 5
pooling_size = 2
cluster_cnts = 8
window_length = 16
window_overlap = 16

# Input files are resolved relative to the current working directory.
train_file_path = os.getcwd() + f'/../data/output_csv/processed_data_train.csv'
test_file_path = os.getcwd() + f'/../data/output_csv/processed_data_test.csv'
# Column order matters: subject id, activity id, then the six signal channels.
col_names = ['subject_id', 'activityID',
             'X1', 'Y1', 'Z1', 'X2', 'Y2', 'Z2']

train_df = load_train_test_data(train_file_path, col_names)
test_df = load_train_test_data(test_file_path, col_names)

print('Starting windowing features....')

# Stage 1: cut both recordings into single-subject windows.
train_subject_activity_seq, train_subsequences = window_sampling(
    train_df, window_length=window_length, window_overlap=window_overlap)
test_subject_activity_seq, test_subsequences = window_sampling(
    test_df, window_length=window_length, window_overlap=window_overlap)

# Every kept window must have both an id pair and a signal slice.
assert train_subject_activity_seq.shape[1] == train_subsequences.shape[1]
assert test_subject_activity_seq.shape[1] == test_subsequences.shape[1]

pooled_features_train = []
pooled_features_test = []

print('Starting feature clustering and pooling ....')
# Stage 2: one k-means model per channel; 6 is the number of signal channels
# listed in col_names[2:].
for idx in range(6):
    train_data, test_data = get_kmeans_clusters(
        train_subsequences[idx], test_subsequences[idx], feature_dim)

    pooled_features_train.append(train_data)
    pooled_features_test.append(test_data)

pooled_features_train = np.array(pooled_features_train)
pooled_features_test = np.array(pooled_features_test)

# Stage 3: sum the one-hot vectors of every pooling_size consecutive windows.
train_subject_activity_seq_pooled, train_subsequences_pooled = onehotenocding_pooling(
    train_subject_activity_seq, pooled_features_train, pooling_size)
test_subject_activity_seq_pooled, test_subsequences_pooled = onehotenocding_pooling(
    test_subject_activity_seq, pooled_features_test, pooling_size)

assert train_subject_activity_seq_pooled.shape[1] == train_subsequences_pooled.shape[1]
assert test_subject_activity_seq_pooled.shape[1] == test_subsequences_pooled.shape[1]

# The id columns must be the first two columns of the global word tables:
# perform_clustering treats every column from the third one on as a word column.
sensory_words_traindf['subject_id'] = train_subject_activity_seq_pooled[0]
sensory_words_traindf['activityID'] = train_subject_activity_seq_pooled[1]

sensory_words_testdf['subject_id'] = test_subject_activity_seq_pooled[0]
sensory_words_testdf['activityID'] = test_subject_activity_seq_pooled[1]

print(
    f'Finished feature extraction  : {cluster_cnts}, {window_length}, {window_overlap} ')

print(
    f'Starting Clustering  : {cluster_cnts}, {window_length}, {window_overlap} ')

# Stage 4: cluster the pooled features into words and write all outputs.
perform_clustering(train_subsequences_pooled, test_subsequences_pooled,
                   channels=col_names[2:], cluster_cnts=cluster_cnts, words_generation_flag=True)

print('Ending histogram features....')

Starting windowing features....
Starting feature clustering and pooling ....
Finished feature extraction  : 8, 16, 16 
Starting Clustering  : 8, 16, 16 
Finished generate_subsequences_uci_har  : 8 
Ending histogram features....


In [35]:
# Preview the first rows of the training word table built by the pipeline cell.
sensory_words_traindf.head()

Unnamed: 0,subject_id,activityID,X1,Y1,Z1,X2,Y2,Z2,X1Y1Y2Z2,X1Z1X2Y2,Y1Z1X2Z2,X1Y1X2Z2,X1Z1Y2Z2,Y1Z1X2Y2,final_sub_sequence
0,1,5,X1_0,Y1_4,Z1_5,X2_0,Y2_0,Z2_0,X1_0Y1_4Y2_0Z2_0,X1_0Z1_5X2_0Y2_0,Y1_4Z1_5X2_0Z2_0,X1_0Y1_4X2_0Z2_0,X1_0Z1_5Y2_0Z2_0,Y1_4Z1_5X2_0Y2_0,X1_0 Y1_4 Z1_5 X2_0 Y2_0 Z2_0 X1_0Y1_4Y2_0Z2_0...
1,1,5,X1_0,Y1_4,Z1_5,X2_0,Y2_0,Z2_0,X1_0Y1_4Y2_0Z2_0,X1_0Z1_5X2_0Y2_0,Y1_4Z1_5X2_0Z2_0,X1_0Y1_4X2_0Z2_0,X1_0Z1_5Y2_0Z2_0,Y1_4Z1_5X2_0Y2_0,X1_0 Y1_4 Z1_5 X2_0 Y2_0 Z2_0 X1_0Y1_4Y2_0Z2_0...
2,1,5,X1_0,Y1_4,Z1_5,X2_0,Y2_0,Z2_0,X1_0Y1_4Y2_0Z2_0,X1_0Z1_5X2_0Y2_0,Y1_4Z1_5X2_0Z2_0,X1_0Y1_4X2_0Z2_0,X1_0Z1_5Y2_0Z2_0,Y1_4Z1_5X2_0Y2_0,X1_0 Y1_4 Z1_5 X2_0 Y2_0 Z2_0 X1_0Y1_4Y2_0Z2_0...
3,1,5,X1_0,Y1_4,Z1_5,X2_0,Y2_0,Z2_0,X1_0Y1_4Y2_0Z2_0,X1_0Z1_5X2_0Y2_0,Y1_4Z1_5X2_0Z2_0,X1_0Y1_4X2_0Z2_0,X1_0Z1_5Y2_0Z2_0,Y1_4Z1_5X2_0Y2_0,X1_0 Y1_4 Z1_5 X2_0 Y2_0 Z2_0 X1_0Y1_4Y2_0Z2_0...
4,2,5,X1_0,Y1_4,Z1_5,X2_0,Y2_0,Z2_0,X1_0Y1_4Y2_0Z2_0,X1_0Z1_5X2_0Y2_0,Y1_4Z1_5X2_0Z2_0,X1_0Y1_4X2_0Z2_0,X1_0Z1_5Y2_0Z2_0,Y1_4Z1_5X2_0Y2_0,X1_0 Y1_4 Z1_5 X2_0 Y2_0 Z2_0 X1_0Y1_4Y2_0Z2_0...


In [36]:
# Number of distinct cluster words each activity uses, per channel.
# The columns are selected with a list: tuple-style selection on a groupby
# (``[...]['X1', 'Y1', ...]``) is deprecated and fails on current pandas.
sensory_words_traindf[['X1', 'Y1', 'Z1', 'X2', 'Y2', 'Z2', 'activityID']].groupby('activityID')[['X1', 'Y1', 'Z1', 'X2', 'Y2', 'Z2']].nunique()

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,X1,Y1,Z1,X2,Y2,Z2
activityID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,7,6,7,8,8,8
2,7,7,7,8,8,8
3,7,6,7,8,8,8
4,3,5,8,7,8,8
5,3,5,7,8,7,8
6,1,7,8,8,6,6


In [37]:
# One six-channel word table per activity id (1 through 6) of the training split.
(sensory_words_traindf_1, sensory_words_traindf_2, sensory_words_traindf_3,
 sensory_words_traindf_4, sensory_words_traindf_5, sensory_words_traindf_6) = [
    sensory_words_traindf.loc[(sensory_words_traindf['activityID'] == activity_id)][
        ['X1', 'Y1', 'Z1', 'X2', 'Y2', 'Z2']]
    for activity_id in (1, 2, 3, 4, 5, 6)
]

In [38]:
# How often each six-channel word combination occurs in activity 1 (training split).
sensory_words_traindf_1.value_counts()

X1    Y1    Z1    X2    Y2    Z2  
X1_5  Y1_5  Z1_4  X2_4  Y2_0  Z2_5    14
      Y1_4  Z1_5  X2_4  Y2_0  Z2_1    14
            Z1_4  X2_3  Y2_6  Z2_2    14
            Z1_5  X2_3  Y2_0  Z2_2    13
X1_2  Y1_1  Z1_5  X2_6  Y2_2  Z2_3    13
                                      ..
X1_3  Y1_4  Z1_2  X2_6  Y2_7  Z2_4     1
X1_4  Y1_5  Z1_1  X2_3  Y2_6  Z2_5     1
                  X2_6  Y2_2  Z2_0     1
X1_3  Y1_4  Z1_1  X2_7  Y2_1  Z2_4     1
X1_5  Y1_4  Z1_1  X2_6  Y2_2  Z2_7     1
Length: 2028, dtype: int64

In [39]:
# How often each six-channel word combination occurs in activity 2 (training split).
sensory_words_traindf_2.value_counts()

X1    Y1    Z1    X2    Y2    Z2  
X1_3  Y1_1  Z1_1  X2_5  Y2_6  Z2_2    13
X1_4  Y1_5  Z1_4  X2_2  Y2_7  Z2_1    11
            Z1_6  X2_0  Y2_2  Z2_3     8
X1_5  Y1_5  Z1_1  X2_7  Y2_1  Z2_0     8
      Y1_1  Z1_3  X2_4  Y2_7  Z2_1     8
                                      ..
X1_3  Y1_5  Z1_2  X2_7  Y2_2  Z2_6     1
X1_5  Y1_1  Z1_5  X2_2  Y2_5  Z2_3     1
X1_2  Y1_1  Z1_5  X2_4  Y2_1  Z2_0     1
X1_5  Y1_5  Z1_4  X2_3  Y2_5  Z2_4     1
      Y1_1  Z1_5  X2_0  Y2_1  Z2_4     1
Length: 2102, dtype: int64

In [40]:
# How often each six-channel word combination occurs in activity 3 (training split).
sensory_words_traindf_3.value_counts()

X1    Y1    Z1    X2    Y2    Z2  
X1_4  Y1_4  Z1_5  X2_1  Y2_6  Z2_6    6
X1_3  Y1_5  Z1_3  X2_3  Y2_6  Z2_2    6
X1_2  Y1_1  Z1_1  X2_2  Y2_7  Z2_4    6
      Y1_4  Z1_5  X2_4  Y2_2  Z2_2    6
      Y1_1  Z1_3  X2_5  Y2_5  Z2_1    6
                                     ..
X1_4  Y1_4  Z1_7  X2_0  Y2_4  Z2_7    1
      Y1_1  Z1_5  X2_2  Y2_2  Z2_0    1
                        Y2_1  Z2_3    1
X1_5  Y1_1  Z1_5  X2_4  Y2_3  Z2_2    1
            Z1_4  X2_7  Y2_1  Z2_5    1
Length: 2049, dtype: int64

In [41]:
# How often each six-channel word combination occurs in activity 4 (training split).
sensory_words_traindf_4.value_counts()

X1    Y1    Z1    X2    Y2    Z2  
X1_0  Y1_2  Z1_2  X2_0  Y2_0  Z2_0    1256
            Z1_5  X2_0  Y2_0  Z2_0    1077
X1_6  Y1_0  Z1_2  X2_0  Y2_0  Z2_0     512
X1_0  Y1_2  Z1_4  X2_0  Y2_0  Z2_0     415
      Y1_4  Z1_4  X2_0  Y2_0  Z2_0     296
                                      ... 
                  X2_4  Y2_0  Z2_7       1
      Y1_0  Z1_2  X2_7  Y2_6  Z2_7       1
      Y1_4  Z1_5  X2_1  Y2_7  Z2_7       1
                  X2_7  Y2_0  Z2_7       1
      Y1_2  Z1_7  X2_0  Y2_5  Z2_0       1
Length: 125, dtype: int64

In [42]:
# How often each six-channel word combination occurs in activity 5 (training split).
sensory_words_traindf_5.value_counts()

X1    Y1    Z1    X2    Y2    Z2  
X1_0  Y1_4  Z1_5  X2_0  Y2_0  Z2_0    2089
            Z1_4  X2_0  Y2_0  Z2_0     698
      Y1_5  Z1_4  X2_0  Y2_0  Z2_0     639
            Z1_5  X2_0  Y2_0  Z2_0     525
      Y1_2  Z1_5  X2_0  Y2_0  Z2_0     355
                                      ... 
      Y1_5  Z1_4  X2_0  Y2_0  Z2_7       1
                  X2_1  Y2_0  Z2_1       1
                        Y2_6  Z2_0       1
      Y1_2  Z1_5  X2_3  Y2_7  Z2_0       1
            Z1_7  X2_0  Y2_7  Z2_7       1
Length: 115, dtype: int64

In [43]:
# How often each six-channel word combination occurs in activity 6 (training split).
sensory_words_traindf_6.value_counts()

X1    Y1    Z1    X2    Y2    Z2  
X1_1  Y1_0  Z1_0  X2_0  Y2_0  Z2_0    1881
      Y1_3  Z1_0  X2_0  Y2_0  Z2_0    1477
            Z1_2  X2_0  Y2_0  Z2_0     782
            Z1_5  X2_0  Y2_0  Z2_0     295
            Z1_3  X2_0  Y2_0  Z2_0     160
                                      ... 
      Y1_0  Z1_0  X2_6  Y2_6  Z2_1       1
      Y1_3  Z1_7  X2_1  Y2_0  Z2_1       1
            Z1_5  X2_7  Y2_3  Z2_7       1
            Z1_2  X2_1  Y2_3  Z2_1       1
            Z1_3  X2_1  Y2_0  Z2_5       1
Length: 139, dtype: int64

In [62]:
# Frequency of each X1 cluster word within activity 1 (training split).
sensory_words_traindf_1['X1'].value_counts()

X1_5    1288
X1_0    1056
X1_3     773
X1_7     700
X1_2     575
X1_4     498
X1_6      14
Name: X1, dtype: int64

In [63]:
# Frequency of each Y1 cluster word within activity 1 (training split).
sensory_words_traindf_1['Y1'].value_counts()

Y1_1    1944
Y1_4    1779
Y1_5     526
Y1_6     463
Y1_2     109
Y1_7      83
Name: Y1, dtype: int64

In [65]:
# Frequency of each Z1 cluster word within activity 1 (training split).
sensory_words_traindf_1['Z1'].value_counts()

Z1_5    1460
Z1_4    1375
Z1_1     932
Z1_6     472
Z1_7     323
Z1_2     176
Z1_3     166
Name: Z1, dtype: int64

In [47]:
# One six-channel word table per activity id (1 through 6) of the test split.
(sensory_words_testdf_1, sensory_words_testdf_2, sensory_words_testdf_3,
 sensory_words_testdf_4, sensory_words_testdf_5, sensory_words_testdf_6) = [
    sensory_words_testdf.loc[(sensory_words_testdf['activityID'] == activity_id)][
        ['X1', 'Y1', 'Z1', 'X2', 'Y2', 'Z2']]
    for activity_id in (1, 2, 3, 4, 5, 6)
]

In [48]:
# Frequency of each X1 cluster word within activity 4 (test split).
sensory_words_testdf_4['X1'].value_counts()

X1_0    1664
X1_6     279
X1_5      21
Name: X1, dtype: int64

In [49]:
# Frequency of each X1 cluster word within activity 5 (test split).
sensory_words_testdf_5['X1'].value_counts()

X1_0    2128
Name: X1, dtype: int64

In [50]:
# Frequency of each X1 cluster word within activity 6 (test split).
sensory_words_testdf_6['X1'].value_counts()

X1_1    2148
Name: X1, dtype: int64

In [78]:
# Frequency of each word in the Z2 column of the training table.
# NOTE: this cell has a later execution count than the replace_leastidf_values()
# call defined further down, which overwrites Z2 entries with Z1 words. On a
# fresh top-to-bottom run the column would contain only Z2_* words here.
sensory_words_traindf['Z2'].value_counts()

Z1_5    5377
Z1_0    3958
Z1_2    3450
Z2_2    2879
Z1_4    2804
Z2_3    1845
Z2_1    1739
Z2_5    1711
Z2_6    1435
Z2_4    1365
Z2_7    1322
Z1_3     752
Z1_1     417
Z1_7     263
Z1_6      91
Name: Z2, dtype: int64

In [69]:
# Scratch check: list() over the Y2 value counts yields a plain Python list.
type(list(sensory_words_traindf['Y2'].value_counts()))

list

In [None]:
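# Note: Z2_0 -> _, Y2_0 -> _  (the most frequent Z2 and Y2 words are to be replaced; kept as a comment because this line is not valid Python)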

In [71]:
import collections

In [73]:
# Most frequent word of the Y2 and of the Z2 column in the training table
# (the most frequent word is the one with the lowest inverse document frequency).
Y2_least_idf, Z2_least_idf = (
    collections.Counter(sensory_words_traindf[channel].tolist()).most_common()[0][0]
    for channel in ('Y2', 'Z2')
)

print(f'{Y2_least_idf} -- {Z2_least_idf}')

Y2_0 -- Z2_0


In [76]:
def get_replacement_word(channel_values, replaceword):
    """Pick the word to keep for a ``(fallback, current)`` pair.

    Returns ``channel_values[0]`` when ``channel_values[1]`` equals
    ``replaceword``; otherwise ``channel_values[1]`` is returned unchanged.
    """
    fallback_word, current_word = channel_values[0], channel_values[1]
    return fallback_word if current_word == replaceword else current_word


def replace_leastidf_values():
    """Replace the most frequent Y2 and Z2 words with the row's Y1 / Z1 word.

    The most frequent word of each of the two columns is determined first.
    Every occurrence of it in ``sensory_words_traindf`` is then overwritten, in
    place, with the word of the paired column in the same row.
    """
    column_pairs = (('Y1', 'Y2'), ('Z1', 'Z2'))

    # Determine both dominant words before any column is rewritten.
    dominant_words = [
        collections.Counter(
            sensory_words_traindf[target_col].tolist()).most_common()[0][0]
        for _, target_col in column_pairs
    ]

    for (source_col, target_col), dominant_word in zip(column_pairs, dominant_words):
        sensory_words_traindf[target_col] = sensory_words_traindf[[source_col, target_col]].apply(
            lambda row: get_replacement_word(row.values, dominant_word), axis=1)

In [77]:
# Overwrites the Y2 and Z2 columns of sensory_words_traindf in place. Running
# this cell again recomputes the most frequent word from the already modified
# columns, so it is not idempotent.
replace_leastidf_values()