# 2. Data Preprocessing

In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 2.1. Columns Variability 

In [68]:
# Load the four per-game statistics tables (movement/traffic x fast/slow).
mov_fast, mov_slow, traffic_fast, traffic_slow = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/movement_fast_stat.csv',
        '../data/processed/movement_slow_stat.csv',
        '../data/processed/traffic_fast_stat.csv',
        '../data/processed/traffic_slow_stat.csv',
    )
)

In [69]:
# Statistics computed over a stray "Unnamed: 0" column, present in both fast tables.
# NOTE(review): presumably a saved row index that was aggregated upstream - confirm.
UNNAMED_INDEX_STAT_COLUMNS = [
    "Unnamed: 0_mean", "Unnamed: 0_std", "Unnamed: 0_25%", "Unnamed: 0_50%",
    "Unnamed: 0_75%", "Unnamed: 0_max", "Unnamed: 0_min",
]

# errors="ignore" keeps this cell re-runnable: once the columns are gone a
# second execution is a no-op instead of raising KeyError.
mov_fast = mov_fast.drop(columns=UNNAMED_INDEX_STAT_COLUMNS, errors="ignore")
traffic_fast = traffic_fast.drop(columns=UNNAMED_INDEX_STAT_COLUMNS, errors="ignore")

In [70]:
def find_non_varying_variables(df):
    """
    List the columns of a dataframe that hold exactly one distinct value.

    Distinct values are counted with ``Series.nunique()``, which ignores
    missing values by default: a column made of one value plus NaNs is
    reported as constant, while an all-NaN column is not.

    df: dataframe whose columns are inspected

    Returns a dataframe with one row per constant column, where 'Variable'
    is the column name and 'Variability Percentage' is the share of distinct
    values among all rows (1 / number of rows * 100). A dataframe without
    rows yields an empty result.
    """
    non_varying_columns = [
        column for column in df.columns if df[column].nunique() == 1
    ]

    # The percentage is derived only for constant columns. Such a column needs
    # at least one row, so a zero-row dataframe never reaches the division.
    variability_percentage = [1 / len(df) * 100 for _ in non_varying_columns]

    result_df = pd.DataFrame({'Variable': non_varying_columns, 'Variability Percentage': variability_percentage})
    return result_df

In [71]:
# Display the columns of mov_fast that hold a single distinct value.
find_non_varying_variables(mov_fast)

Unnamed: 0,Variable,Variability Percentage
0,RemoteButtons_mean,0.02791
1,RemoteButtons_std,0.02791
2,RemoteButtons_min,0.02791
3,RemoteButtons_25%,0.02791
4,RemoteButtons_50%,0.02791
...,...,...
117,Sensor2OrientationZ_min,0.02791
118,Sensor2OrientationZ_25%,0.02791
119,Sensor2OrientationZ_50%,0.02791
120,Sensor2OrientationZ_75%,0.02791


In [72]:
# Display the columns of mov_slow that hold a single distinct value.
find_non_varying_variables(mov_slow)

Unnamed: 0,Variable,Variability Percentage
0,RemoteButtons_mean,0.027801
1,RemoteButtons_std,0.027801
2,RemoteButtons_min,0.027801
3,RemoteButtons_25%,0.027801
4,RemoteButtons_50%,0.027801
...,...,...
116,Sensor2OrientationZ_min,0.027801
117,Sensor2OrientationZ_25%,0.027801
118,Sensor2OrientationZ_50%,0.027801
119,Sensor2OrientationZ_75%,0.027801


In [73]:
# Display the columns of traffic_fast that hold a single distinct value.
find_non_varying_variables(traffic_fast)

Unnamed: 0,Variable,Variability Percentage
0,size_min,0.02791
1,size_25%,0.02791
2,size_75%,0.02791


In [74]:
# Display the columns of traffic_slow that hold a single distinct value.
find_non_varying_variables(traffic_slow)

Unnamed: 0,Variable,Variability Percentage
0,size_min,0.027778
1,size_25%,0.027778
2,size_50%,0.027778
3,size_75%,0.027778


For the movement data, the statistics derived from sensors 0, 1, and 2, the touch button, and the remote buttons are constant, so we drop them from the dataset to focus only on meaningful variables that can help us make better predictions. For the traffic data, the only constant features are those related to packet size. It is important to first understand why these features are constant across all participants/games.

In [75]:
# Remove, in place, every single-valued column from each of the four tables.
for frame in (mov_fast, mov_slow, traffic_fast, traffic_slow):
    constant_columns = find_non_varying_variables(frame)['Variable']
    frame.drop(columns=constant_columns, inplace=True)

## 2.2. Missing Values

In [76]:
def missing_columns(dataframe):
    """
    Summarise the columns of a dataframe that contain missing values.

    dataframe: dataframe whose columns are checked for missing values

    Returns a dataframe indexed by column name and sorted by descending
    missing count, with three columns: 'Missing Count', 'Missing Count Ratio'
    (count / number of rows) and 'Missing Count %' (rounded to one decimal).
    Columns without any missing value are left out.
    """
    n_rows = len(dataframe)

    # number of missing values per column, largest first
    missing_count = dataframe.isnull().sum().sort_values(ascending=False)

    # share of missing values relative to the number of rows
    missing_ratio = missing_count / n_rows
    missing_pct = (100 * missing_count / n_rows).round(1)

    summary = pd.concat([missing_count, missing_ratio, missing_pct], axis=1)
    summary.columns = ['Missing Count', 'Missing Count Ratio', 'Missing Count %']

    # Filter on the count, not the ratio: for a zero-row dataframe the ratio is
    # NaN (0 / 0) and `NaN != 0` would wrongly keep every column.
    return summary[summary['Missing Count'] != 0]

In [77]:
# Display the columns of mov_fast that still contain missing values.
missing_columns(mov_fast)

Unnamed: 0,Missing Count,Missing Count Ratio,Missing Count %


In [78]:
# Display the columns of mov_slow that still contain missing values.
missing_columns(mov_slow)

Unnamed: 0,Missing Count,Missing Count Ratio,Missing Count %


In [79]:
# Display the columns of traffic_fast that still contain missing values.
missing_columns(traffic_fast)

Unnamed: 0,Missing Count,Missing Count Ratio,Missing Count %


In [80]:
# Display the columns of traffic_slow that still contain missing values.
missing_columns(traffic_slow)

Unnamed: 0,Missing Count,Missing Count Ratio,Missing Count %


In [81]:
# Trim leading/trailing whitespace from the column labels of every table.
for frame in (mov_fast, mov_slow, traffic_fast, traffic_slow):
    frame.columns = frame.columns.str.strip()

In [82]:
# Write each cleaned table to the processed-data directory.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default - confirm downstream readers expect that extra column.
cleaned_outputs = (
    (mov_fast, '../data/processed/movement_fast_stat_cleaned.csv'),
    (mov_slow, '../data/processed/movement_slow_stat_cleaned.csv'),
    (traffic_fast, '../data/processed/traffic_fast_stat_cleaned.csv'),
    (traffic_slow, '../data/processed/traffic_slow_stat_cleaned.csv'),
)
for frame, output_path in cleaned_outputs:
    frame.to_csv(output_path)