In [None]:
import pandas as pd
import numpy as np
import os 
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# Locate the project root: the nearest directory, starting at the current
# working directory and walking up through its parents, that contains
# data/processed.
here = Path.cwd()
root = next(
    (p for p in [here, *here.parents] if (p / "data" / "processed").exists()),
    None,
)
if root is None:
    # A bare next() on an exhausted generator raises StopIteration with no
    # message; fail with an explicit, descriptive error instead.
    raise FileNotFoundError(
        f"No 'data/processed' directory found in {here} or any of its parents."
    )

# Read the per-platform event tables (first row is the header).
processed_dir = root / "data" / "processed"
df_ios = pd.read_csv(processed_dir / "ios.csv", header=0)
df_android = pd.read_csv(processed_dir / "android.csv", header=0)

# Keep only the rows whose eventSensorDetectionMthd equals 1.
df_ios = df_ios[df_ios['eventSensorDetectionMthd'] == 1]
df_android = df_android[df_android['eventSensorDetectionMthd'] == 1]

In [None]:
# Show every column name of each platform's frame (iOS first, then Android).
for frame in (df_ios, df_android):
    print(frame.columns.tolist())

In [None]:
# Columns retained for the analysis; any other column is discarded below.
keep_cols = [
    # event-level fields
    "eventType",
    "eventSampleSpeed",
    "eventDurationSeconds",
    "eventMilesDriven",
    "eventSensorDetectionMthd",
    "eventGPSSignalStrength",
    "eventStartSpeed",
    "eventEndSpeed",
    # mems* fields
    "memsMedianHorizontalNorm",
    "memsP75HorizontalNorm",
    "memsP95HorizontalNorm",
    # gyro* fields and mems_radius
    "gyro_angular_change_per_second",
    "gyroAngleChange",
    "mems_radius",
]

# Restrict each frame to the retained columns it actually contains, in
# keep_cols order; names missing from a frame are skipped rather than raising.
ios_cols = [name for name in keep_cols if name in df_ios.columns]
android_cols = [name for name in keep_cols if name in df_android.columns]
df_ios = df_ios[ios_cols]
df_android = df_android[android_cols]

# Confirm which columns survived on each platform.
for frame in (df_ios, df_android):
    print(frame.columns.to_list())

In [None]:
# (rows, columns) of each platform's frame, iOS first.
for frame in (df_ios, df_android):
    print(frame.shape)

In [None]:
# Tag every row with its platform: is_ios is 1 for iOS rows, 0 for Android.
# assign() returns a new frame instead of writing a column into df_ios /
# df_android, which are themselves slices of the loaded frames; writing into
# such slices can raise pandas' SettingWithCopyWarning. Re-running this cell
# simply overwrites the column with the same value.
df_ios = df_ios.assign(is_ios=1)
df_android = df_android.assign(is_ios=0)

# Stack both platforms into one frame with a fresh 0..n-1 index.
df = pd.concat([df_ios, df_android], ignore_index=True)

# Per-platform summary statistics of memsMedianHorizontalNorm (cell output).
df.groupby("is_ios")["memsMedianHorizontalNorm"].describe()

In [None]:
# Per-platform series of memsMedianHorizontalNorm for the box plot.
ios_data = df_ios['memsMedianHorizontalNorm']
android_data = df_android['memsMedianHorizontalNorm']

# Horizontal box plots, one row per platform.
fig, ax = plt.subplots(figsize=(6, 4))
ax.boxplot(
    [ios_data, android_data],
    vert=False,
    labels=["iOS", "Android"],
    patch_artist=True,  # boxes are drawn as patches so they can be filled
    boxprops={"facecolor": "lightblue", "color": "blue"},
    medianprops={"color": "red"},
)
ax.set_title("Boxplot of memsMedianHorizontalNorm by Source")
ax.set_xlabel("memsMedianHorizontalNorm")
plt.show()

# Overlaid histograms from the combined frame; with common_norm=False the
# percentages are normalised within each is_ios group separately.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(
    data=df,
    x="memsMedianHorizontalNorm",
    hue="is_ios",
    stat="percent",
    common_norm=False,
    bins=30,
    edgecolor="black",
    ax=ax,
)
ax.set_title("Histogram of memsMedianHorizontalNorm by Source (Percent within Source)")
ax.set_xlabel("memsMedianHorizontalNorm")
ax.set_ylabel("Percent of Source")
plt.show()

In [None]:
# Keep only iOS rows with memsMedianHorizontalNorm below 7, printing the row
# count before and after. Rows where the value is missing fail the comparison
# and are dropped as well.
print(df_ios.shape[0])
df_ios = df_ios[df_ios['memsMedianHorizontalNorm'] < 7]
print(df_ios.shape[0])

# Rebuild the combined frame so that later cells reading `df` see the same
# filtered iOS rows as cells reading `df_ios`, instead of stale pre-filter data.
df = pd.concat([df_ios, df_android], ignore_index=True)

In [None]:
# Per-platform series of memsP75HorizontalNorm for the box plot.
ios_data = df_ios['memsP75HorizontalNorm']
android_data = df_android['memsP75HorizontalNorm']

# Horizontal box plots, one row per platform.
fig, ax = plt.subplots(figsize=(6, 4))
ax.boxplot(
    [ios_data, android_data],
    vert=False,
    labels=["iOS", "Android"],
    patch_artist=True,  # boxes are drawn as patches so they can be filled
    boxprops={"facecolor": "lightblue", "color": "blue"},
    medianprops={"color": "red"},
)
ax.set_title("Boxplot of memsP75HorizontalNorm by Source")
ax.set_xlabel("memsP75HorizontalNorm")
plt.show()

# Overlaid histograms read from the combined frame `df`; percentages are
# normalised within each is_ios group (common_norm=False).
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(
    data=df,
    x="memsP75HorizontalNorm",
    hue="is_ios",
    stat="percent",
    common_norm=False,
    bins=30,
    edgecolor="black",
    ax=ax,
)
ax.set_title("Histogram of memsP75HorizontalNorm by Source (Percent within Source)")
ax.set_xlabel("memsP75HorizontalNorm")
ax.set_ylabel("Percent of Source")
plt.show()

In [None]:
# Keep only iOS rows with memsP75HorizontalNorm below 8 (rows with a missing
# value fail the comparison and are dropped too).
df_ios = df_ios[df_ios['memsP75HorizontalNorm'] < 8]

# Rebuild the combined frame so that later cells reading `df` see the same
# filtered iOS rows as cells reading `df_ios`, instead of stale pre-filter data.
df = pd.concat([df_ios, df_android], ignore_index=True)

# Remaining iOS row count (cell output).
df_ios.shape[0]

In [None]:
# Per-platform series of memsP95HorizontalNorm for the box plot.
ios_data = df_ios['memsP95HorizontalNorm']
android_data = df_android['memsP95HorizontalNorm']

# Horizontal box plots, one row per platform.
fig, ax = plt.subplots(figsize=(6, 4))
ax.boxplot(
    [ios_data, android_data],
    vert=False,
    labels=["iOS", "Android"],
    patch_artist=True,  # boxes are drawn as patches so they can be filled
    boxprops={"facecolor": "lightblue", "color": "blue"},
    medianprops={"color": "red"},
)
ax.set_title("Boxplot of memsP95HorizontalNorm by Source")
ax.set_xlabel("memsP95HorizontalNorm")
plt.show()

# Overlaid histograms read from the combined frame `df`; percentages are
# normalised within each is_ios group (common_norm=False).
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(
    data=df,
    x="memsP95HorizontalNorm",
    hue="is_ios",
    stat="percent",
    common_norm=False,
    bins=30,
    edgecolor="black",
    ax=ax,
)
ax.set_title("Histogram of memsP95HorizontalNorm by Source (Percent within Source)")
ax.set_xlabel("memsP95HorizontalNorm")
ax.set_ylabel("Percent of Source")
plt.show()

In [None]:
# Split the combined frame by platform flag (1 = iOS, 0 = Android) and take
# the gyro_angular_change_per_second column of each part.
ios_data = df.loc[df['is_ios'] == 1, 'gyro_angular_change_per_second']
android_data = df.loc[df['is_ios'] == 0, 'gyro_angular_change_per_second']

# Horizontal box plots, one row per platform.
fig, ax = plt.subplots(figsize=(6, 4))
ax.boxplot(
    [ios_data, android_data],
    vert=False,
    labels=["iOS", "Android"],
    patch_artist=True,  # boxes are drawn as patches so they can be filled
    boxprops={"facecolor": "lightblue", "color": "blue"},
    medianprops={"color": "red"},
)
ax.set_title("Boxplot of gyro_angular_change_per_second by Source")
ax.set_xlabel("gyro_angular_change_per_second")
plt.show()

# Overlaid histograms from the same combined frame; percentages are
# normalised within each is_ios group (common_norm=False).
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(
    data=df,
    x="gyro_angular_change_per_second",
    hue="is_ios",
    stat="percent",
    common_norm=False,
    bins=30,
    edgecolor="black",
    ax=ax,
)
ax.set_title("Histogram of gyro_angular_change_per_second by Source (Percent within Source)")
ax.set_xlabel("gyro_angular_change_per_second")
ax.set_ylabel("Percent of Source")
plt.show()

In [None]:
# Keep only iOS rows with gyro_angular_change_per_second below 33, printing
# the row count before and after. Rows where the value is missing fail the
# comparison and are dropped as well.
print(df_ios.shape[0])
df_ios = df_ios[df_ios['gyro_angular_change_per_second'] < 33]
print(df_ios.shape[0])

# Rebuild the combined frame so that later cells reading `df` see the same
# filtered iOS rows as cells reading `df_ios`, instead of stale pre-filter data.
df = pd.concat([df_ios, df_android], ignore_index=True)

In [None]:
# Per-platform series of gyroAngleChange for the box plot.
ios_data = df_ios['gyroAngleChange']
android_data = df_android['gyroAngleChange']

# Horizontal box plots, one row per platform.
fig, ax = plt.subplots(figsize=(6, 4))
ax.boxplot(
    [ios_data, android_data],
    vert=False,
    labels=["iOS", "Android"],
    patch_artist=True,  # boxes are drawn as patches so they can be filled
    boxprops={"facecolor": "lightblue", "color": "blue"},
    medianprops={"color": "red"},
)
ax.set_title("Boxplot of gyroAngleChange by Source")
ax.set_xlabel("gyroAngleChange")
plt.show()

# Overlaid histograms read from the combined frame `df`; percentages are
# normalised within each is_ios group (common_norm=False).
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(
    data=df,
    x="gyroAngleChange",
    hue="is_ios",
    stat="percent",
    common_norm=False,
    bins=30,
    edgecolor="black",
    ax=ax,
)
ax.set_title("Histogram of gyroAngleChange by Source (Percent within Source)")
ax.set_xlabel("gyroAngleChange")
ax.set_ylabel("Percent of Source")
plt.show()

In [None]:
# Keep only iOS rows with gyroAngleChange below 270, printing the row count
# before and after. Rows where the value is missing fail the comparison and
# are dropped as well.
print(df_ios.shape[0])
df_ios = df_ios[df_ios['gyroAngleChange'] < 270]
print(df_ios.shape[0])

# Rebuild the combined frame so that later cells reading `df` see the same
# filtered iOS rows as cells reading `df_ios`, instead of stale pre-filter data.
df = pd.concat([df_ios, df_android], ignore_index=True)

In [None]:
# Split the combined frame by platform flag (1 = iOS, 0 = Android) and take
# the mems_radius column of each part.
ios_data = df.loc[df['is_ios'] == 1, 'mems_radius']
android_data = df.loc[df['is_ios'] == 0, 'mems_radius']

# Horizontal box plots, one row per platform.
fig, ax = plt.subplots(figsize=(6, 4))
ax.boxplot(
    [ios_data, android_data],
    vert=False,
    labels=["iOS", "Android"],
    patch_artist=True,  # boxes are drawn as patches so they can be filled
    boxprops={"facecolor": "lightblue", "color": "blue"},
    medianprops={"color": "red"},
)
ax.set_title("Boxplot of mems_radius by Source")
ax.set_xlabel("mems_radius")
plt.show()

# Overlaid histograms from the same combined frame; percentages are
# normalised within each is_ios group (common_norm=False).
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(
    data=df,
    x="mems_radius",
    hue="is_ios",
    stat="percent",
    common_norm=False,
    bins=30,
    edgecolor="black",
    ax=ax,
)
ax.set_title("Histogram of mems_radius by Source (Percent within Source)")
ax.set_xlabel("mems_radius")
ax.set_ylabel("Percent of Source")
plt.show()

In [None]:
# Summary statistics of mems_radius for the iOS frame (cell output).
df_ios.loc[:, "mems_radius"].describe()

In [None]:
# Summary statistics of mems_radius for the Android frame (cell output).
df_android.loc[:, "mems_radius"].describe()

In [None]:
# Show the rows of the combined frame whose mems_radius exceeds 1000.
df.loc[df["mems_radius"].gt(1000)]

In [None]:
# Show the rows of the combined frame where mems_radius is +inf or -inf.
is_infinite_radius = np.isinf(df["mems_radius"])
df.loc[is_infinite_radius]

In [None]:
# Preview of a 99th-percentile cut on mems_radius, computed separately per
# platform. Only copies are trimmed; df_ios and df_android stay unchanged.
ios_subset, android_subset = df_ios, df_android

Q_ios = ios_subset['mems_radius'].quantile(0.99)
Q_android = android_subset['mems_radius'].quantile(0.99)

# Rows at or below their platform's threshold.
filtered_ios = ios_subset.loc[ios_subset['mems_radius'] <= Q_ios].copy()
filtered_android = android_subset.loc[android_subset['mems_radius'] <= Q_android].copy()

ios_radius = filtered_ios['mems_radius']
android_radius = filtered_android['mems_radius']
new_df = pd.concat([filtered_ios, filtered_android], ignore_index=True)

# Box plot of the trimmed values, one row per platform.
fig, ax = plt.subplots(figsize=(6, 4))
ax.boxplot(
    [ios_radius, android_radius],
    vert=False,
    labels=["iOS", "Android"],
    patch_artist=True,  # boxes are drawn as patches so they can be filled
    boxprops={"facecolor": "lightblue", "color": "blue"},
    medianprops={"color": "red"},
)
ax.set_title("Boxplot of mems_radius by Source (Below 99th Percentile)")
ax.set_xlabel("mems_radius")
plt.show()

# Histogram of the trimmed values; percentages are normalised within each
# is_ios group (common_norm=False).
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(
    data=new_df,
    x="mems_radius",
    hue="is_ios",
    stat="percent",
    common_norm=False,
    bins=30,
    edgecolor="black",
    ax=ax,
)
ax.set_title("Histogram of mems_radius by Source (Below 99th Percentile)")
ax.set_xlabel("mems_radius")
ax.set_ylabel("Percent of Source")
plt.show()

# Row count of the combined frame `df` versus the trimmed preview frame.
print(f"Original size: {df.shape[0]}\nNew size: {new_df.shape[0]}\nDifference: {df.shape[0] - new_df.shape[0]}")


In [None]:
# Preview of a 95th-percentile cut on mems_radius, computed separately per
# platform. Only copies are trimmed; df_ios and df_android stay unchanged.
ios_subset = df_ios
android_subset = df_android

Q_ios = ios_subset['mems_radius'].quantile(0.95)
Q_android = android_subset['mems_radius'].quantile(0.95)

# Rows at or below their platform's threshold.
filtered_ios = ios_subset[ios_subset['mems_radius'] <= Q_ios].copy()
filtered_android = android_subset[android_subset['mems_radius'] <= Q_android].copy()

# ---------------------- BOX PLOT ----------------------
plt.figure(figsize=(6,4))

ios_radius = filtered_ios['mems_radius']
android_radius = filtered_android['mems_radius']
new_df = pd.concat([filtered_ios, filtered_android], ignore_index=True)

plt.boxplot(
    [ios_radius, android_radius],
    vert=False,
    labels=["iOS", "Android"],
    patch_artist=True,   # enables color fill
    boxprops=dict(facecolor="lightblue", color="blue"),
    medianprops=dict(color="red"),
)

# Title names the 95th percentile to match the quantile(0.95) cut above
# (it previously claimed the 99th).
plt.title("Boxplot of mems_radius by Source (Below 95th Percentile)")
plt.xlabel("mems_radius")
plt.show()

# ---------------------- HISTOGRAM ----------------------
# Percentages are normalised within each is_ios group (common_norm=False).
plt.figure(figsize=(8,5))
sns.histplot(
    data=new_df,
    x="mems_radius",
    hue="is_ios",
    stat="percent",
    common_norm=False,
    bins=30,
    edgecolor="black"
)

plt.title("Histogram of mems_radius by Source (Below 95th Percentile)")
plt.xlabel("mems_radius")
plt.ylabel("Percent of Source")
plt.show()

# Row count of the combined frame `df` versus the trimmed preview frame.
print(f"Original size: {df.shape[0]}\nNew size: {new_df.shape[0]}\nDifference: {df.shape[0] - new_df.shape[0]}")


In [None]:
# Apply the 99th-percentile mems_radius cut to the working frames, using a
# separate threshold for each platform.
Q_ios = df_ios['mems_radius'].quantile(0.99)
Q_android = df_android['mems_radius'].quantile(0.99)

df_ios = df_ios.loc[df_ios['mems_radius'] <= Q_ios]
df_android = df_android.loc[df_android['mems_radius'] <= Q_android]

# Rebuild the combined frame from the trimmed platform frames.
df = pd.concat([df_ios, df_android], ignore_index=True)

# Shapes after the cut: iOS, Android, then combined.
for frame in (df_ios, df_android, df):
    print(frame.shape)


In [None]:
# Pairwise correlations between the float64 / int64 columns of the combined
# frame; columns of any other dtype are excluded.
numeric_df = df.select_dtypes(include=['float64', 'int64'])
corr_matrix = numeric_df.corr()

# Annotated heatmap of the correlation matrix.
ax = sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
ax.set_title("Correlation Matrix of Numeric Features")
plt.show()

In [None]:
# Remove three columns from the combined frame. Reassigning the result
# (instead of inplace=True) together with errors="ignore" keeps this cell
# idempotent: re-running it after the columns are already gone no longer
# raises KeyError.
df = df.drop(
    columns=["memsP75HorizontalNorm", "memsP95HorizontalNorm", "eventSensorDetectionMthd"],
    errors="ignore",
)

In [None]:
# Recompute the correlations on the current combined frame, again limited to
# its float64 / int64 columns.
numeric_df = df.select_dtypes(include=['float64', 'int64'])
corr_matrix = numeric_df.corr()

# Annotated heatmap of the recomputed correlation matrix.
heatmap_ax = sns.heatmap(data=corr_matrix, annot=True, cmap="coolwarm")
heatmap_ax.set_title("Correlation Matrix of Numeric Features")
plt.show()

In [None]:
# Write the cleaned, combined frame to data/processed/merged/state1_clean.csv
# without the index column. The output directory is created first because
# to_csv does not create missing parent directories.
merged_dir = root / "data" / "processed" / "merged"
merged_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(merged_dir / "state1_clean.csv", index=False)