In [None]:
#Imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from data_exploration import *

In [None]:
# Raise pandas' display limits so wide/long frames are shown with less truncation
for option_name, limit in (('display.max_rows', 200), ('display.max_columns', 100)):
    pd.set_option(option_name, limit)

In [None]:
# Plotting format: styles are applied left to right, so the darkgrid settings
# are layered on top of the bright ones.
plt.style.use(['seaborn-v0_8-bright', 'seaborn-v0_8-darkgrid'])

In [None]:

# Load the raw dataset; the path is resolved relative to the current working directory
df = pd.read_csv(filepath_or_buffer='dataset_mood_smartphone.csv')

## Sorting by ID and time
---

In [None]:
# Order rows by id, then by time within each id (ascending is the default for both keys).
# NOTE(review): 'time' is still the raw CSV column here (not yet parsed to datetime),
# so this presumably sorts timestamp strings — confirm they sort chronologically.
df = df.sort_values(["id", "time"])

# Preview the first 200 rows
df.iloc[:200]

## Date & Time split
---

* Separate the date and time into separate columns

In [None]:
# Rename the raw 'time' column to 'date_time'.
# The renamed frame is assigned back to `df`; an in-place rename returns None,
# so its result must not be stored in a column. The guard keeps the cell safe to
# re-run: once 'time' has been rebuilt below, renaming it again would produce a
# second 'date_time' column and break the datetime conversion.
if 'date_time' not in df.columns:
    df = df.rename(columns={'time': 'date_time'})

# Convert column to a datetime format
df['date_time'] = pd.to_datetime(df['date_time'])

# Create a new column called 'time' holding only the time-of-day part
df['time'] = df['date_time'].dt.time

# Create a new column called 'date' holding only the calendar-date part
df['date'] = df['date_time'].dt.date

In [None]:
# Inspect the first 50,000 rows (the rendered output is still limited by the pandas display options)
df.iloc[:50000]

* Identify strange time values

In [None]:
# Work with the time column as strings for now so it can be checked against a regex
df['time'] = df['time'].astype(str)

# Standard time format: exactly HH:MM:SS
standard_time_format = r'^\d{2}:\d{2}:\d{2}$'

# Boolean Series: True where the time string does NOT match the standard format
boolean_mask = ~df['time'].str.match(standard_time_format)

# Show only the time values that deviate from the standard format
df.loc[boolean_mask, 'time']

## Separation into columns
---

### Pivot

In [None]:
# Plain pivot (no aggregation): every distinct entry of 'variable' becomes its
# own column holding the matching 'value', keyed by the 'Unnamed: 0' row id.
pivot = (
    df.pivot(index='Unnamed: 0', columns='variable', values='value')
      .reset_index()
)

# Attach the per-variable columns to the original rows via the shared row id
df_pivot = df.merge(pivot, on='Unnamed: 0', how='left')

df_pivot

## Remove unnecessary columns
---

In [None]:
# List every column of the merged frame to decide which ones to keep
df_pivot.columns

* ### Remove unnecessary columns:
    * Remove "appCat.utilities"
    * Remove "appCat.unknown"
    * Remove "appCat.builtin" ~ Apps like camera, calendar etc.
    * Remove "appCat.other"
    * Remove "value" ~ Since its contents have now been added under their relevant columns
    * Remove "variable" ~ Since we have now created columns from this attribute

In [None]:
# Columns kept for analysis; 'appCat.utilities', 'appCat.unknown',
# 'appCat.builtin', 'appCat.other', 'value' and 'variable' are left out.
columns_to_keep = [
    'Unnamed: 0', 'id', 'date_time', 'mood', 'time', 'date',
    'activity', 'appCat.communication',
    'appCat.entertainment', 'appCat.finance', 'appCat.game',
    'appCat.office', 'appCat.social', 'appCat.travel',
    'appCat.weather', 'call',
    'circumplex.arousal', 'circumplex.valence', 'screen', 'sms',
]

# .copy() makes the subset an independent frame, so later cells that modify it
# do not trigger pandas' SettingWithCopyWarning.
df_pivot_subset = df_pivot[columns_to_keep].copy()

# Sort by id and then timestamp: sorting on 'id' alone uses a non-stable
# algorithm by default, which can scramble the chronological order within an id.
df_pivot_subset_sorted = df_pivot_subset.sort_values(by=["id", "date_time"], ascending=True)
df_pivot_subset_sorted

# Analysis
---

* ### Data types

In [None]:
# Data type of each retained column
df_pivot_subset.dtypes

* ### Description

In [None]:
# Summary statistics produced by pandas' describe() for the subset
df_pivot_subset.describe()

* ### Null values

In [None]:
# Number of missing entries per column
df_pivot_subset.isnull().sum()

* ### Distribution 

In [None]:
# One 30-bin histogram per plottable column, drawn on a 15x15 inch figure
df_pivot_subset.hist(figsize=(15, 15), bins=30)

* ### Outlier detection

In [None]:

# Columns to inspect for outliers, listed in row-major order of the 3x3 grid
scatter_columns = [
    'appCat.communication', 'appCat.entertainment', 'appCat.finance',  # Row 1
    'appCat.game', 'appCat.office', 'appCat.social',                   # Row 2
    'appCat.travel', 'appCat.weather', 'screen',                       # Row 3
]

fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(12, 8))

# axes.flat walks the grid row by row, pairing each panel with one column.
# Every panel plots that column's values against the id column.
for ax, column in zip(axes.flat, scatter_columns):
    ax.scatter(df_pivot_subset['id'], df_pivot_subset[column])
    ax.set_xlabel('id', fontweight='bold')
    ax.set_ylabel(column, fontweight='bold')

plt.tight_layout()

plt.show()

## Outlier removal
---

Paper: https://www.sciencedirect.com/science/article/pii/S1574013720304068

* ### Remove using IQR

In [None]:
# Apply the IQR-based outlier removal helper (star-imported from data_exploration).
# NOTE(review): the return value is not assigned. If remove_outliers returns a new
# frame instead of modifying df_pivot_subset in place, the cells below still use
# the unfiltered data — confirm against the helper's implementation.
# NOTE(review): `range=3` is presumably the IQR multiplier — confirm.
remove_outliers(df_pivot_subset, range= 3)

## Time series analysis

In [None]:

# Take every column from position 3 onward and draw each in its own subplot
series_columns = df_pivot_subset.iloc[:, 3:]
series_columns.plot(subplots=True, figsize=(12, 18))

# Fit the subplots into the figure area, then render the figure
plt.tight_layout()
plt.show()

In [None]:
# Final dataset handed over for the next stage of the project (note addressed to Shreya)
# --------------------------------------------
df_pivot_subset