1. Load the dataset into memory
1. Inspect and clean the data set. Questions to investigate include:
    - Are there any duplicate rows?
    - Are there any rows missing?
    - Are values missing?
    - Are there any invalid values such as a negative time?
1. Filter out events that occurred before 2020-01-01 00:00:00.
1. Group the events by user and event.
1. For each group, calculate:
    - the number of events that occurred
    - the average time difference between consecutive events.
1. Store the result in a new CSV file with the following format:
    ```
    user, event, number_of_events, average_time_difference
    ```

In [6]:
%%writefile sample.csv
user,event,datetime
11,Corporate events,2019-03-01 14:54:31
88,Team building event,2020-05-14 22:22:47
187,Conference,2020-08-13 21:16:20
123,Corporate events,2021-11-02 15:12:48
101,Networking event,2019-10-24 23:18:32
187,Conference,2020-08-13 21:16:20
147,Team building event,2019-01-26 18:35:34
79,Trade show,2021-06-11 18:54:42
95,Networking event,2020-06-25 12:11:10
192,Trade show,2017-07-16 18:30:05
79,Trade show,2021-06-11 18:54:42
156,Workshop,2021-01-01 07:53:11

Overwriting sample.csv


In [16]:
import pandas as pd
import numpy as np

In [7]:
df = pd.read_csv('sample.csv', parse_dates=['datetime'])
df

Unnamed: 0,user,event,datetime
0,11,Corporate events,2019-03-01 14:54:31
1,88,Team building event,2020-05-14 22:22:47
2,187,Conference,2020-08-13 21:16:20
3,123,Corporate events,2021-11-02 15:12:48
4,101,Networking event,2019-10-24 23:18:32
5,187,Conference,2020-08-13 21:16:20
6,147,Team building event,2019-01-26 18:35:34
7,79,Trade show,2021-06-11 18:54:42
8,95,Networking event,2020-06-25 12:11:10
9,192,Trade show,2017-07-16 18:30:05


In [8]:
df.info()   ## To inspect table structure and data type

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   user      12 non-null     int64         
 1   event     12 non-null     object        
 2   datetime  12 non-null     datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 416.0+ bytes


In [9]:
## Are the any duplicate rows?

duplicate = df[df.duplicated()]

print("Duplicate Rows :")
 
# Print the resultant Dataframe
duplicate

Duplicate Rows :


Unnamed: 0,user,event,datetime
5,187,Conference,2020-08-13 21:16:20
10,79,Trade show,2021-06-11 18:54:42


In [10]:
df.dtypes

user                 int64
event               object
datetime    datetime64[ns]
dtype: object

In [13]:
df.isnull().any()

##Missing values are present in the event column

user        False
event       False
datetime    False
dtype: bool

In [14]:
print(" \nshow the boolean Dataframe : \n\n", df.isnull())
  
# Count total NaN in a DataFrame
print(" \nCount total NaN in a DataFrame : \n\n",
       df.isnull().sum().sum())

 
show the boolean Dataframe : 

      user  event  datetime
0   False  False     False
1   False  False     False
2   False  False     False
3   False  False     False
4   False  False     False
5   False  False     False
6   False  False     False
7   False  False     False
8   False  False     False
9   False  False     False
10  False  False     False
11  False  False     False
 
Count total NaN in a DataFrame : 

 0


In [17]:
## Are there any invalid values such as a negative time?
## How can I check for negative values in pandas dataframe which contains different data types?

s = df.select_dtypes(np.number).lt(0).any()\
      .reindex(df.columns, fill_value=False)\
      .rename_axis("col").reset_index(name='isnegative')

print(s)

        col  isnegative
0      user       False
1     event       False
2  datetime       False


In [18]:
## Replace bad charater...

df.replace(to_replace=r'^ba.$', value='new',regex=True)

Unnamed: 0,user,event,datetime
0,11,Corporate events,2019-03-01 14:54:31
1,88,Team building event,2020-05-14 22:22:47
2,187,Conference,2020-08-13 21:16:20
3,123,Corporate events,2021-11-02 15:12:48
4,101,Networking event,2019-10-24 23:18:32
5,187,Conference,2020-08-13 21:16:20
6,147,Team building event,2019-01-26 18:35:34
7,79,Trade show,2021-06-11 18:54:42
8,95,Networking event,2020-06-25 12:11:10
9,192,Trade show,2017-07-16 18:30:05


In [19]:
##Check for NaN Values in datetime column

df['datetime'].isnull().values.any()

False

In [20]:
# Check for invalid values
invalid_mask = df.applymap(lambda x: isinstance(x, str) and not x.isalnum())
print (invalid_mask)

     user  event  datetime
0   False   True     False
1   False   True     False
2   False  False     False
3   False   True     False
4   False   True     False
5   False  False     False
6   False   True     False
7   False   True     False
8   False   True     False
9   False   True     False
10  False   True     False
11  False  False     False


In [21]:
# Check for missing values
mask = df.isna()
print (mask)

     user  event  datetime
0   False  False     False
1   False  False     False
2   False  False     False
3   False  False     False
4   False  False     False
5   False  False     False
6   False  False     False
7   False  False     False
8   False  False     False
9   False  False     False
10  False  False     False
11  False  False     False


In [22]:
## Combine the two masks
bad_values_mask = mask | invalid_mask
print (bad_values_mask)

     user  event  datetime
0   False   True     False
1   False   True     False
2   False  False     False
3   False   True     False
4   False   True     False
5   False  False     False
6   False   True     False
7   False   True     False
8   False   True     False
9   False   True     False
10  False   True     False
11  False  False     False


In [23]:
##Filter out events that occurred before 2020-01-01 00:00:00.
df[df['datetime']<'2020']

Unnamed: 0,user,event,datetime
0,11,Corporate events,2019-03-01 14:54:31
4,101,Networking event,2019-10-24 23:18:32
6,147,Team building event,2019-01-26 18:35:34
9,192,Trade show,2017-07-16 18:30:05


In [24]:
#Group the events by user and event.
event_grp=df.groupby(['user','event'])
print (event_grp)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f984e5b3850>


In [25]:
##the number of events that occurred
df.groupby('user')['event'].value_counts()

user  event              
11    Corporate events       1
79    Trade show             2
88    Team building event    1
95    Networking event       1
101   Networking event       1
123   Corporate events       1
147   Team building event    1
156   Workshop               1
187   Conference             2
192   Trade show             1
Name: count, dtype: int64

In [27]:
#changing the datatype of datetime column
df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')

df.dropna(inplace=True)
df

Unnamed: 0,user,event,datetime
0,11,Corporate events,2019-03-01 14:54:31
1,88,Team building event,2020-05-14 22:22:47
2,187,Conference,2020-08-13 21:16:20
3,123,Corporate events,2021-11-02 15:12:48
4,101,Networking event,2019-10-24 23:18:32
5,187,Conference,2020-08-13 21:16:20
6,147,Team building event,2019-01-26 18:35:34
7,79,Trade show,2021-06-11 18:54:42
8,95,Networking event,2020-06-25 12:11:10
9,192,Trade show,2017-07-16 18:30:05


In [28]:
#sort by user and date
df.sort_values(by=['user', 'datetime'], inplace=True)
df

Unnamed: 0,user,event,datetime
0,11,Corporate events,2019-03-01 14:54:31
7,79,Trade show,2021-06-11 18:54:42
10,79,Trade show,2021-06-11 18:54:42
1,88,Team building event,2020-05-14 22:22:47
8,95,Networking event,2020-06-25 12:11:10
4,101,Networking event,2019-10-24 23:18:32
3,123,Corporate events,2021-11-02 15:12:48
6,147,Team building event,2019-01-26 18:35:34
11,156,Workshop,2021-01-01 07:53:11
2,187,Conference,2020-08-13 21:16:20


In [29]:
users = list(df['user'].unique())

avg_time_diff = []

for user in users:
    user_df = df[df['user'] == user]
    mean_time = user_df['datetime'].diff().dt.total_seconds().mean()
    avg_time_diff.append(mean_time)
    

avg_time = pd.DataFrame({'user':users, 'avg_time_bet_con_event':avg_time_diff})

avg_time

Unnamed: 0,user,avg_time_bet_con_event
0,11,
1,79,0.0
2,88,
3,95,
4,101,
5,123,
6,147,
7,156,
8,187,0.0
9,192,
