In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Column labels for the header-less access log, in field order.
colnames = [
    'date',
    'timestamp',
    'request_method',
    'user_id',
    'cohort',
    'ip',
]

# Load the raw log into a DataFrame.
# - `sep` is a regex: split on a whitespace character only when it is followed
#   by an even number of double quotes (i.e. it is not inside a quoted field)
#   and is not inside a [...] bracketed field.
# - A regex separator needs the Python parsing engine.
# - The literal token "-" (dash wrapped in double quotes) is read as missing.
# - index_col=False keeps the first field as a regular column, not the index.
df = pd.read_csv(
    'curriculum-access.txt',
    sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
    names=colnames,
    header=None,
    index_col=False,
    na_values='"-"',
    engine='python',
)

In [3]:
# Preview the first rows to confirm the fields were split into the expected columns.
df.head()

Unnamed: 0,date,timestamp,request_method,user_id,cohort,ip
0,2018-01-26,09:55:03,/,1,8.0,98.106.20.62
1,2018-01-26,09:56:02,java-ii,1,8.0,98.106.20.62
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,98.106.20.62
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,98.106.20.62
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,98.106.20.62


In [4]:
# Count missing values per column.
df.isna().sum()

date                  0
timestamp             0
request_method        1
user_id               0
cohort            27856
ip                    0
dtype: int64

In [5]:
# Join the date and time-of-day strings into one "date time" string column.
df['date_time'] = df['date'] + ' ' + df['timestamp']

In [6]:
# The separate date / timestamp columns are now redundant with date_time.
df = df.drop(columns=['date', 'timestamp'])

In [7]:
# Parse the combined string column into pandas datetimes.
df['date_time'] = pd.to_datetime(df['date_time'])

In [8]:
# Use the access time as the row index.
df = df.set_index('date_time')

In [9]:
# Preview the frame after re-indexing on date_time.
df.head()

Unnamed: 0_level_0,request_method,user_id,cohort,ip
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-26 09:55:03,/,1,8.0,98.106.20.62
2018-01-26 09:56:02,java-ii,1,8.0,98.106.20.62
2018-01-26 09:56:05,java-ii/object-oriented-programming,1,8.0,98.106.20.62
2018-01-26 09:56:06,slides/object_oriented_programming,1,8.0,98.106.20.62
2018-01-26 09:56:24,javascript-i/conditionals,2,22.0,98.106.20.62


In [10]:
# Show the rows whose request_method is missing.
# isna() already yields a boolean mask, so it is used directly as the row filter
# (comparing it to True is redundant).
df[df['request_method'].isna()]

Unnamed: 0_level_0,request_method,user_id,cohort,ip
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-04-08 09:25:18,,586,55.0,73.178.241.52


In [11]:
# Frequency of each request made by user 586 (the user with the missing value).
df.loc[df['user_id'] == 586, 'request_method'].value_counts()

/                                                44
6-regression/1-overview                          33
3-sql/1-mysql-overview                           21
appendix/cli-git-overview                        19
appendix/git                                     15
                                                 ..
2-storytelling/3-tableau                          1
8-clustering/5-model                              1
2-storytelling/misleading3_deaths.jpg             1
4-python/7.2-intro-to-matplotlib                  1
5-stats/4.5-more-statistical-testing-examples     1
Name: request_method, Length: 131, dtype: int64

In [12]:
# Number of distinct IP addresses user 586 has connected from.
df.loc[df['user_id'] == 586, 'ip'].nunique()

32

In [13]:
# Earliest access by user 586; date_time is the index, so read it from there.
df.loc[df['user_id'] == 586].index.min()

Timestamp('2020-02-03 15:42:12')

In [14]:
# Latest access by user 586; date_time is the index, so read it from there.
df.loc[df['user_id'] == 586].index.max()

Timestamp('2020-05-15 13:57:04')

# Filling in Nulls

In [15]:
# Most frequent request for user 586, the user who owns the single row with a
# missing request_method. (The id was previously typed as 568, which inspected
# a different user than the one being imputed.)
df.loc[df['user_id'] == 586, 'request_method'].mode()

0    /
dtype: object

In [16]:
# Replace missing request_method values with '/', the most frequent request
# in user 586's value counts.
df['request_method'] = df['request_method'].fillna('/')

In [17]:
# Re-count missing values per column after the fill.
df.isna().sum()

request_method        0
user_id               0
cohort            27856
ip                    0
dtype: int64