# dates and strings - what you need to know 

In [1]:
import pandas as pd

In [2]:
import numpy as np

### to work with dates, we need a data set that contains dates

In [3]:
file = pd.read_csv('web_data.csv')
file.dtypes

web_id           int64
client_id        int64
visitor_id      object
visit_id        object
process_step    object
date_time       object
dtype: object

In [4]:
#use head to review the file 
file.head()

Unnamed: 0,web_id,client_id,visitor_id,visit_id,process_step,date_time
0,0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,4/17/17 15:27
1,1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,4/17/17 15:26
2,2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,4/17/17 15:19
3,3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,4/17/17 15:19
4,4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,4/17/17 15:18


In [5]:
#convert the date_time column to be read as a datetime data type 
file['date_time'] = pd.to_datetime(file['date_time'], errors='coerce')

In [6]:
#use head() to review the first 12 rows of the file
file.head(12)

Unnamed: 0,web_id,client_id,visitor_id,visit_id,process_step,date_time
0,0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:00
1,1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:00
2,2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:00
3,3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:00
4,4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:00
5,5,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:17:00
6,6,9988021,580560515_7732621733,781255054_21935453173_531117,step_1,2017-04-17 15:17:00
7,7,9988021,580560515_7732621733,781255054_21935453173_531117,start,2017-04-17 15:16:00
8,8,8320017,39393514_33118319366,960651974_70596002104_312201,confirm,2017-04-05 13:10:00
9,9,8320017,39393514_33118319366,960651974_70596002104_312201,step_3,2017-04-05 13:09:00


In [8]:
#use dtypes to see the new date time data type 
file.dtypes

web_id                   int64
client_id                int64
visitor_id              object
visit_id                object
process_step            object
date_time       datetime64[ns]
dtype: object

In [10]:
#use the day month year and index to check the month and day of the first row eg file['date_time'][0].year

file['date_time'][18].month


5

In [20]:
#pull out the time 
file['date_time'][18].time()

datetime.time(12, 24)

In [23]:
#reformat the date - example 1
file['date_time'][18].strftime(format='%d-%m-%Y')

'05-04-2017'

In [24]:
#reformat the date - example 2
file['date_time'][0].strftime(format="%A %d. %B %Y")

'Monday 17. April 2017'

In [25]:
#create a filtered data frame by a particular date 
Apr_5 = file[file['date_time'] == '2017-04-05']

In [26]:
Apr_5.head()

Unnamed: 0,web_id,client_id,visitor_id,visit_id,process_step,date_time
15403,15403,9088919,594492611_13177416992,373345047_76916836421_363244,confirm,2017-04-05
97905,97905,5171878,88910480_73022518291,390443893_82608504306_269,start,2017-04-05
97906,97906,5171878,88910480_73022518291,390443893_82608504306_269,start,2017-04-05
292632,292632,1839329,674533465_6091016367,167956133_23637791586_417756,confirm,2017-04-05
296638,296638,8367317,541357206_53611607620,445730306_83279848654_782988,step_3,2017-04-05


In [27]:
#add a month column 
file['Month'] = pd.DatetimeIndex(file['date_time']).month

In [28]:
#add a day column 
file['Day'] = pd.DatetimeIndex(file['date_time']).day

In [29]:
#Review the data frame with head() 
file.head(12)

Unnamed: 0,web_id,client_id,visitor_id,visit_id,process_step,date_time,Month,Day
0,0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:00,4,17
1,1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:00,4,17
2,2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:00,4,17
3,3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:00,4,17
4,4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:00,4,17
5,5,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:17:00,4,17
6,6,9988021,580560515_7732621733,781255054_21935453173_531117,step_1,2017-04-17 15:17:00,4,17
7,7,9988021,580560515_7732621733,781255054_21935453173_531117,start,2017-04-17 15:16:00,4,17
8,8,8320017,39393514_33118319366,960651974_70596002104_312201,confirm,2017-04-05 13:10:00,4,5
9,9,8320017,39393514_33118319366,960651974_70596002104_312201,step_3,2017-04-05 13:09:00,4,5


In [37]:
file['Month'].unique()

array([4, 3])

In [34]:
#filter by a particular day 
file.query('Day==10')

Unnamed: 0,web_id,client_id,visitor_id,visit_id,process_step,date_time,Month,Day
67,67,2519971,121616091_57089151674,428599802_30402877972_724989,confirm,2017-04-10 16:21:00,4,10
68,68,2519971,121616091_57089151674,428599802_30402877972_724989,confirm,2017-04-10 16:19:00,4,10
635,635,7276973,627003452_19249388085,117895119_16501051592_422436,step_3,2017-04-10 19:00:00,4,10
636,636,7276973,627003452_19249388085,117895119_16501051592_422436,step_2,2017-04-10 18:58:00,4,10
637,637,7276973,627003452_19249388085,117895119_16501051592_422436,step_1,2017-04-10 18:57:00,4,10
...,...,...,...,...,...,...,...,...
342699,342699,6218213,46682116_29129401247,109794854_38827776110_737663,step_2,2017-04-10 13:57:00,4,10
342700,342700,6218213,46682116_29129401247,109794854_38827776110_737663,step_1,2017-04-10 13:57:00,4,10
342701,342701,6218213,46682116_29129401247,109794854_38827776110_737663,start,2017-04-10 13:57:00,4,10
342702,342702,5761946,430669178_36342190495,57127250_50441598218_373661,step_1,2017-04-10 13:17:00,4,10


In [None]:
#challenge - filter the df for saturdays





In [38]:
file.loc[file['date_time'].dt.weekday == 5]

Unnamed: 0,web_id,client_id,visitor_id,visit_id,process_step,date_time,Month,Day
354,354,2709011,664541274_97144910167,235162568_43115336410_822881,start,2017-04-08 12:33:00,4,8
355,355,9900469,210215964_20090640456,3461145_4721576513_877614,start,2017-04-01 19:02:00,4,1
357,357,6229916,483835308_26502657270,230630559_77569449276_549196,confirm,2017-04-01 16:29:00,4,1
358,358,6229916,483835308_26502657270,230630559_77569449276_549196,confirm,2017-04-01 16:27:00,4,1
366,366,4773291,587236617_20429853618,504295674_94485333270_228899,start,2017-04-08 04:49:00,4,8
...,...,...,...,...,...,...,...,...
342864,342864,6820024,415995398_21642161061,917003829_36953216287_21421,start,2017-03-18 12:27:00,3,18
342865,342865,6820024,415995398_21642161061,917003829_36953216287_21421,step_1,2017-03-18 12:26:00,3,18
342866,342866,6820024,415995398_21642161061,917003829_36953216287_21421,start,2017-03-18 12:25:00,3,18
342923,342923,127202,575836768_6553168853,786607636_58917493275_576382,start,2017-04-08 12:36:00,4,8


### Relative time 

In [39]:
import time
from datetime import date
today = date.today()
today

datetime.date(2021, 3, 25)

In [40]:
#your local time 
time.localtime(time.time())

time.struct_time(tm_year=2021, tm_mon=3, tm_mday=25, tm_hour=10, tm_min=10, tm_sec=13, tm_wday=3, tm_yday=84, tm_isdst=0)

In [41]:
#current time in london
time.gmtime(time.time())

time.struct_time(tm_year=2021, tm_mon=3, tm_mday=25, tm_hour=9, tm_min=11, tm_sec=9, tm_wday=3, tm_yday=84, tm_isdst=0)

# String functions

In [None]:

# https://docs.python.org/2.5/lib/string-methods.html

In [42]:
string = " I am learning  data  analysis at Ironhack  . It is  the BEST! "


In [43]:
string.lower()

' i am learning  data  analysis at ironhack  . it is  the best! '

In [44]:
string.upper()

' I AM LEARNING  DATA  ANALYSIS AT IRONHACK  . IT IS  THE BEST! '

In [45]:
#we can identify numbers in strings 
'34'.isdigit() # hint: this does not work with decimal numbers

True

In [47]:
subjects='data science, coding  , python, visualisation '

In [48]:
#using a basic split around the comma 
subjects.split(', ')

['data science', 'coding  ', 'python', 'visualisation ']

In [49]:
#combine split on column with trim of whitespace
pieces = [x.strip() for x in subjects.split(',')]
pieces

['data science', 'coding', 'python', 'visualisation']

In [50]:
#keyword search 
'data science' in subjects

True

In [51]:
str='data scientists too'

In [52]:
#isolate part of a string using index 
str[5:-4]

'scientists'

In [53]:
#replace
'its hard playing with strings'.replace('hard','brilliant')

'its brilliant playing with strings'

In [58]:
#use left strip to remove characters 
text="..ssshhh..,,,,aaaahsdata"
x = text.lstrip(".,ash")
x

'data'

In [55]:
#use left strip to remove leading white space
text2="     whoami?"
y= text2.lstrip()
y

'whoami?'

In [60]:
#substring count 
my_string = "How many fruits do you have in your fruit basket?"
my_string.count("fruit")

2