# Import

In [2]:
from datetime import datetime
from datetime import timedelta

import pandas as pd

# Load Data

In [3]:
# MovieLens ratings file hosted in the pydata-book repository.
file_path = 'https://github.com/wesm/pydata-book/raw/2nd-edition/datasets/movielens/ratings.dat'

# Column labels to apply to the file's fields, in order.
col_names = [
    'user_id',
    'movie_id',
    'rating',
    'timestamp',
]

In [4]:
# The file has no header row, so column labels are supplied via `names`.
# The '::' separator is longer than one character, so pandas treats it as a
# regular expression, which requires the Python parsing engine.
ratings = pd.read_csv(
    file_path,
    sep='::',
    header=None,
    names=col_names,
    encoding='cp1252',
    engine='python',
)

In [5]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


# Add Derived Variable 'dt'
* Works on the `ratings` DataFrame
* Adds a derived column `dt`: each `timestamp` value (seconds since the Unix epoch) converted to a `datetime` object

In [6]:
# Convert every epoch-seconds value with datetime.fromtimestamp, which
# returns a naive datetime expressed in the machine's local time zone.
dates = [datetime.fromtimestamp(ts) for ts in ratings['timestamp']]

# Wrap the list in a pd.Series and store it as the new 'dt' column.
ratings['dt'] = pd.Series(dates)

In [7]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,dt
0,1,1193,5,978300760,2001-01-01 07:12:40
1,1,661,3,978302109,2001-01-01 07:35:09
2,1,914,3,978301968,2001-01-01 07:32:48
3,1,3408,4,978300275,2001-01-01 07:04:35
4,1,2355,5,978824291,2001-01-07 08:38:11


In [8]:
ratings.tail()

Unnamed: 0,user_id,movie_id,rating,timestamp,dt
1000204,6040,1091,1,956716541,2000-04-26 11:35:41
1000205,6040,1094,5,956704887,2000-04-26 08:21:27
1000206,6040,562,5,956704746,2000-04-26 08:19:06
1000207,6040,1096,4,956715648,2000-04-26 11:20:48
1000208,6040,1097,4,956715569,2000-04-26 11:19:29


In [9]:
ratings['dt'].describe()

  ratings['dt'].describe()


count                 1000209
unique                 458455
top       2000-11-30 05:06:42
freq                       30
first     2000-04-26 08:05:32
last      2003-03-01 02:49:50
Name: dt, dtype: object

# using pd.to_datetime() function
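* **`pd.to_datetime()` function**: converts an entire Series to datetime values in a single vectorized call, replacing an element-by-element loop over `datetime.strptime()` or `datetime.fromtimestamp()`. With `unit='s'` the integers are read as seconds since the Unix epoch and the result is in UTC, whereas `datetime.fromtimestamp()` returns local time — which is why `dt` and `dt2` below differ by a constant offset.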

In [10]:
# Vectorized conversion: read the integer column as seconds since the Unix
# epoch. The resulting values are naive datetimes in UTC.
ratings['dt2'] = pd.to_datetime(
    arg=ratings['timestamp'],
    unit='s',
)

In [11]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,dt,dt2
0,1,1193,5,978300760,2001-01-01 07:12:40,2000-12-31 22:12:40
1,1,661,3,978302109,2001-01-01 07:35:09,2000-12-31 22:35:09
2,1,914,3,978301968,2001-01-01 07:32:48,2000-12-31 22:32:48
3,1,3408,4,978300275,2001-01-01 07:04:35,2000-12-31 22:04:35
4,1,2355,5,978824291,2001-01-07 08:38:11,2001-01-06 23:38:11


In [12]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 6 columns):
 #   Column     Non-Null Count    Dtype         
---  ------     --------------    -----         
 0   user_id    1000209 non-null  int64         
 1   movie_id   1000209 non-null  int64         
 2   rating     1000209 non-null  int64         
 3   timestamp  1000209 non-null  int64         
 4   dt         1000209 non-null  datetime64[ns]
 5   dt2        1000209 non-null  datetime64[ns]
dtypes: datetime64[ns](2), int64(4)
memory usage: 45.8 MB


# Change timestamp to datetime
* Once the column holds datetime values, year/month/quarter information can be extracted easily through the `.dt` accessor

In [13]:
# Derive the calendar fields from the 'dt' column through the .dt accessor.
# All three values are computed from 'dt' before any column is assigned.
ratings['year'], ratings['month'], ratings['quarter'] = (
    getattr(ratings['dt'].dt, field)
    for field in ('year', 'month', 'quarter')
)

In [14]:
ratings.head(10)

Unnamed: 0,user_id,movie_id,rating,timestamp,dt,dt2,year,month,quarter
0,1,1193,5,978300760,2001-01-01 07:12:40,2000-12-31 22:12:40,2001,1,1
1,1,661,3,978302109,2001-01-01 07:35:09,2000-12-31 22:35:09,2001,1,1
2,1,914,3,978301968,2001-01-01 07:32:48,2000-12-31 22:32:48,2001,1,1
3,1,3408,4,978300275,2001-01-01 07:04:35,2000-12-31 22:04:35,2001,1,1
4,1,2355,5,978824291,2001-01-07 08:38:11,2001-01-06 23:38:11,2001,1,1
5,1,1197,3,978302268,2001-01-01 07:37:48,2000-12-31 22:37:48,2001,1,1
6,1,1287,5,978302039,2001-01-01 07:33:59,2000-12-31 22:33:59,2001,1,1
7,1,2804,5,978300719,2001-01-01 07:11:59,2000-12-31 22:11:59,2001,1,1
8,1,594,4,978302268,2001-01-01 07:37:48,2000-12-31 22:37:48,2001,1,1
9,1,919,4,978301368,2001-01-01 07:22:48,2000-12-31 22:22:48,2001,1,1


## Extract 2003 Q1 data from the ratings DataFrame

In [15]:
# Select the ratings made in the first quarter of 2003.
# Each comparison needs its own parentheses: `&` binds more tightly than `==`,
# so without them the mask is evaluated as ((year == 2003) & quarter) == 1,
# a bitwise AND with the integer quarter that would also match quarter 3.
ratings[(ratings['year'] == 2003) & (ratings['quarter'] == 1)]

Unnamed: 0,user_id,movie_id,rating,timestamp,dt,dt2,year,month,quarter
8696,59,1175,4,1041967123,2003-01-08 04:18:43,2003-01-07 19:18:43,2003,1,1
8700,59,1252,4,1041967475,2003-01-08 04:24:35,2003-01-07 19:24:35,2003,1,1
8702,59,1183,4,1041963129,2003-01-08 03:12:09,2003-01-07 18:12:09,2003,1,1
8704,59,2997,4,1041962568,2003-01-08 03:02:48,2003-01-07 18:02:48,2003,1,1
8706,59,3871,4,1041968282,2003-01-08 04:38:02,2003-01-07 19:38:02,2003,1,1
...,...,...,...,...,...,...,...,...,...
984842,5950,3317,3,1046369439,2003-02-28 03:10:39,2003-02-27 18:10:39,2003,2,1
984847,5950,3328,3,1046369090,2003-02-28 03:04:50,2003-02-27 18:04:50,2003,2,1
984849,5950,111,5,1046368241,2003-02-28 02:50:41,2003-02-27 17:50:41,2003,2,1
984861,5950,3363,5,1046367948,2003-02-28 02:45:48,2003-02-27 17:45:48,2003,2,1


In [16]:
# Select Q1-2003 ratings, reading year and quarter directly from the 'dt'
# column rather than from the derived 'year'/'quarter' columns.
ratings.loc[
    (ratings['dt'].dt.quarter == 1)
    & (ratings['dt'].dt.year == 2003)
]

Unnamed: 0,user_id,movie_id,rating,timestamp,dt,dt2,year,month,quarter
8696,59,1175,4,1041967123,2003-01-08 04:18:43,2003-01-07 19:18:43,2003,1,1
8700,59,1252,4,1041967475,2003-01-08 04:24:35,2003-01-07 19:24:35,2003,1,1
8702,59,1183,4,1041963129,2003-01-08 03:12:09,2003-01-07 18:12:09,2003,1,1
8704,59,2997,4,1041962568,2003-01-08 03:02:48,2003-01-07 18:02:48,2003,1,1
8706,59,3871,4,1041968282,2003-01-08 04:38:02,2003-01-07 19:38:02,2003,1,1
...,...,...,...,...,...,...,...,...,...
984842,5950,3317,3,1046369439,2003-02-28 03:10:39,2003-02-27 18:10:39,2003,2,1
984847,5950,3328,3,1046369090,2003-02-28 03:04:50,2003-02-27 18:04:50,2003,2,1
984849,5950,111,5,1046368241,2003-02-28 02:50:41,2003-02-27 17:50:41,2003,2,1
984861,5950,3363,5,1046367948,2003-02-28 02:45:48,2003-02-27 17:45:48,2003,2,1
