# Aim
Create pipelines for data cleaning, preprocessing and feature engineering. The input to this pipeline will be raw data; the output will be preprocessed data ready for modeling.

### Principles of tidy data:    
* Each variable forms a column
* Each observation forms a row
* Each type of observational unit forms a table

The first step in resolving messy data is to recognize it when it exists, and the possibilities are boundless.
Hadley Wickham's "Tidy Data" paper explicitly mentions five of the most common types of messy data:    
* Column names are values, not variable names
* Multiple variables are stored in column names
* Variables are stored in both rows and columns
* Multiple types of observational units are stored in the same table
* A single observational unit is stored in multiple tables

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import black

In [2]:
# Load the raw CSV files into DataFrames.
sales_train_val_df = pd.read_csv('../data/raw/sales_train_validation.csv')
sell_prices_df = pd.read_csv('../data/raw/sell_prices.csv')
calendar_df = pd.read_csv('../data/raw/calendar.csv')
# Loading the sample submission file is left disabled.
#submission_df = pd.read_csv('../data/raw/sample_submission.csv')

In [3]:
# Preview the first rows of the calendar table.
calendar_df.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1


In [4]:
# Summarise column dtypes and non-null counts of the calendar table.
calendar_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1969 entries, 0 to 1968
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          1969 non-null   object
 1   wm_yr_wk      1969 non-null   int64 
 2   weekday       1969 non-null   object
 3   wday          1969 non-null   int64 
 4   month         1969 non-null   int64 
 5   year          1969 non-null   int64 
 6   d             1969 non-null   object
 7   event_name_1  162 non-null    object
 8   event_type_1  162 non-null    object
 9   event_name_2  5 non-null      object
 10  event_type_2  5 non-null      object
 11  snap_CA       1969 non-null   int64 
 12  snap_TX       1969 non-null   int64 
 13  snap_WI       1969 non-null   int64 
dtypes: int64(7), object(7)
memory usage: 215.5+ KB


**Must**    
1. Convert data type of `date` column to datetime.
2. Replace NaN values with 'unknown'
3. Convert columns to nominal categorical data: weekday, month, year, event_name_1, event_type_1, event_name_2, event_type_2
4. drop one of the snap columns

**Optional**    
1. Convert datatypes of integer variables from int64 to int8 or int16 or int32

In [7]:
def change_date_datatpye(df, date_col):
    """Convert one column of a DataFrame to datetime, in place.

    Parameters
    ----------
    df : DataFrame whose column is overwritten.
    date_col : label of the column holding the date values.

    Returns
    -------
    None. ``df`` itself is modified.
    """
    # dayfirst=False: ambiguous day/month strings are read month-first.
    as_datetime = pd.to_datetime(df[date_col], dayfirst=False)
    df[date_col] = as_datetime

In [9]:
# Parse the calendar's `date` column into datetimes (the frame is modified in place).
change_date_datatpye(calendar_df, 'date')

In [10]:
# Re-check the dtypes: `date` should now be reported as datetime64[ns].
calendar_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1969 entries, 0 to 1968
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          1969 non-null   datetime64[ns]
 1   wm_yr_wk      1969 non-null   int64         
 2   weekday       1969 non-null   object        
 3   wday          1969 non-null   int64         
 4   month         1969 non-null   int64         
 5   year          1969 non-null   int64         
 6   d             1969 non-null   object        
 7   event_name_1  162 non-null    object        
 8   event_type_1  162 non-null    object        
 9   event_name_2  5 non-null      object        
 10  event_type_2  5 non-null      object        
 11  snap_CA       1969 non-null   int64         
 12  snap_TX       1969 non-null   int64         
 13  snap_WI       1969 non-null   int64         
dtypes: datetime64[ns](1), int64(7), object(6)
memory usage: 215.5+ KB


In [11]:
from sklearn.preprocessing import LabelEncoder

In [15]:
# Integer-encode the weekday names: the encoder numbers the sorted unique
# labels starting from 0 (so the codes follow alphabetical order).
# NOTE(review): scikit-learn documents LabelEncoder for target labels, and this
# overwrites the weekday names in place — confirm both are intended.
class_le = LabelEncoder()
weekday_codes = class_le.fit_transform(calendar_df['weekday'])
calendar_df['weekday'] = weekday_codes
calendar_df['weekday'].head(10)

0    2
1    3
2    1
3    5
4    6
5    4
6    0
7    2
8    3
9    1
Name: weekday, dtype: int64

In [17]:
# Confirm that `weekday` now holds integer codes instead of day names.
calendar_df.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,2,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,3,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,1,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,5,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,6,5,2,2011,d_5,,,,,1,0,1


In [18]:
# Frequency of each named event in `event_name_1` (missing values are excluded by default).
calendar_df['event_name_1'].value_counts()

NBAFinalsEnd           6
NBAFinalsStart         6
Purim End              6
StPatricksDay          6
ValentinesDay          6
SuperBowl              6
LentStart              6
LentWeek2              6
PresidentsDay          6
Pesach End             6
Mother's day           6
Ramadan starts         6
MemorialDay            6
Easter                 5
LaborDay               5
OrthodoxChristmas      5
ColumbusDay            5
Cinco De Mayo          5
EidAlAdha              5
Eid al-Fitr            5
VeteransDay            5
Chanukah End           5
Christmas              5
NewYear                5
IndependenceDay        5
Halloween              5
Thanksgiving           5
OrthodoxEaster         5
MartinLutherKingDay    5
Father's day           4
Name: event_name_1, dtype: int64

In [19]:
# Count missing values per column.
calendar_df.isnull().sum()

date               0
wm_yr_wk           0
weekday            0
wday               0
month              0
year               0
d                  0
event_name_1    1807
event_type_1    1807
event_name_2    1964
event_type_2    1964
snap_CA            0
snap_TX            0
snap_WI            0
dtype: int64

In [None]:
# NOTE(review): bare attribute reference — nothing is called here, so this
# cell has no effect on the data. Looks like a scratch leftover; consider removing.
pd.value_counts

In [21]:
from sklearn.impute import SimpleImputer

In [22]:
# Preview the first rows of the calendar table again.
calendar_df.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,2,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,3,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,1,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,5,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,6,5,2,2011,d_5,,,,,1,0,1


In [33]:
# Fill the gaps in `event_name_1` with the literal string 'unknown'.
# The imputer is given 2-D input, so the column's values are reshaped into a
# single-column array first (this is the numpy array's reshape).
# Reference kept from the original cell:
# https://pandas.pydata.org/pandas-docs/version/0.22/generated/pandas.Series.reshape.html#pandas.Series.reshape
imp_unkn = SimpleImputer(strategy='constant', fill_value='unknown')
event_name_1_2d = calendar_df['event_name_1'].values.reshape(-1, 1)
calendar_df['event_name_1'] = imp_unkn.fit_transform(event_name_1_2d)

In [None]:
# Scratch example of mean imputation with pandas' `fillna`.
# The calendar columns shown in this notebook do not include `Age`, so the fill
# is guarded: it is a no-op here instead of raising a KeyError.
# The mean is taken from `calendar_df` itself (NOTE(review): the snippet used an
# undefined name `x` for this — confirm that was the intent), and the result is
# assigned back rather than relying on `fillna(..., inplace=True)` applied to a
# column selection.
if 'Age' in calendar_df.columns:
    calendar_df['Age'] = calendar_df['Age'].fillna(calendar_df['Age'].mean())

In [34]:
# Check the result: `event_name_1` should show 'unknown' where it was missing.
calendar_df.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,2,1,1,2011,d_1,unknown,,,,0,0,0
1,2011-01-30,11101,3,2,1,2011,d_2,unknown,,,,0,0,0
2,2011-01-31,11101,1,3,1,2011,d_3,unknown,,,,0,0,0
3,2011-02-01,11101,5,4,2,2011,d_4,unknown,,,,1,1,0
4,2011-02-02,11101,6,5,2,2011,d_5,unknown,,,,1,0,1


In [35]:
# Tiny example frame with a single `val` column.
d = dict(val=[10, 20, 30])
df = pd.DataFrame(data=d)
df

Unnamed: 0,val
0,10
1,20
2,30


In [37]:
# Add 100 to every value; `apply` hands each column to the function as a Series.
df.apply(lambda column: column + 100)

Unnamed: 0,val
0,110
1,120
2,130


In [38]:
# Derive a numeric day index from labels such as "d_1": keep the piece after
# the first underscore and store it as an integer in a new `days` column.
calendar_df["days"] = (
    calendar_df["d"].str.split("_").str[1].astype("int64")
)

In [39]:
# List the column labels of the calendar table.
calendar_df.columns

Index(['date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year', 'd',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI', 'days'],
      dtype='object')

In [None]:
# Names of the columns to drop; empty for now.
columns_to_drop = list()

In [40]:
# pipeline https://stackoverflow.com/questions/57059571/how-to-use-simpleimputer-class-to-impute-missing-values-in-different-columns-wit

In [None]:
import os
import requests
import pandas as pd
from tqdm import tqdm

# Download one PDF per row of 'Springer.xlsx' (read from the current working
# directory) into a sub-directory named after the row's package.
# NOTE(review): this cell does not use any of the calendar/sales data handled
# elsewhere in this notebook — confirm it belongs here.
cwd = os.getcwd()
# The spreadsheet must provide the columns 'OpenURL', 'Book Title', 'Author'
# and 'English Package Name'.
books = pd.read_excel(os.path.join(cwd,'Springer.xlsx'))
print('Download started.')

for url, title, author, pk_name in tqdm(books[['OpenURL', 'Book Title', 'Author', 'English Package Name']].values):

  # Resolve the OpenURL; r.url is the address the request ended up at.
  r = requests.get(url)
  new_url = r.url

  # Turn the resolved book URL into the corresponding PDF URL.
  new_url = new_url.replace('/book/','/content/pdf/')
  new_url = new_url.replace('%2F','/')
  new_url = new_url + '.pdf'

  # File name is "<title>__<author>.pdf", with commas, dots and slashes
  # replaced or removed in both parts.
  final = title.replace(',','-').replace('.','').replace('/',' ') + '__' + author.replace(', ','+').replace('.','').replace('/',' ') + '.pdf'

  # One target directory per package. makedirs(exist_ok=True) avoids the
  # check-then-create race, and the name no longer shadows the builtin `dir`.
  target_dir = os.path.join(cwd,pk_name)
  os.makedirs(target_dir, exist_ok=True)

  myfile = requests.get(new_url, allow_redirects=True)
  if not myfile.ok:
    # Do not save an HTTP error body under a .pdf name; report and move on.
    print('Skipped (HTTP {}): {}'.format(myfile.status_code, new_url))
    continue

  # The context manager closes the file even if the write fails.
  with open(os.path.join(target_dir,final), 'wb') as pdf_file:
    pdf_file.write(myfile.content)

print('Download finished.')