## Importing Libraries

In [1]:
import pandas as pd
import numpy as np

# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer # noqa
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier

# now you can import normally from sklearn.impute

from sklearn.preprocessing import OrdinalEncoder

In [2]:
path = r'C:\Users\Shibbs\Desktop\Praxis\CAPP\cwd\datasets\combined_missing.csv'

In [3]:
df = pd.read_csv(path)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
#df.category_code=df.category_code.astype(str)

## Split category code before imputing.

After imputation it may so happen that category_code (original) may not be same as splitted category code L1 and L2.

We will again split the original category_code and check which splitting gives better results.

In [None]:
df["category_code_L1"] = df["category_code"].str.split(".",expand=True)[0]
df["category_code_L2"] = df["category_code"].str.split(".",expand=True)[1]


In [None]:
#df = df.drop(cat_cols,axis=1)

In [4]:
df = df.drop(['Unnamed: 0.1','Unnamed: 0'],axis=1)

In [5]:
df.isnull().sum()

event_time            0
event_type            0
product_id            0
category_id           0
category_code    143865
brand            212364
price                 0
user_id               0
user_session        165
dtype: int64

### Encode using Ordinal Encoder

In [6]:
#instantiate ordinal encoder
encoder = OrdinalEncoder()

# create a list of categorical columns to iterate over
cat_cols = ['brand', 'category_code']

def encode(data):
    '''function to encode non-null data and replace it in the original data'''
    
    #retains only non-null values
    nonulls = np.array(data.dropna())
    
    #reshapes the data for encoding
    impute_reshape = nonulls.reshape(-1,1)
    
    #encode date
    impute_ordinal = encoder.fit_transform(impute_reshape).astype(int)
    
    #Assign back encoded values to non-null values
    data.loc[data.notnull()] = np.squeeze(impute_ordinal)
    
    return data

#create a for loop to iterate through each column in the data
for columns in cat_cols:
    encode(df[columns])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


#### Copy encoded data to another dataframe for future use

In [10]:
df1 = df[['brand', 'category_code']]

In [None]:
#df.category_id=df.category_id.astype(object)

### Impute using Iterative imputer sklearn

In [None]:
mice_impute = IterativeImputer()

In [None]:
# impute data and convert 
encode_data = pd.DataFrame(np.round(mice_impute.fit_transform(df[cat_cols])),columns = cat_cols )

### Impute using CatBoost Iterative imputer sklearn

In [7]:
from catboost import CatBoostClassifier

In [26]:
imputer = CatBoostClassifier()

In [28]:
imp = IterativeImputer(estimator=imputer, max_iter=20, verbose=2)
brand = imp.fit_transform(np.array(df1[['brand','category_code']]))



[IterativeImputer] Completing matrix with shape (885124, 2)
Learning rate set to 0.110839


KeyboardInterrupt: 

In [None]:
# encode_data.to_csv(r'C:\Users\Shibbs\Desktop\Praxis\CAPP\cwd\datasets\encknn.csv')

In [24]:
np.array(df1[['brand','category_code']]).shape

(885124, 2)

In [15]:
category.reshape(1, -1)

array([[ 60.,  60.,  60., ..., 168., 168., 122.]])

In [19]:
category = np.transpose(category)

In [21]:
category.shape

(1, 885124)

In [20]:
cat = pd.Series(category)

ValueError: Data must be 1-dimensional

In [31]:
brand.reshape(-1,1)

array([[265.],
       [265.],
       [265.],
       ...,
       [963.],
       [963.],
       [964.]])

In [32]:
df3 = pd.Series({'category_code':category, 'brand':brand})

In [33]:
df3.isnull().sum()

0

In [34]:
df3

category_code    [[60.0], [60.0], [60.0], [60.0], [60.0], [60.0...
brand            [[265.0], [265.0], [265.0], [265.0], [265.0], ...
dtype: object

# 

In [16]:
df = df.drop(cat_cols,axis=1)

In [17]:
df = pd.concat([df,df3],axis=1)

In [18]:
df.isnull().sum()

event_time           2
event_type           2
product_id           2
category_id          2
price                2
user_id              2
user_session       167
0               885124
dtype: int64

In [19]:
df

Unnamed: 0,event_time,event_type,product_id,category_id,price,user_id,user_session,0
0,2020-11-20 15:19:47 UTC,view,1642613.0,2.144416e+18,25.56,1.515916e+18,9cdf5Yzi4W,
1,2020-10-18 18:29:34 UTC,view,1642613.0,2.144416e+18,25.56,1.515916e+18,DomDtlfN2x,
2,2020-11-20 19:18:47 UTC,view,1642613.0,2.144416e+18,25.56,1.515916e+18,N5U9nr75XP,
3,2020-12-20 01:25:28 UTC,view,1642613.0,2.144416e+18,25.56,1.515916e+18,apG36IgOcf,
4,2021-02-24 15:59:50 UTC,view,1642613.0,2.144416e+18,25.56,1.515916e+18,7CZ3WuPoU3,
...,...,...,...,...,...,...,...,...
885121,2020-10-08 12:10:00 UTC,view,864264.0,2.144420e+18,20.63,1.515920e+18,kQHdSa7WoM,
885122,2020-10-08 12:10:02 UTC,view,864264.0,2.144420e+18,20.63,1.515920e+18,GAylf4yJHR,
885123,2020-12-31 06:47:03 UTC,view,703481.0,2.144420e+18,29.86,1.515920e+18,iyfJWibfdp,
brand,,,,,,,,"[[265.0], [265.0], [265.0], [265.0], [265.0], ..."


In [None]:
df.to_csv(r'C:\Users\Shibbs\Desktop\Praxis\CAPP\cwd\datasets\catboost_0.csv')

In [None]:
df1