In [None]:
#Mounting the drive to the colab workspace.
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

# Notebook 2: Preprocessing the CSV file:
**Overview of the notebook**
* The 'prepareDatasetCSV.ipynb' produces the csv files for click and impressions logs stored as '*clk_logs.csv*' and '*imp_logs.csv*' in the folder TRAIN.
* This notebook walks through the process of some data preprocessing performed on those csv files.
* This notebook produces separate training dataset csv files for all the advertisers involved; doing so helps with statistical analysis and is efficient in terms of memory.
* The code implemented here is made into a library [preprocessCSV.py](https://drive.google.com/file/d/1tNcT-mDV672PSZNqgd2iQ_e6OyDkW0i2/view?usp=sharing) for later use.


In [None]:
path_clk = 'TRAIN/clk_logs.csv'
path_imp = 'TRAIN/imp_logs.csv'

In [None]:
#Loading the dependencies.
import os
import pandas as pd
from tqdm import tqdm

In [None]:
#Utilities.
def get_chunks(in_path,out_path,chunk_size=100000,verbose=False):
    '''
    Split the csv file at `in_path` into smaller csv files.

    The pieces are written to the directory `out_path` as 'chunk1.csv',
    'chunk2.csv', ... with at most `chunk_size` rows each.

    Parameters
    ----------
    in_path : path of the csv file to split.
    out_path : directory that receives the chunk files. It is created when
        missing; if it already contains anything, nothing is written.
    chunk_size : maximum number of rows per chunk file.
    verbose : when True, print a running count of the chunks written.

    Returns
    -------
    None
    '''
    #Create the target directory on a first run instead of failing on listdir.
    os.makedirs(out_path, exist_ok=True)
    if os.listdir(out_path):
        print(f'Wow! Chunks have already been created!!!')
        return
    #Read from the `in_path` argument (not from a notebook-level global).
    reader = pd.read_csv(in_path,chunksize=chunk_size)
    for chunk_num, chunk in enumerate(reader, start=1):
        #os.path.join works whether or not out_path ends with a separator.
        chunk.to_csv(os.path.join(out_path,'chunk'+str(chunk_num)+'.csv'),index=False)
        if verbose:
            print(f'Number of chunks created: {chunk_num}')
    print(f'csv file is made in to small chunks and is stored at:{out_path}')



In [None]:
#Navigating to main directory.
root_dir = r'/content/drive/My Drive/HS4007/Real_Time_Bidding'
os.chdir(root_dir)
#Sanity Check.
!pwd

/content/drive/My Drive/HS4007/Real_Time_Bidding


**Loading "clk_logs.csv" as a pandas dataframe**

In [None]:
clk_df = pd.read_csv(path_clk)
clk_df.tail(2)

NameError: ignored

***
* The above cell provides an overview of the different features that can be exploited for our studies.



**List of advertisers**
* Each advertiser will have a unique advertiser_id.


In [None]:
advertisers = clk_df['advertiser_id'].unique()
print(f'The advertiser_id of different advertisers involved are{advertisers}')

The advertiser_id of different advertisers involved are[3476 3358 3386 3427 1458]


Therefore there are five different advertisers in the dataset.

**Loading the "imp_logs.csv" as pandas dataframe**

* Since the dataframe is large, it's better to read it in chunks.
* This process will split up the 'imp_logs.csv' into many csv files each of size equal to the chunksize and are stored at 'TRAIN/imp_chunks' which can be looked at for getting a sense of the dataset.

In [None]:
path_to_chunks = 'TRAIN/imp_chunks/'
get_chunks(path_imp,path_to_chunks,verbose=True)

Wow! Chunks have already been created!!!


## Preprocessing Steps Illustrated
* All the steps of preprocessing will be illustrated with a small chunk of imp_logs.csv and is called "toy_imp".


Initially the dataframe looks as follows,

In [None]:
toy_imp = pd.read_csv(path_to_chunks+'chunk1.csv')
toy_imp.head(2)

Unnamed: 0,bid_id,timestamp,log_type,ipinyou_id,user_agent,ip_address,region_id,city_id,ad_exchange,domain,url,anonymous_url_id,ad_slot_id,ad_slot_width,ad_slot_height,ad_slot_visibility,ad_slot,ad_slot_floor_price,creative_id,bidding_price,paying_price,key_page_url,advertiser_id,user_tags
0,8a15b98c8f9e60d4f92aaab01acf52a4,20130606000104192,1,VhTVORqG36N6qMj,Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1...,114.100.37.*,106,117,1,lsxSl559Xql7FmMs,8c9742e63497713b97ac7e780a8f9a12,,mm_30232185_2681382_11190685,950,90,0,1,0,23d6dade7ed21cea308205b37594003e,227,207,b2e35064f3549d447edbbdfb1f707c8c,3427,"10063,10684,10083,13403,10059,10024,10048,1005..."
1,5bd0cbeb2f82fb94e56b7dc2e6b77ec,20130606000104252,1,Vhkr1uaROqKsXmb,Mozilla/4.0 (compatible; MSIE 7.0; Windows NT ...,222.220.35.*,308,320,2,eSMvBpa0jqmUagk4JKTI,15b56f50bbaa689c91ccaf5ed7c97b48,,3521017083,336,280,2,0,5,13606a7c541dcd9ca1948875a760bb31,238,72,d29e59bf0f7f8243858b8183f14d4412,3358,1380010024


As can be seen they have similar columns as clk_df. The columns are as follows,

In [None]:
clk_df.columns

Index(['bid_id', 'timestamp', 'log_type', 'ipinyou_id', 'user_agent',
       'ip_address', 'region_id', 'city_id', 'ad_exchange', 'domain', 'url',
       'anonymous_url_id', 'ad_slot_id', 'ad_slot_width', 'ad_slot_height',
       'ad_slot_visibility', 'ad_slot', 'ad_slot_floor_price', 'creative_id',
       'bidding_price', 'paying_price', 'key_page_url', 'advertiser_id',
       'user_tags'],
      dtype='object')

---
### Step 0: Creating the click variable which is a boolean.

* This is an important variable as it tells us whether a given impression resulted in a click or not.



In [None]:
def create_click_col(df1,df2):
    '''
    Build a boolean 'click' column for the impressions dataframe.

    An impression counts as clicked when its 'bid_id' also occurs in the
    'bid_id' column of the clicks dataframe.

    Parameters
    ----------
    df1 : impressions dataframe; must contain 'bid_id' and 'log_type'.
    df2 : clicks dataframe; must contain 'bid_id'.

    Returns
    -------
    A new dataframe with 'click' as the first column and with 'bid_id' and
    'log_type' removed. `df1` itself is left unmodified.
    '''
    #isin always yields True/False, so there are no missing values to drop.
    clicked = df1['bid_id'].isin(df2['bid_id'])
    #bid_id and log_type are not used after this step; drop() returns a new
    #frame, so the caller's dataframe is not touched.
    out = df1.drop(columns=['bid_id','log_type'])
    #Replace a pre-existing 'click' column rather than failing on insert.
    if 'click' in out.columns:
        out = out.drop(columns=['click'])
    out.insert(0,'click',clicked)
    return out

In [None]:
toy_imp = create_click_col(toy_imp,clk_df)
toy_imp.head(3)

Unnamed: 0,click,timestamp,ipinyou_id,user_agent,ip_address,region_id,city_id,ad_exchange,domain,url,anonymous_url_id,ad_slot_id,ad_slot_width,ad_slot_height,ad_slot_visibility,ad_slot,ad_slot_floor_price,creative_id,bidding_price,paying_price,key_page_url,advertiser_id,user_tags
0,False,20130606000104192,VhTVORqG36N6qMj,Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1...,114.100.37.*,106,117,1,lsxSl559Xql7FmMs,8c9742e63497713b97ac7e780a8f9a12,,mm_30232185_2681382_11190685,950,90,0,1,0,23d6dade7ed21cea308205b37594003e,227,207,b2e35064f3549d447edbbdfb1f707c8c,3427,"10063,10684,10083,13403,10059,10024,10048,1005..."
1,False,20130606000104252,Vhkr1uaROqKsXmb,Mozilla/4.0 (compatible; MSIE 7.0; Windows NT ...,222.220.35.*,308,320,2,eSMvBpa0jqmUagk4JKTI,15b56f50bbaa689c91ccaf5ed7c97b48,,3521017083,336,280,2,0,5,13606a7c541dcd9ca1948875a760bb31,238,72,d29e59bf0f7f8243858b8183f14d4412,3358,1380010024
2,False,20130606000104253,VhL01pk8OTkW3Mc,Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1...,58.100.240.*,94,95,1,tK1NTu1YP5scFsf,e22930480589abcc1468854cb3403314,,mm_10075660_3500949_11453278,950,90,0,1,0,d5cecca9a6cbd7a0a48110f1306b26d1,227,108,d29e59bf0f7f8243858b8183f14d4412,3358,10059138661006310111


### Step 1: Getting the weekday and hour of the day.

* 'timestamp' feature is modified to get the day and time as features.

In [None]:
def get_date(df):
    '''
    Replace the 'timestamp' column by 'day' and 'hour' columns.

    The timestamp is converted to text, its last three characters are removed
    (presumably the millisecond digits, going by the sample rows) and the rest
    is parsed with the format '%Y%m%d%H%M%S'.

    Parameters
    ----------
    df : dataframe containing the columns 'timestamp' and 'click'.

    Returns
    -------
    A new dataframe without 'timestamp', whose first three columns are
    'day' (pandas dayofweek, Monday=0), 'hour' and 'click'. `df` itself is
    left unmodified.
    '''
    #Vectorised string slicing instead of a row-wise apply.
    trimmed = df['timestamp'].astype(str).str[:-3]
    stamps = pd.to_datetime(trimmed,format='%Y%m%d%H%M%S')
    out = df.drop(columns=['timestamp']).assign(hour=stamps.dt.hour,
                                                day=stamps.dt.dayofweek)
    leading = ['day','hour','click']
    cols = leading + [col for col in out.columns if col not in leading]
    #Explicit copy so later in-place edits do not raise SettingWithCopyWarning.
    return out[cols].copy()

In [None]:
toy_imp=get_date(toy_imp)
toy_imp.head()

Unnamed: 0,day,hour,click,ipinyou_id,user_agent,ip_address,region_id,city_id,ad_exchange,domain,url,anonymous_url_id,ad_slot_id,ad_slot_width,ad_slot_height,ad_slot_visibility,ad_slot,ad_slot_floor_price,creative_id,bidding_price,paying_price,key_page_url,advertiser_id,user_tags
0,3,0,False,VhTVORqG36N6qMj,Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1...,114.100.37.*,106,117,1,lsxSl559Xql7FmMs,8c9742e63497713b97ac7e780a8f9a12,,mm_30232185_2681382_11190685,950,90,0,1,0,23d6dade7ed21cea308205b37594003e,227,207,b2e35064f3549d447edbbdfb1f707c8c,3427,"10063,10684,10083,13403,10059,10024,10048,1005..."
1,3,0,False,Vhkr1uaROqKsXmb,Mozilla/4.0 (compatible; MSIE 7.0; Windows NT ...,222.220.35.*,308,320,2,eSMvBpa0jqmUagk4JKTI,15b56f50bbaa689c91ccaf5ed7c97b48,,3521017083,336,280,2,0,5,13606a7c541dcd9ca1948875a760bb31,238,72,d29e59bf0f7f8243858b8183f14d4412,3358,1380010024
2,3,0,False,VhL01pk8OTkW3Mc,Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1...,58.100.240.*,94,95,1,tK1NTu1YP5scFsf,e22930480589abcc1468854cb3403314,,mm_10075660_3500949_11453278,950,90,0,1,0,d5cecca9a6cbd7a0a48110f1306b26d1,227,108,d29e59bf0f7f8243858b8183f14d4412,3358,10059138661006310111
3,3,0,False,VhT3La5uDlaywOj,Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1...,122.233.40.*,94,95,1,trqRTummPvas1m58uG,762a1aa930e6f41c969e77226c62f3a7,,mm_10058411_2865562_9929053,300,250,2,1,0,44966cc8da1ed40c95d59e863c8c75f0,300,81,361e128affece850342293213691a043,3386,100571006310024138001386610110
4,3,0,False,VhkSPnNDP8L8eYl,Opera/9.80 (Android; Opera Mini/7.7.33548/29.3...,58.67.157.*,216,217,1,trqRTuS8jZL7FmMs,1eb132233cd79996f8abdd03e2de6699,,mm_40468498_3493773_11415894,300,250,0,5,0,d881a6c788e76c2c27ed1ef04f119544,227,89,d29e59bf0f7f8243858b8183f14d4412,3358,1386610111


***
### Step 2: Dropping the unwanted columns.

As we are in the business of finding patterns in data, any column that is unique to a user can be safely removed. Based on this and other factors, the columns to be dropped are,
***
Column Name | Reason to be dropped
 ---|---
ipinyou_id & user_id & ad_slot_id | It's an internal user id and can be safely ignored as it is unique for every user and hence cannot be used for classification.
anonymous_url_id & creative_id| Highly specialised for user and ad. Also, effects are set in place only after click.
ip_address | Can be used for location but would result in a large number of categories as it is unique for every user.
city_id | As it is all collected from china alone, using this data doesn't make sense for a generalized model. Also, this information is encoded in region_id.
domain & URL | Both of these are hashed values corresponding to the hosting webpage of the ad slot. They are useless for the task where only the clicks matter. Whatever effect they have will all be after the user clicks the link.





In [None]:
cols_to_be_dropped=['ipinyou_id','ip_address','city_id','domain','url',
                    'anonymous_url_id','ad_slot_id','creative_id','key_page_url','user_tags']

toy_imp.drop(columns=cols_to_be_dropped,inplace=True)

In [None]:
toy_imp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   day                  100000 non-null  int64 
 1   hour                 100000 non-null  int64 
 2   click                100000 non-null  bool  
 3   user_agent           99960 non-null   object
 4   region_id            100000 non-null  int64 
 5   ad_exchange          100000 non-null  int64 
 6   ad_slot_width        100000 non-null  int64 
 7   ad_slot_height       100000 non-null  int64 
 8   ad_slot_visibility   100000 non-null  int64 
 9   ad_slot              100000 non-null  int64 
 10  ad_slot_floor_price  100000 non-null  int64 
 11  bidding_price        100000 non-null  int64 
 12  paying_price         100000 non-null  int64 
 13  advertiser_id        100000 non-null  int64 
dtypes: bool(1), int64(12), object(1)
memory usage: 10.8+ MB


***
### Step 3: Determining what kind of operating system and browser the user is using.

* This is a categorical data

In [None]:
class user_agent(object):
    '''
    Derives categorical 'browser' and 'os' columns from the raw 'user_agent'
    string column of a dataframe.

    The dataframe passed in is not modified; the class works on its own
    renamed copy, which is returned by `create_cols`.
    '''
    def __init__(self,df):
        '''
        Parameters
        ----------
        df : dataframe containing a 'user_agent' column.
        '''
        #rename() returns a new frame, so the caller's dataframe keeps its
        #'user_agent' column.
        self.df = df.rename(columns={'user_agent':'browser'})
        self.df['browser'] = self.df['browser'].astype(str)
        self.cols = None

    def map_browser(self,agent):
        '''
        Return the browser family for one user-agent string: the first of
        'edge', 'ie', 'chrome', 'firefox', 'safari', 'opera' whose token occurs
        in the (lower-cased) string, else 'other'.
        '''
        agent = str(agent).lower()
        #Order matters: Chrome agents also contain 'safari'. 'msie' is checked
        #last so that only agents which previously fell through to 'other'
        #(older Internet Explorer without a 'trident' token) are affected.
        browsers = ['edge', 'trident', 'chrome', 'firefox', 'safari', 'opera', 'msie']
        for browser in browsers:
            if browser in agent:
                return 'ie' if browser in ('trident', 'msie') else browser
        return 'other'
    
    def map_os(self,agent):
        '''
        Return 'windows', 'linux' or 'mac os x' when that token occurs in the
        (lower-cased) user-agent string, else 'other'.
        '''
        agent = str(agent).lower()
        #Loop variable is not called `os`, to avoid shadowing the os module.
        for os_name in ['windows', 'linux', 'mac os x']:
            if os_name in agent:
                return os_name
        return 'other'
    
    def reorder_cols(self,target_index):
        '''
        Swap the column at position `target_index` with the last column and
        return the reordered dataframe (also stored on `self.df`).
        '''
        cols = [col for col in self.df]
        cols[target_index],cols[-1] = cols[-1],cols[target_index]
        #Explicit copy so later in-place edits do not raise SettingWithCopyWarning.
        self.df = self.df[cols].copy()
        return self.df


    def create_cols(self):
        '''
        Add the categorical 'os' column, turn 'browser' into a categorical
        browser family, and move 'os' to column position 4 (the column that
        was there becomes the last one).

        Returns
        -------
        The transformed dataframe.
        '''
        #'os' must be computed first: it reads the raw agent text that is
        #still stored in 'browser' at this point.
        self.df['os'] = self.df['browser'].map(self.map_os).astype('category')
        self.df['browser'] = self.df['browser'].map(self.map_browser).astype('category')
        self.df = self.reorder_cols(4)
        return self.df



In [None]:
c= user_agent(toy_imp)
toy_imp = c.create_cols()

In [None]:
toy_imp.tail(3)

Unnamed: 0,day,hour,click,browser,os,ad_exchange,ad_slot_width,ad_slot_height,ad_slot_visibility,ad_slot,ad_slot_floor_price,bidding_price,paying_price,advertiser_id,region_id
99997,3,1,False,chrome,windows,2,300,250,1,0,5,249,120,3476,124
99998,3,1,False,ie,windows,2,336,280,2,0,4,249,42,3476,124
99999,3,1,False,ie,windows,3,728,90,0,0,20,300,20,3386,216


In [None]:
toy_imp['region_id'].value_counts()

In [None]:
#`df` is only defined at the very end of the notebook; the frame being
#inspected at this point is the toy chunk.
toy_imp['browser'].value_counts()

In [None]:
toy_imp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 15 columns):
 #   Column               Non-Null Count   Dtype   
---  ------               --------------   -----   
 0   day                  100000 non-null  int64   
 1   hour                 100000 non-null  int64   
 2   click                100000 non-null  bool    
 3   browser              100000 non-null  category
 4   os                   100000 non-null  category
 5   ad_exchange          100000 non-null  int64   
 6   ad_slot_width        100000 non-null  int64   
 7   ad_slot_height       100000 non-null  int64   
 8   ad_slot_visibility   100000 non-null  int64   
 9   ad_slot              100000 non-null  int64   
 10  ad_slot_floor_price  100000 non-null  int64   
 11  bidding_price        100000 non-null  int64   
 12  paying_price         100000 non-null  int64   
 13  advertiser_id        100000 non-null  int64   
 14  region_id            100000 non-null  int64   
dtypes

In [None]:
toy_imp.columns

Index(['click', 'day', 'hour', 'browser', 'os', 'ad_exchange', 'ad_slot_width',
       'ad_slot_height', 'ad_slot_visibility', 'ad_slot',
       'ad_slot_floor_price', 'bidding_price', 'paying_price', 'advertiser_id',
       'region_id'],
      dtype='object')

***
### Step 4: Conversion to Numeric Features.

* bidding_price is the price bid from the ipinyou for the bid request and paying_price is the actual price paid.
* This won't need any modification as we are only using impression logs.
* Along with these two columns, the columns that need to be stored as integers are,

In [None]:
int_cols=['ad_slot_width','ad_slot_height','ad_slot_floor_price', 'bidding_price', 'paying_price']

def numerise_features(df,int_cols=['ad_slot_width','ad_slot_height','ad_slot_floor_price', 'bidding_price', 'paying_price']):
    '''
    Cast the given columns to 32-bit integers.

    Parameters
    ----------
    df : dataframe containing every column named in `int_cols`.
    int_cols : names of the columns to cast (the default list is only read,
        never modified).

    Returns
    -------
    A new dataframe in which the listed columns have dtype int32. `df` itself
    is left unmodified.
    '''
    #One astype call with a column -> dtype mapping; returns a new frame.
    return df.astype({feature: 'int32' for feature in int_cols})

In [None]:
toy_imp=numerise_features(toy_imp)


In [None]:
toy_imp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 15 columns):
 #   Column               Non-Null Count   Dtype   
---  ------               --------------   -----   
 0   click                100000 non-null  bool    
 1   day                  100000 non-null  int64   
 2   hour                 100000 non-null  int64   
 3   browser              100000 non-null  category
 4   os                   100000 non-null  category
 5   ad_exchange          100000 non-null  int64   
 6   ad_slot_width        100000 non-null  int32   
 7   ad_slot_height       100000 non-null  int32   
 8   ad_slot_visibility   100000 non-null  int64   
 9   ad_slot              100000 non-null  int64   
 10  ad_slot_floor_price  100000 non-null  int32   
 11  bidding_price        100000 non-null  int32   
 12  paying_price         100000 non-null  int32   
 13  advertiser_id        100000 non-null  int64   
 14  region_id            100000 non-null  int64   
dtypes

In [None]:
#`df` is only defined at the very end of the notebook; the frame being
#inspected at this point is the toy chunk.
toy_imp['paying_price'].value_counts()

20     7066
80     3160
22     2871
50     2443
21     2263
       ... 
285       4
269       4
297       3
273       3
298       1
Name: paying_price, Length: 301, dtype: int64

In [None]:
#`df` is only defined at the very end of the notebook; the frame being
#inspected at this point is the toy chunk.
toy_imp['bidding_price'].value_counts()

300    44762
238    21099
227    17609
241     6715
254     5071
249     4744
Name: bidding_price, dtype: int64

In [None]:
toy_imp.columns

Index(['click', 'day', 'hour', 'browser', 'os', 'ad_exchange', 'ad_slot_width',
       'ad_slot_height', 'ad_slot_visibility', 'ad_slot',
       'ad_slot_floor_price', 'bidding_price', 'paying_price', 'advertiser_id',
       'region_id'],
      dtype='object')


### Step 5: Conversion to Category features.

The features that will be categorized are,

Feature to be categorized | Reason
---|---
day | Can only be one of the seven days of a week.
hour | Can only be one of 24 hours in a day.
ad_slot_visibility | Describes the times after which the ad was displayed before it was clicked.
ad_slot | Describes the format in which the ad was displayed, whether it was an pop-up window, background etc...
ad_exchange | This corresponds to the DSP the bid was placed from and hence can only take categorical values.
advertiser_id | There are only a few advertisers.
region_id | Can only be one of a set of localities.


In [None]:
category_features = ['day', 'hour','ad_slot_visibility','ad_slot','ad_exchange']


def categorize_features(df,category_features = ['day','hour','ad_slot_visibility','ad_slot','advertiser_id','ad_exchange','region_id']):
    '''
    Cast the given columns to the pandas 'category' dtype.

    Parameters
    ----------
    df : dataframe containing every column named in `category_features`.
    category_features : names of the columns to cast (the default list is
        only read, never modified).

    Returns
    -------
    A new dataframe in which the listed columns are categorical. `df` itself
    is left unmodified.
    '''
    #One astype call with a column -> dtype mapping; returns a new frame.
    return df.astype({feature: 'category' for feature in category_features})


In [None]:
toy_imp = categorize_features(toy_imp)
toy_imp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 15 columns):
 #   Column               Non-Null Count   Dtype   
---  ------               --------------   -----   
 0   click                100000 non-null  bool    
 1   day                  100000 non-null  category
 2   hour                 100000 non-null  category
 3   browser              100000 non-null  category
 4   os                   100000 non-null  category
 5   ad_exchange          100000 non-null  category
 6   ad_slot_width        100000 non-null  int32   
 7   ad_slot_height       100000 non-null  int32   
 8   ad_slot_visibility   100000 non-null  category
 9   ad_slot              100000 non-null  category
 10  ad_slot_floor_price  100000 non-null  int32   
 11  bidding_price        100000 non-null  int32   
 12  paying_price         100000 non-null  int32   
 13  advertiser_id        100000 non-null  category
 14  region_id            100000 non-null  category
dtypes

***
***

## Preprocessing the whole imp_logs

* Here we will apply steps 0-5 to the whole 'imp_logs.csv' and consequently create the train dataset.

In [None]:
#Creating an iterator for imp_logs.
imp_iter = pd.read_csv(path_imp,chunksize=100000)

In [None]:
p = 'TRAIN/imp_chunks/'
def get_data(path_clk,path_imp,mode='Train'):
    '''
    Apply preprocessing steps 0-5 to a whole impressions log.

    The log is read in chunks of 100000 rows; every chunk goes through steps
    0-4, the processed chunks are concatenated once, and the category
    conversion (step 5) is applied to the combined frame.

    Parameters
    ----------
    path_clk : path of the clicks csv, used to build the 'click' column.
    path_imp : path of the impressions csv to preprocess.
    mode : 'Test' removes the 'click' column from the result; any other value
        keeps it.

    Returns
    -------
    The preprocessed dataframe (an empty dataframe when the log has no rows).
    '''
    cols_to_be_dropped=['ipinyou_id','ip_address','city_id','domain','url',
                    'anonymous_url_id','ad_slot_id','creative_id','key_page_url','user_tags']
    clk_df=pd.read_csv(path_clk)
    chunks =pd.read_csv(path_imp, chunksize=100000)
    #Collect the processed chunks and concatenate once at the end; growing a
    #dataframe with concat inside the loop re-copies all previous rows each time.
    processed=[]
    print('Begin preprocessing...')
    for chunk in tqdm(chunks):
        #Step 0
        chunk=create_click_col(chunk,clk_df)
        #Step 1
        chunk=get_date(chunk)
        #Step 2
        chunk=chunk.drop(columns=cols_to_be_dropped)
        #Step 3
        col_creator= user_agent(chunk)
        chunk = col_creator.create_cols()
        #Step 4
        chunk=numerise_features(chunk)
        if mode=='Test':
            chunk=chunk.drop(columns=['click'])
        processed.append(chunk)
    if not processed:
        print('Ufff! Finally, I am done.')
        return pd.DataFrame()
    #Concatenation.
    df = pd.concat(processed,ignore_index=True)
    #Step 5 runs on the combined frame: concatenating categorical columns whose
    #category sets differ between chunks silently falls back to a plain dtype.
    df=categorize_features(df)
    #Same reason: make sure browser/os are categorical over the full data.
    df=df.astype({'browser':'category','os':'category'})
    print('Ufff! Finally, I am done.')
    return df

In [None]:
test_df = get_data(path_clk=path_clk,path_imp='TEST/tst_logs.csv',mode='Test')

Begin preprocessing...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
26it [01:03,  2.42s/it]

Ufff! Finally, I am done.





In [None]:
test_df.head()

Unnamed: 0,day,hour,browser,os,ad_exchange,ad_slot_width,ad_slot_height,ad_slot_visibility,ad_slot,ad_slot_floor_price,bidding_price,paying_price,advertiser_id,region_id
0,3,0,chrome,windows,1,950,90,0,1,0,227,29,3358,94
1,3,0,chrome,windows,1,950,90,0,1,0,227,20,3427,146
2,3,0,ie,windows,2,728,90,2,0,5,238,63,3427,79
3,3,0,chrome,windows,1,300,250,0,5,0,300,293,3386,40
4,3,0,chrome,windows,1,300,250,2,1,0,227,120,3358,183


In [None]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2521627 entries, 0 to 2521626
Data columns (total 14 columns):
 #   Column               Dtype   
---  ------               -----   
 0   day                  int64   
 1   hour                 int64   
 2   browser              category
 3   os                   category
 4   ad_exchange          category
 5   ad_slot_width        int32   
 6   ad_slot_height       int32   
 7   ad_slot_visibility   category
 8   ad_slot              category
 9   ad_slot_floor_price  int32   
 10  bidding_price        int32   
 11  paying_price         int32   
 12  advertiser_id        int64   
 13  region_id            category
dtypes: category(6), int32(5), int64(3)
memory usage: 120.2 MB


In [None]:
#Building the training dataframe; no earlier cell in this notebook creates it.
train_df = get_data(path_clk=path_clk,path_imp=path_imp,mode='Train')
#Checking if there are some null values.
train_df.isnull().sum()

day                    0
hour                   0
click                  0
browser                0
os                     0
ad_exchange            0
ad_slot_width          0
ad_slot_height         0
ad_slot_visibility     0
ad_slot                0
ad_slot_floor_price    0
bidding_price          0
paying_price           0
advertiser_id          0
region_id              0
dtype: int64

In [None]:
#Writing the dataframe as a csv file for further usage.
#index=False: the row index carries no information and would otherwise be
#read back as an extra 'Unnamed: 0' column.
train_df.to_csv('TRAIN/train.csv',index=False)

In [None]:
df = pd.read_csv('TRAIN/train.csv')

In [None]:
df['advertiser_id'] = df['advertiser_id'].astype('category')
df.info()