In [None]:
#Mounting Google Drive at '/content/drive' in the colab workspace.
#NOTE(review): force_remount = True presumably requests a fresh mount even if
#the drive is already mounted - confirm against the google.colab docs.
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


# Notebook 1: Dataset Preparation
****
**Overview of the notebook**
* The bz2 file of the iPinYou dataset is downloaded from [here](https://figshare.com/articles/dataset/ipinyou_contest_dataset_season2/5732328/1).
* The file is unzipped into '*Datasets/ipinyou*'.
* The logs for the different days are then combined into a single log file by the shell script in the *Running the shell script* section below.
* Finally, Python code converts the combined log `.txt` files into csv files, which are stored in TRAIN and TEST respectively.


In [None]:
#Importing the required libraries.
import os
import pandas as pd
import bz2
import csv
import codecs
import itertools
import time

In [None]:
#Navigating to the main project directory on the mounted drive.
#root_dir is reused further down to build the absolute paths of the log files.
root_dir = r'/content/drive/My Drive/HS4007/Real_Time_Bidding'
#Relative paths used later (./Datasets, TRAIN/, TEST/) resolve against this directory.
os.chdir(root_dir)
#Sanity Check: print the working directory to confirm the chdir took effect.
!pwd

/content/drive/My Drive/HS4007/Real_Time_Bidding


## Running the shell script

In [None]:
%%shell

echo "Hi,I am bash and I'm here to make your life easier!"

echo "Current Working Directory: $PWD"

#A glob that matches nothing must expand to an empty list, not to the literal
#pattern; otherwise the "already unzipped" check below always counts one file
#and the archives are never decompressed on a fresh run.
shopt -s nullglob

ipin=./Datasets/ipinyou
#Path for training and test datasets.
train=./Datasets/ipinyou/train
test=./Datasets/ipinyou/test

#Creating the directories for train and test if they do not exist.
mkdir -p "$train" "$test"

echo "I just  made the train and test folders at the path you mentioned!"

#Copies the compressed logs in to the train/test folders and decompresses them.
__unzip__ (){
    echo "I am going to unzip bz2 files."
    cp "$ipin"/training2nd/imp.*.bz2 "$train"
    cp "$ipin"/training2nd/clk.*.bz2 "$train"
    cp "$ipin"/testing2nd/* "$test"

    #Only hand real archives to bzip2; with no arguments it would read stdin.
    local archives=("$train"/*.bz2 "$test"/*.bz2)
    if (( ${#archives[@]} == 0 ))
    then
        echo "No bz2 files were found under $ipin." >&2
        return 1
    fi
    bzip2 -d "${archives[@]}"
    echo "Done done done!!!"
}

#Concatenates the input files ($2...) in to the target file ($1).
#The data is written to a temporary file first, so a failed run never leaves
#a truncated target behind that later runs would mistake for a finished one.
__combine__ (){
    local target=$1
    shift
    if (( $# == 0 ))
    then
        echo "No log files found to build $target." >&2
        return 1
    fi
    cat "$@" > "$target.tmp" && mv "$target.tmp" "$target"
}

#Unzip only if the train or the test folder has no txt files yet.
train_files=("$train"/*.txt)
test_files=("$test"/*.txt)

if (( ${#train_files[@]} && ${#test_files[@]} ))
then
    echo "Already unzipped."
else
    echo "Unzipping..."
    __unzip__
    echo "Done unzipping."
fi


if [ ! -f "$train/clk_logs.txt" ]
then
    echo "I am combining all the logs of click in to single file."
    __combine__ "$train/clk_logs.txt" "$train"/clk*.txt
else
    echo "Already prepared a single clicks logs file."
fi

if [ ! -f "$train/imp_logs.txt" ]
then
    echo "I am combining all the logs of impressions in to a single file."
    __combine__ "$train/imp_logs.txt" "$train"/imp*.txt
else
    echo "Already prepared a single impressions logs file."
fi

if [ ! -f "$test/raw_test.txt" ]
then
    __combine__ "$test/raw_test.txt" "$test"/*.txt
else
    echo "Test file is also prepared."
fi

Hi,I am bash and I'm here to make your life easier!
Current Working Directory: /content/drive/My Drive/HS4007/Real_Time_Bidding
I just  made the train and test folders at the path you mentioned!
Already unzipped.
Already prepared a single clicks logs file.
Already prepared a single impressions logs file.
Test file is also prepared.




## Creation of csv files and dataframes

In [None]:
class read_dataset(object):
    '''
    A base class for reading the tab-separated log data files and
    converting them in to csv files. It can be used for getting the
    files of all advertisers.
    '''
    def __init__(self,dataset_path,target_file_path):
        '''
        Initializing the instance of the base class.

        Args:
             dataset_path (str)     -> The path in which the training or
                                       testing log (txt) file is located.
             target_file_path (str) -> Path to which the created csv file
                                       should be stored.
        '''
        self.path = dataset_path
        self.tf = target_file_path
        #Column names, in the order in which the fields appear in a log line.
        self.columns = ['bid_id','timestamp','log_type','ipinyou_id','user_agent','ip_address',
        'region_id','city_id','ad_exchange','domain','url','anonymous_url_id',
        'ad_slot_id','ad_slot_width','ad_slot_height','ad_slot_visibility',
        'ad_slot','ad_slot_floor_price','creative_id','bidding_price',
        'paying_price','key_page_url','advertiser_id','user_tags']

    def print_progress(self,line):
        '''
        A method that prints the progress of write/read.
        Currently a no-op placeholder.

        Args:

             line (list) -> The current line being processed.
        '''
        pass

    def reject_row(self,row):
        '''
        A method for filtering the rows.

        Args:

             row (list) -> The row to be verified.

        Returns: (bool) -> If True, reject the row. A row is rejected
                           when it has fewer fields than there are columns.
        '''
        #Derived from the schema instead of a hard-coded 24, so the two
        #cannot drift apart.
        return len(row) < len(self.columns)


    def get_csv(self):
        '''
        A method which converts the txt file into a csv file
        that is readymade for pandas.

        Nothing is done when the target csv file already exists. Otherwise
        the csv is written to '<target>.tmp' and only renamed to the target
        once it is complete, so an interrupted run is not mistaken for a
        finished one the next time this method is called.

        Raises:
             OSError -> If the log file cannot be read or the csv file
                        cannot be written.
        '''
        #Checking the existence of a file.
        if os.path.isfile(self.tf):
            print('Would you look at that, the csv file has already been created.')
            return

        tmp_path = self.tf + '.tmp'
        try:
            #Opening the files.
            #NOTE(review): csv.reader keeps its default quoting here, so a field
            #starting with '"' is parsed as a quoted field - confirm that this is
            #intended for the raw logs.
            with codecs.open(self.path,'r',encoding='utf-8',errors='ignore') as log_file:
                logs_reader = csv.reader(log_file, delimiter='\t')
                print('Let me begin the writing of the csv file...')

                #The folder of the target file may not exist yet.
                target_dir = os.path.dirname(self.tf)
                if target_dir:
                    os.makedirs(target_dir, exist_ok=True)

                #newline='' is what the csv module expects of files handed to a
                #writer; the encoding matches the one used for reading.
                with open(tmp_path,'w',encoding='utf-8',newline='') as new_csv_file:
                    csv_writer = csv.DictWriter(new_csv_file,fieldnames=self.columns)
                    csv_writer.writeheader()

                    for row in logs_reader:
                        # A filter for rows.
                        if self.reject_row(row):
                            continue
                        csv_writer.writerow(self.apply_schema(row))
            #Publish the finished file under its final name.
            os.replace(tmp_path, self.tf)
        except BaseException:
            #Also covers an interrupted cell (KeyboardInterrupt): drop the
            #partial file and let the error propagate.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            raise
        print('Written Sucesfully')


    def apply_schema(self,row):
        '''
        A method that converts a row read from the text file in to
        a meaningful log.

        Args:
             row (list) -> corresponds to elements from a line
                           of the text file.
        Returns: (dict) -> Column name mapped to the field at the same
                           position; fields beyond the known columns
                           are dropped.
        '''
        return dict(zip(self.columns,row))






        

## Training and Test Datasets Prep

In [None]:
#Paths to the combined impressions, clicks and test log files.
#Built with os.path.join so that no doubled '/' ends up in a path.
imp_log_path = os.path.join(root_dir, 'Datasets', 'ipinyou', 'train', 'imp_logs.txt')
clk_log_path = os.path.join(root_dir, 'Datasets', 'ipinyou', 'train', 'clk_logs.txt')
tst_log_path = os.path.join(root_dir, 'Datasets', 'ipinyou', 'test', 'raw_test.txt')

#Arguments for the class instances. The target paths are relative to the
#current working directory.
imp_kargs = {'dataset_path': imp_log_path, 'target_file_path': 'TRAIN/imp_logs.csv'}
clk_kargs = {'dataset_path': clk_log_path, 'target_file_path': 'TRAIN/clk_logs.csv'}
tst_kargs = {'dataset_path': tst_log_path, 'target_file_path': 'TEST/tst_logs.csv'}

logs = ['imp','clk']
#One instance of the class per log file.
imp_reader = read_dataset(**imp_kargs)
clk_reader = read_dataset(**clk_kargs)
tst_reader = read_dataset(**tst_kargs)
readers = [imp_reader, clk_reader, tst_reader]

#Creating the csv files.
for reader in readers:
    reader.get_csv()



Would you look at that, the csv file has already been created.
Would you look at that, the csv file has already been created.
Would you look at that, the csv file has already been created.
