In [1]:
import pandas as pd
import numpy as np
import json 
import re
from pandas import json_normalize
from datetime import datetime
from sklearn.preprocessing import LabelEncoder


In [177]:
# Read the raw training and test splits from the local data directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data_given/train.csv', 'data_given/test.csv')
)



In [6]:
 ## Stages of Data Preprocessing and Data transformation ##


#1.Function for extracting features from date column

def date_process(df):
    """Add calendar features derived from the ``date`` and ``visitStartTime`` columns.

    ``date`` is parsed with the ``%Y%m%d`` format and stored back as a pandas
    datetime column; ``weekday``, ``day``, ``month`` and ``year`` are read from
    it.  ``visitHour`` is the hour returned by ``datetime.fromtimestamp`` for
    each ``visitStartTime`` value, i.e. it is expressed in the local time zone
    of the machine running the notebook.

    The frame is modified in place and the same object is returned.
    """
    parsed_dates = pd.to_datetime(df["date"], format="%Y%m%d")
    df["date"] = parsed_dates

    # Calendar parts, added in the same column order as before.
    df["weekday"] = parsed_dates.dt.weekday
    df["day"] = parsed_dates.dt.day
    df["month"] = parsed_dates.dt.month
    df["year"] = parsed_dates.dt.year

    # Hour of the visit, taken from the epoch timestamp.
    visit_hours = df['visitStartTime'].apply(lambda stamp: datetime.fromtimestamp(stamp).hour)
    df['visitHour'] = visit_hours.astype(int)

    return df
# Derive the calendar features on both splits.
train_df, test_df = date_process(train_df), date_process(test_df)

#2.Function to detect which columns hold JSON-encoded strings (judged from the first row of each column)
def column_validator(df):
    """Return the names of the columns that look JSON-encoded.

    A column qualifies when the value in its first row is a string matching
    the pattern ``^{.*}$``.  Only the first row is inspected; later rows are
    not checked.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are examined.

    Returns
    -------
    list
        Matching column names in column order.  Empty when nothing matches or
        when ``df`` has no rows.
    """
    # With no rows there is no first value to inspect; indexing would raise.
    if len(df) == 0:
        return []

    json_like = re.compile(r"^{.*}$")
    cols = []
    for col in df.columns:
        first_value = df[col].values[0]
        if isinstance(first_value, str) and json_like.search(first_value):
            cols.append(col)
    return cols  # Returns the json columns

#2.1 Function for flattening the JSON columns and merging the flattened fields back into the original dataset
def json_to_df(df,json_columns):
    """Replace each JSON-string column with its flattened fields.

    Every value of each listed column is parsed with ``json.loads`` and the
    resulting records are flattened with ``json_normalize`` (nested keys are
    joined with ``.``).  The original column is dropped and the flattened
    columns are merged in row by row.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    json_columns : list
        Names of the columns holding JSON strings.

    Returns
    -------
    pandas.DataFrame
        Frame with the JSON columns expanded, carrying ``df``'s original index.
        ``df`` itself is returned when ``json_columns`` is empty.
    """
    if not json_columns:
        return df

    # json_normalize always yields a fresh 0..n-1 index.  Merge on row position
    # and restore the caller's index afterwards, so a frame with a non-default
    # index keeps all of its rows.
    original_index = df.index
    result = df.reset_index(drop=True)
    for column in json_columns:
        flattened = json_normalize([json.loads(raw) for raw in result[column]])
        # drop the flattened column from the original dataset
        result = result.drop(column, axis=1).merge(flattened, right_index=True, left_index=True)
    result.index = original_index
    return result ## returns new dataframe with flattened json columns


# Detect the JSON-encoded columns independently for each split.
train_json_columns = column_validator(train_df)
test_json_columns = column_validator(test_df)

# column_validator always returns a list (possibly empty), never None, so the
# meaningful check is whether any JSON column was found at all.
if train_json_columns:
    train_df = json_to_df(train_df, train_json_columns)

if test_json_columns:
    test_df = json_to_df(test_df, test_json_columns)


#3.Dropping columns which have more than 50% of null values and not contributing to the target variable

def remove_nan_cols(df, threshold=0.5):
    """Drop, in place, every column whose null count exceeds a share of the rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune; it is modified in place.
    threshold : float, default 0.5
        A column is dropped when its number of nulls is strictly greater than
        ``threshold * len(df)``.

    Returns
    -------
    None
    """
    null_counts = df.isnull().sum()
    sparse_cols = null_counts.index[null_counts > threshold * len(df)]
    df.drop(columns=sparse_cols, inplace=True)
    return None
# Prune the mostly-null columns from each split in place.
for _frame in (train_df, test_df):
    remove_nan_cols(_frame)
 

#4.Imputation of null values 
def impute_na(df):
    """Replace every null value in ``df`` with 0, in place.

    The fill is done on the frame itself rather than column by column:
    ``df[col].fillna(..., inplace=True)`` operates on a column selection and
    is not guaranteed to write back to ``df`` (it does not under pandas
    Copy-on-Write).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill; it is modified in place.

    Returns
    -------
    None
    """
    df.fillna(0, inplace=True)
    return None

impute_na(train_df)
impute_na(test_df)

# NOTE(review): impute_na has already replaced every NaN with 0, so the boolean
# fills below only matter if that step changes; confirm the intended order.
# Fills are assigned back to the column and the "(not set)" rewrite uses .loc:
# chained indexing (df[mask]['city'] = ...) writes to a temporary copy and
# leaves the frame unchanged.
train_df['adwordsClickInfo.isVideoAd'] = train_df['adwordsClickInfo.isVideoAd'].fillna(True) # filling boolean with True
train_df.loc[train_df['city'] == "(not set)", 'city'] = np.nan
train_df['city'] = train_df['city'].fillna("NaN")


test_df['isTrueDirect'] = test_df['isTrueDirect'].fillna(False)
test_df['adwordsClickInfo.isVideoAd'] = test_df['adwordsClickInfo.isVideoAd'].fillna(True) # filling boolean with True
test_df.loc[test_df['city'] == "(not set)", 'city'] = np.nan
test_df['city'] = test_df['city'].fillna("NaN")

#5.Changing datatypes from object to desired ones
def data_type_convert(df):
    """Cast to ``int`` every column whose first value is a string of digits.

    Only the first row decides whether a column is converted; the whole column
    is then cast with ``astype(int)``, which raises if a later value cannot be
    converted.  The name of each converted column is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert; it is modified in place.

    Returns
    -------
    None
    """
    # With no rows there is no first value to inspect.
    if len(df) == 0:
        return

    for col in df.columns:
        # Positional access: df[col][0] is a label lookup and fails whenever
        # the index has no label 0.
        first_value = df[col].iloc[0]
        if isinstance(first_value, str) and first_value.isdigit():
            print(col)
            df[col] = df[col].astype(int)

# Cast the digit-string columns of each split in place.
for _frame in (train_df, test_df):
    data_type_convert(_frame)

#6. Removing columns with constant values or with zero standard deviation
def remove_zero_std_cols(df):
    """Drop the columns that hold a single distinct value.

    A column is removed when ``nunique()`` reports exactly one distinct value.
    The frame is modified in place and the same object is returned.
    """
    constant_cols = [name for name in df.columns if df[name].nunique() == 1]
    df.drop(constant_cols, axis=1, inplace=True)
    return df

train_df, test_df = remove_zero_std_cols(train_df), remove_zero_std_cols(test_df)


#7. sessionId is removed because it is a unique id and doesn't contribute to the target variable;
#   visitStartTime is removed because it was already extracted into visitHour
train_df.drop(['sessionId', 'visitStartTime'], axis=1, inplace=True)
test_df.drop(['sessionId', 'visitStartTime'], axis=1, inplace=True)


#8 Function to gather categorical columns in the dataset
def categorical_cols(df):
    """Return the names of the columns treated as categorical.

    A column counts as categorical when the value in its first row is a string
    or a boolean (either a NumPy boolean or a plain Python ``bool``, which is
    what an object-dtype column yields).  Only the first row is inspected.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are examined.

    Returns
    -------
    list
        Matching column names in column order.  Empty when ``df`` has no rows.
    """
    # With no rows there is no first value to inspect.
    if len(df) == 0:
        return []

    cat_cols = []
    for col in df.columns:
        # Positional access: df[col][0] is a label lookup and fails whenever
        # the index has no label 0.
        first_value = df[col].iloc[0]
        if isinstance(first_value, (str, bool, np.bool_)):
            cat_cols.append(col)
    return cat_cols # returns categorical columns in the dataset
    
# Categorical column names, gathered separately for the train and test splits.
train_cat_cols, test_cat_cols = categorical_cols(train_df), categorical_cols(test_df)

#8.1 Function for encoding categorical values to numerical values

def label_encoding(cat_cols,df):
    """Replace each listed column with integer codes from a ``LabelEncoder``.

    Each column is converted to strings and encoded by an encoder fitted on
    that column alone.  The frame is modified in place and also returned.
    """
    for name in cat_cols:
        as_text = df[name].astype(str)
        # fit_transform refits from scratch, so one encoder per column gives
        # the same codes as reusing a single instance.
        df[name] = LabelEncoder().fit_transform(as_text)
    return df

# NOTE(review): each call fits its own encoders, so the same category can get a
# different integer code in train and in test - confirm this is acceptable.
for _cols, _frame in ((train_cat_cols, train_df), (test_cat_cols, test_df)):
    label_encoding(_cols, _frame)

#9. Imputing pageviews column with KNNImputer in both train and test data

from sklearn.impute import KNNImputer

# NOTE(review): impute_na has already replaced every NaN with 0, so there may be
# nothing left for this step to impute - confirm the intended order.
imputer = KNNImputer()

# Fit on the training split only.
imputer_train_df = imputer.fit_transform(train_df[['pageviews']]) ## Imputing pageviews with KNNimputer in training data
train_df['pageviews'] = imputer_train_df

# Reuse the training fit: re-fitting on the test split would impute test rows
# from test statistics and make the two splits inconsistent.
imputer_test_df = imputer.transform(test_df[['pageviews']]) ## imputing pageviews with KNNimputer in test data
test_df['pageviews'] = imputer_test_df



In [None]:
#10. Model Training begins here 

# Chronological hold-out split.  The cut-off is a pandas Timestamp so that
# (a) it compares directly against the datetime64 ``date`` column, and
# (b) no ``import datetime`` is needed here - that import would rebind the
#     ``datetime`` class imported at the top of the notebook, which
#     date_process relies on, and break it on re-run.
split_date = pd.Timestamp(2017, 5, 31)

X = train_df[train_df['date'] <= split_date] ## train data for the months till May 31 2017
val_X = train_df[train_df['date'] > split_date] ## validation data for the months of june 2017 and july 2017

X = X.drop(['date'], axis = 1)
# Drop from val_X, not train_df: rebuilding the validation frame from the full
# training frame would put every training row into the validation set.
val_X = val_X.drop(['date'], axis = 1)

# The target is modelled on the log1p scale.
y = pd.DataFrame(np.log1p(X["transactionRevenue"].values))
val_y = pd.DataFrame(np.log1p(val_X["transactionRevenue"].values))

x = X.drop(['transactionRevenue'], axis = 1)
val_x = val_X.drop(['transactionRevenue'], axis = 1)

In [None]:
%%time
import xgboost as xg
from sklearn.metrics import mean_squared_error as MSE, r2_score

# Gradient-boosted regressor trained on the log1p-transformed revenue target.
model = xg.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=1000,
    verbosity=1,
    learning_rate=0.5,
    max_depth=8,
)
model.fit(x, y)

In [None]:
#11. Metrics 
def _print_regression_report(label, y_true, y_pred):
    """Print RMSE and R2 for one data split and return them as ``(rmse, r2)``.

    ``label`` is inserted into the header line
    ("The model performance for <label> set").
    """
    rmse = (np.sqrt(MSE(y_true, y_pred)))   # root mean square error of the model
    r2 = r2_score(y_true, y_pred)           # r-squared score of the model

    print("The model performance for {} set".format(label))
    print("--------------------------------------")
    print('RMSE is {}'.format(rmse))
    print('R2 score is {}'.format(r2))
    return rmse, r2


# model evaluation for training set
# (the former bare model.score(x, y) call was discarded mid-cell and only cost
# an extra prediction pass; the R2 it computes is reported below)
y_train_predict = model.predict(x)
rmse, r2 = _print_regression_report("training", y, y_train_predict)
print("\n")

# model evaluation for testing set
# NOTE(review): this "testing set" is the held-out validation split (val_x / val_y).
y_test_predict = model.predict(val_x)
rmse, r2 = _print_regression_report("testing", val_y, y_test_predict)

In [None]:
#12. Creating Pickle file and generating the output with pickle file
import pickle

filename = 'xgboost_model.pickle'

# Context managers make sure the file handles are flushed and closed.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# pickle.load can execute arbitrary code; only load files this notebook wrote.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Feed the model the same feature columns, in the same order, that it was
# trained on: test_df still carries 'date', which was dropped from the training
# features.  Columns absent from test_df come through as NaN.
# NOTE(review): confirm NaN is the desired fill for any missing test columns.
test_features = test_df.reindex(columns=x.columns)

# NOTE(review): the model was fit on log1p(transactionRevenue), so these
# predictions are on the log1p scale - confirm whether np.expm1 is wanted.
pred_transaction_rev = loaded_model.predict(test_features) # Predicting with Test data
output_df = pd.DataFrame(pred_transaction_rev) # converting predicted values into Dataframe
output_df.to_csv('predicted_file.csv') # Saving the predicted values into csv file