In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the input directory.
for input_path in (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the three competition CSVs; each path is paired positionally with the
# name it is unpacked into (train, test, sample).
train_data, test_data, sample_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/e-commerce-shoppers-behaviour-understanding/train_data_v2.csv",
        "/kaggle/input/e-commerce-shoppers-behaviour-understanding/test_data_v2.csv",
        "/kaggle/input/e-commerce-shoppers-behaviour-understanding/sample.csv",
    )
)



In [None]:
#Made a new feature from similar types of numerical features by adding them and then dropped them

train_data['HomePage_about']=train_data['HomePage']+train_data['HomePage_Duration']
train_data['LandingPage_about']=train_data['LandingPage']+train_data['LandingPage_Duration']
train_data["ProductDescriptionPage_about"]=train_data["ProductDescriptionPage"]+train_data["ProductDescriptionPage_Duration"]
train_data["GoogleMetric_about"]=train_data['GoogleMetric:Bounce Rates'] +train_data['GoogleMetric:Exit Rates']

train_data = train_data.drop(['HomePage', 'HomePage_Duration', 'LandingPage', 'LandingPage_Duration',
       'ProductDescriptionPage', 'ProductDescriptionPage_Duration',
       'GoogleMetric:Bounce Rates', 'GoogleMetric:Exit Rates'],axis=1)

In [None]:
#Same done on test data

test_data['HomePage_about']=test_data['HomePage']+test_data['HomePage_Duration']
test_data['LandingPage_about']=test_data['LandingPage']+test_data['LandingPage_Duration']
test_data["ProductDescriptionPage_about"]=test_data["ProductDescriptionPage"]+test_data["ProductDescriptionPage_Duration"]
test_data["GoogleMetric_about"]=test_data['GoogleMetric:Bounce Rates'] +test_data['GoogleMetric:Exit Rates']
test_data = test_data.drop(['HomePage', 'HomePage_Duration', 'LandingPage', 'LandingPage_Duration',
       'ProductDescriptionPage', 'ProductDescriptionPage_Duration',
       'GoogleMetric:Bounce Rates', 'GoogleMetric:Exit Rates'],axis=1)

In [None]:
#Filling the missing values of numerical Columns through KNN Imputation

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, PowerTransformer

from sklearn.impute import KNNImputer


# The numeric feature list is derived from the training frame's dtypes.
# NOTE(review): the same list is used to index test_data below, so this assumes
# the label column is not float64/int64 -- confirm against the CSV schema.
numeric_cols = list(train_data.select_dtypes(include=['float64', 'int64']).columns)

# The imputer (10 neighbours) is fitted on the training rows only; the fitted
# imputer is then applied to the test rows.
knn_imputer = KNNImputer(n_neighbors=10, missing_values=np.nan)
train_numeric_filled = knn_imputer.fit_transform(train_data[numeric_cols])
test_numeric_filled = knn_imputer.transform(test_data[numeric_cols])

train_data[numeric_cols] = train_numeric_filled
test_data[numeric_cols] = test_numeric_filled


       

In [None]:
# Categorical (object-dtype) columns: fill missing entries using the
# 'most_frequent' strategy, fitted on the training frame and reused on test.
cat_cols = list(train_data.select_dtypes(include=['object']).columns)

mode_imputer = SimpleImputer(strategy='most_frequent')
train_cat_filled = mode_imputer.fit_transform(train_data[cat_cols])
test_cat_filled = mode_imputer.transform(test_data[cat_cols])

train_data[cat_cols] = train_cat_filled
test_data[cat_cols] = test_cat_filled


# One-hot encode the categorical columns. The encoder is fitted on the training
# frame only and then applied to the test frame; handle_unknown='ignore' keeps
# transform() from raising on categories that were not seen during fitting.
cols_to_encode = cat_cols
encoder = OneHotEncoder(handle_unknown='ignore')
encoded_cols_train = encoder.fit_transform(train_data[cols_to_encode])
encoded_cols_test = encoder.transform(test_data[cols_to_encode])

# Build the encoded frames on the SAME index as the frame they are joined to.
# pd.concat(axis=1) aligns rows by index label, so a fresh RangeIndex here would
# misalign (and NaN-pad) rows if train_data/test_data ever carried a
# non-default index, e.g. after filtering rows upstream.
encoded_col_names = encoder.get_feature_names_out(cols_to_encode)
encoded_cols_train_df = pd.DataFrame(
    encoded_cols_train.toarray(), columns=encoded_col_names, index=train_data.index
)
encoded_cols_test_df = pd.DataFrame(
    encoded_cols_test.toarray(), columns=encoded_col_names, index=test_data.index
)

# Swap the raw categorical columns for their encoded counterparts. Dropping
# first and concatenating second yields the same column order as before.
train_data = pd.concat([train_data.drop(columns=cols_to_encode), encoded_cols_train_df], axis=1)
test_data = pd.concat([test_data.drop(columns=cols_to_encode), encoded_cols_test_df], axis=1)

# Apply a PowerTransformer (default settings) to the numeric columns: it is
# fitted on the training frame and the fitted transformer is reused on test.
columns_to_scale = numeric_cols
scaler = PowerTransformer()

train_scaled = scaler.fit_transform(train_data[columns_to_scale])
test_scaled = scaler.transform(test_data[columns_to_scale])

train_data[columns_to_scale] = train_scaled
test_data[columns_to_scale] = test_scaled


In [None]:
#Applied PCA to the numeric features, keeping the components that capture 99% of the variance

from sklearn.decomposition import PCA
# Replace the numeric columns with their leading principal components, keeping
# the smallest number of components whose cumulative explained-variance ratio
# reaches 0.99.
columns_to_pca = numeric_cols

# First pass: fit with all components, only to read the explained-variance profile.
pca = PCA()
pca.fit(train_data[columns_to_pca])
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# argmax over a boolean array gives the first True position (0-based), hence +1.
n_components = np.argmax(cumulative_variance >= 0.99) + 1

# Second pass: refit with the chosen size on the training frame, then project
# the test frame with that same fitted PCA.
pca = PCA(n_components=n_components)
pca_col_names = ['PCA_' + str(i) for i in range(1, n_components + 1)]

# Keep each projected frame on its source frame's index: pd.concat(axis=1)
# aligns rows by index label, so a fresh RangeIndex would misalign rows if the
# source frame ever carried a non-default index.
train_data_pca = pd.DataFrame(
    pca.fit_transform(train_data[columns_to_pca]),
    columns=pca_col_names,
    index=train_data.index,
)
test_data_pca = pd.DataFrame(
    pca.transform(test_data[columns_to_pca]),
    columns=pca_col_names,
    index=test_data.index,
)

# Swap the original numeric columns for the PCA columns (same resulting column
# order as concatenating first and dropping afterwards).
train_data = pd.concat([train_data.drop(columns=columns_to_pca), train_data_pca], axis=1)
test_data = pd.concat([test_data.drop(columns=columns_to_pca), test_data_pca], axis=1)

In [None]:


# Separate the label column from the feature columns of the training frame.
y_train = train_data['Made_Purchase']
X_train = train_data.drop(columns=['Made_Purchase'])


In [None]:
#Used Voting Classifier with multiple models

from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
from sklearn.ensemble import AdaBoostClassifier


# Shuffle features and labels together with a fixed seed before building the
# cross-validation folds.
X_train, y_train = shuffle(X_train, y_train, random_state=0)

# Soft-voting ensemble of three classifiers.
model = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression(solver='liblinear', penalty='l2', random_state=0)),
        ('kn', KNeighborsClassifier(n_neighbors=50)),
        # Seeded like the logistic regression so that reruns of the notebook
        # produce the same ensemble and the same CV score.
        ('adb', AdaBoostClassifier(random_state=0)),
    ],
    voting='soft',
)

# cross_val_score fits its own clones of `model` on every fold, so fitting
# `model` beforehand is wasted work; the fit on the full training set happens
# in the prediction cell.
# NOTE(review): the imputers, PowerTransformer and PCA were fitted on the full
# training frame before this split, so each validation fold has influenced its
# own preprocessing and the mean score is likely optimistic.
res = cross_val_score(model, X_train, y_train, cv=35)
res.mean()

In [None]:
# Fit the ensemble on the full training set, then predict a label for every
# test row and cast the predictions to booleans.
model.fit(X_train, y_train)
y_pred = model.predict(test_data).astype(bool)
y_pred

In [None]:

# Assemble the submission: a sequential 0-based Id per test row alongside its
# predicted label, written without the DataFrame index.
sub = pd.DataFrame({
    'Id': list(range(len(test_data))),
    'Made_Purchase': y_pred,
})
sub.to_csv("testOutput.csv", index=False)

# Read the written file back from disk.
output = pd.read_csv("testOutput.csv")