In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for name in names_in_folder:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Jovian commit essentials
# import jovian
# jovian.set_project('course-project-machine-learning-local')

# Data analysis 
import pandas as pd
pd.options.mode.chained_assignment = None 
import numpy as np

# ML models
import xgboost as xgb
from xgboost import plot_tree
from xgboost import plot_importance
import lightgbm as lgb

# Models performance evaluation
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn import metrics

# File saving
import joblib

# Plotting
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
%matplotlib inline
import seaborn as sns
color = sns.color_palette()

# Miscellaneous
import warnings
from tabulate import tabulate
import gc

In [None]:
import os
# import opendatasets as od
# Show up to 120 columns and 120 rows of a frame before pandas truncates the display.
pd.options.display.max_columns = 120
pd.options.display.max_rows = 120

In [None]:
# Per the original note the zip archives hold more than one file, so each archive is
# opened explicitly and the named CSV member is read with read_csv, converting the
# numerical columns to compact dtypes while loading.
from zipfile import ZipFile

path_dataset = '/kaggle/input/instacart-market-basket-analysis'


def _read_zipped_csv(csv_name, dtype=None):
    """Read member `csv_name` of `<path_dataset>/<csv_name>.zip` into a DataFrame.

    Parameters
    ----------
    csv_name : str
        Name of the CSV member; the archive is expected at `csv_name + '.zip'`.
    dtype : dict, optional
        Column -> dtype mapping forwarded to `pd.read_csv`.

    Both the archive and the member stream are closed when parsing finishes, so no
    file handle stays open afterwards.
    """
    with ZipFile(os.path.join(path_dataset, csv_name + '.zip')) as archive:
        with archive.open(csv_name) as csv_stream:
            return pd.read_csv(csv_stream, dtype=dtype)


orders_df = _read_zipped_csv('orders.csv', dtype={
        'order_id': np.int32,
        'user_id': np.int32,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32})

products_df = _read_zipped_csv('products.csv', dtype={
        'product_id': np.uint16,
        'order_id': np.int32,
        'aisle_id': np.uint8,
        'department_id': np.uint8})

# The train and prior order-line files are read with the same column dtypes.
order_lines_dtype = {
        'order_id': np.int32,
        'product_id': np.uint16,
        'add_to_cart_order': np.int16,
        'reordered': np.int8}

order_products__train_df = _read_zipped_csv('order_products__train.csv', dtype=order_lines_dtype)
order_products__prior_df = _read_zipped_csv('order_products__prior.csv', dtype=order_lines_dtype)

aisles_df = _read_zipped_csv('aisles.csv')
departments_df = _read_zipped_csv('departments.csv')
sample_submission = _read_zipped_csv('sample_submission.csv')



In [None]:
# Preview the first five rows of the orders table
orders_df.head(n=5)

In [None]:
# A cell only renders its last bare expression, so display() (provided by the
# IPython kernel) is called explicitly to show the preview of all three tables.
display(products_df.head(), aisles_df.head(), departments_df.head())

In [None]:
# Attach the aisle and department rows to every product, then drop the two id
# columns that were only needed for the joins.
products_df = (
    products_df
    .merge(aisles_df)
    .merge(departments_df)
    .drop(columns=['aisle_id', 'department_id'])
)
products_df

In [None]:
# A cell only renders its last bare expression, so every check is displayed
# explicitly (display() is provided by the IPython kernel); head() keeps the
# previews of the order-line tables short.
display(order_products__train_df.head())
display(order_products__prior_df.head())
display(orders_df['eval_set'].unique())
# Checking the sample submission file
display(sample_submission.head())

In [None]:
# Checking the total number of individual customers present in orders_df.
def get_unique_counts(x):
    """Return how many distinct values `x` contains, as determined by `np.unique`."""
    distinct_values = np.unique(x)
    return distinct_values.size

# Distinct user_id count within each eval_set group
print(
    orders_df
    .groupby('eval_set')['user_id']
    .aggregate(get_unique_counts)
)

In [None]:
# Checking the occurrences of the maximum order numbers.
# The "max" alias selects the built-in groupby reduction; handing np.max to
# aggregate() is deprecated in recent pandas releases and emits a FutureWarning.
order_num = orders_df.groupby('user_id')['order_number'].aggregate('max').reset_index()
# How many users share each maximum order number
order_num = order_num['order_number'].value_counts()

plt.figure(figsize=(18,10))
sns.barplot(x=order_num.index, y=order_num.values, alpha=0.8, color='green')
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('Maximum order number', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Order count for each value of order_dow
orders_dow = orders_df['order_dow']

fig, ax = plt.subplots(figsize=(12, 10))
sns.countplot(x=orders_dow, data=orders_df, alpha=0.8, color='red', ax=ax)
ax.set_ylabel('Number of orders', fontsize=12)
ax.set_xlabel('Day of the week', fontsize=12)
plt.setp(ax.get_xticklabels(), rotation='vertical')
plt.show()

In [None]:
# Order count for each value of order_hour_of_day
orders_hotd = orders_df['order_hour_of_day']

fig, ax = plt.subplots(figsize=(18, 10))
sns.countplot(x=orders_hotd, data=orders_df, alpha=0.9, color='purple', ax=ax)
ax.set_ylabel('Number of orders', fontsize=12)
ax.set_xlabel('Hour of the day', fontsize=12)
plt.setp(ax.get_xticklabels(), rotation='vertical')
plt.show()

In [None]:
# Order count for each value of days_since_prior_order
orders_dspo = orders_df['days_since_prior_order']

fig, ax = plt.subplots(figsize=(18, 10))
sns.countplot(x=orders_dspo, data=orders_df, alpha=0.9, color='orange', ax=ax)
ax.set_ylabel('Number of orders', fontsize=12)
ax.set_xlabel('Days since prior order', fontsize=12)
plt.setp(ax.get_xticklabels(), rotation='vertical')
plt.show()

In [None]:
# Highest add_to_cart_order value per train order, i.e. how many items the order holds.
# The "max" alias replaces np.max, whose use in aggregate() is deprecated in recent pandas.
product_amount = order_products__train_df.groupby('order_id')['add_to_cart_order'].aggregate('max').reset_index()
# How many orders have each basket size
product_count = product_amount.add_to_cart_order.value_counts()

# Take the peak from the data so the annotation arrow always points at the tallest bar
# instead of at fixed coordinates.
most_common_items = product_count.idxmax()
most_common_count = product_count.max()

fig, ax = plt.subplots(figsize=(18,10))

ax.bar(x=product_count.index, height=product_count.values, alpha=0.9, color = 'red')
ax.set_ylabel('Number of Occurrences', fontsize=12)
ax.set_xlabel('Number of Items', fontsize=12)
plt.xticks(rotation='vertical')
ax.annotate('Most common amount of items per order: {:.0f}'.format(most_common_items),
            xy=(most_common_items, most_common_count),
            # label placed a little to the right of and below the peak
            xytext=(most_common_items + 5, most_common_count * 0.93),
            fontsize=12, arrowprops=dict(facecolor='black', shrink=0.05))
plt.show()

In [None]:
# Count the prior order lines of every product, largest first, then attach the
# product names through the shared product_id column.
bestsellers = (
    order_products__prior_df
    .groupby('product_id')['add_to_cart_order']
    .count()
    .sort_values(ascending=False)
    .reset_index()
    .merge(products_df[['product_id', 'product_name']])
)
# Top 10
bestsellers.head(n=10)

In [None]:
# Same ranking for the train order lines: count per product, largest first,
# then join the product names on product_id.
bestsellers_train = (
    order_products__train_df
    .groupby('product_id')['add_to_cart_order']
    .count()
    .sort_values(ascending=False)
    .reset_index()
    .merge(products_df[['product_id', 'product_name']], on='product_id')
)
# Top 10
bestsellers_train.head(n=10)

In [None]:
# Preview the first five rows of the products table
products_df.head(n=5)

In [None]:
# merging products related dataframes with orders
# NOTE(review): an earlier cell already merges aisles/departments into products_df and
# drops the id columns, so this looks like a repeat of that step - confirm it is needed.
products = (
    products_df
    .merge(aisles_df)
    .merge(departments_df)
    .drop(columns=["aisle_id", "department_id"])
)
# train order lines with the user_id of the order they belong to
ordert = order_products__train_df.merge(orders_df[["order_id", "user_id"]])
# prior order lines joined with the orders table
orders_products = order_products__prior_df.merge(orders_df)

In [None]:
# getting products ordered first and second times to calculate probability
prd2 = orders_products.sort_values(['user_id', 'order_number', 'product_id'], ascending=True)
# Number each user's purchases of a product on the sorted frame itself, so that
# product_time follows order_number (1 = first purchase, 2 = second purchase, ...).
prd2['product_time'] = prd2.groupby(['user_id', 'product_id']).cumcount() + 1
# per product: rows that are a user's first / second purchase of it
sub1 = prd2[prd2['product_time'] == 1].groupby('product_id').size().to_frame('prod_first_orders')
sub2 = prd2[prd2['product_time'] == 2].groupby('product_id').size().to_frame('prod_second_orders')
# per product: total order lines and the sum of the reordered flag
sub1['prod_orders'] = prd2.groupby('product_id')['product_id'].size()
sub1['prod_reorders'] = prd2.groupby('product_id')['reordered'].sum()
# inner merge on product_id: products without any second purchase are not kept
sub2 = sub2.reset_index().merge(sub1.reset_index())
# calculating probability and ratio
sub2['prod_reorder_probability'] = sub2['prod_second_orders']/sub2['prod_first_orders']
sub2['prod_reorder_ratio'] = sub2['prod_reorders']/sub2['prod_orders']

# product-level feature table used by the modelling cells
prd = sub2[['product_id', 'prod_orders','prod_reorder_probability', 'prod_reorder_ratio']]

#release memory
del sub1, sub2, prd2
gc.collect()

In [None]:
# user-level features from the prior orders
# (the gap column is duplicated so the agg dict can take both its sum and its mean)
orders_df["user_mean_days_since_prior"] = orders_df.days_since_prior_order
users = (
    orders_df[orders_df.eval_set == "prior"]
    .groupby("user_id")
    .agg({
        "order_number": "max",
        "days_since_prior_order": "sum",
        "user_mean_days_since_prior": "mean",
    })
    .rename(columns={"order_number": "user_orders", "days_since_prior_order": "user_period"})
    .reset_index()
)
# per user: number of order lines, sum of the reordered flag, number of lines that
# belong to an order after the first one, and number of distinct products
us = (
    orders_products
    .groupby("user_id")
    .agg({
        "order_id": "count",
        "reordered": "sum",
        "order_number": lambda rows: rows[rows > 1].shape[0],
        "product_id": "nunique",
    })
    .rename(columns={"order_id": "user_total_products", "product_id": "user_distinct_products"})
)
# reorder ratio = reordered lines / lines from orders after the first
us["user_reorder_ratio"] = us["reordered"] / us["order_number"]
us = us.drop(columns=["reordered", "order_number"]).reset_index()
# combine both user tables and derive the average basket size
users = users.merge(us)
users["user_average_basket"] = users["user_total_products"] / users["user_orders"]
# attach each user's non-prior (train/test) order and the gap that precedes it
us = (
    orders_df.loc[
        orders_df.eval_set != "prior",
        ["user_id", "order_id", "eval_set", "days_since_prior_order"],
    ]
    .rename(columns={"days_since_prior_order": "time_since_last_order"})
)
users = users.merge(us)

In [None]:
# data final preparation for modelling
# Named aggregation derives the three order_number statistics directly, so no helper
# columns have to be written into orders_products first and the cell can be re-run
# without changing its input.
data = (
    orders_products
    .groupby(["user_id", "product_id"])
    .agg(
        up_orders=("order_number", "count"),
        up_first_order=("order_number", "min"),
        up_last_order=("order_number", "max"),
        up_average_cart_position=("add_to_cart_order", "mean"),
    )
    .reset_index()
)
# merging products and users information
data = data.merge(prd).merge(users)
# calculating the user + product order rate, orders since last order and order rate since last order
data["up_order_rate"] = data.up_orders / data.user_orders
data["up_orders_since_last_order"] = data.user_orders - data.up_last_order
data["up_order_rate_since_first_order"] = data.up_orders / (data.user_orders - data.up_first_order + 1)
# merging train info to the data
data = data.merge(ordert[["user_id", "product_id", "reordered"]], how="left", on=["user_id", "product_id"])
# user/product pairs with no matching train order line get reordered = 0
data["reordered"] = data["reordered"].fillna(0)
data.head()

**XGBoost**

In [None]:
# split test and train data and features to be discarded
train = data[data.eval_set == "train"]
test = data[data.eval_set == "test"]
# Shuffle the training rows; the fixed seed means every run hands XGBoost the rows
# in the same order.
subtrain = train.sample(frac=1.0, random_state=100)
# identifier columns and the label itself are not model features
discard_fields = ["eval_set", "user_id", "product_id", "order_id", "reordered"]
X = xgb.DMatrix(subtrain.drop(discard_fields, axis=1), label=subtrain.reordered)

In [None]:
# set model parameters and train
params = {'max_depth':10,
          'eta':0.02,
          'colsample_bytree':0.4,
          'subsample':0.75,
          # the deprecated 'silent' flag is superseded by 'verbosity'; 0 = silent
          'verbosity':0,
          # use the cores that are actually available instead of a fixed thread count
          'nthread':os.cpu_count() or 1,
          'eval_metric':'logloss',
          # the learning task is selected with 'objective'; the former 'binary' key is not
          # an XGBoost parameter, so the booster was trained with its default objective
          'objective':'binary:logistic',
          'tree_method':'hist'}

#nrounds = 1000 / early_stopping_rounds=40
#lowered to 10 for project evaluation
nrounds = 10

# verbose_eval only reports when an evaluation set is supplied, so the training
# matrix is passed as one and its logloss is printed every 5 rounds
xgb_model = xgb.train(params, X, nrounds, evals=[(X, 'train')], verbose_eval=5)

In [None]:
# Set the default figure size to 14x7 inches and draw the model's feature-importance chart
rcParams["figure.figsize"] = (14, 7)
plot_importance(xgb_model)
plt.show()

In [None]:
# Applying model with a 0.21 probability threshold
probability_threshold = 0.21
X = xgb.DMatrix(test.drop(discard_fields, axis=1))
# predict reordered products
predicted_probability = xgb_model.predict(X)
# assign() builds a new frame, so nothing is written into a slice of `data` and
# product_id can change from an integer to a string column without an in-place cast
test = test.assign(
    reordered=np.where(predicted_probability > probability_threshold, 1, 0),
    product_id=test.product_id.astype(str),
)
# join reordered products for submission: one space-separated string per order
submission = (
    test[test.reordered == 1]
    .groupby("order_id")
    .agg({"product_id": lambda x: ' '.join(x)})
    .rename(columns={"product_id": "products"})
    .reset_index()
)
# get only the results for the test set: the left join keeps exactly the test orders,
# and an order without any predicted product gets a missing value
test_set = (
    orders_df.loc[orders_df.eval_set == "test", ["order_id"]]
    .merge(submission, how="left", on="order_id")
    .sort_values("order_id")
)
# save file for submission (missing product lists are written as "None")
test_set.to_csv("sample_021_xgb2.csv", columns=["order_id", "products"], index=False, na_rep="None")
test_set.head(10)

In [None]:
# One-hot encode the non-numeric columns (first level of each one dropped), then preview
data_2 = pd.get_dummies(data, drop_first=True)
data_2.head(n=10)

In [None]:
def reduce_mem_usage(df, allow_float16=True):
    """Downcast the columns of `df` in place to reduce memory usage.

    Signed integer columns are cast to the smallest signed integer type that holds
    their min and max, unsigned integer columns to the smallest unsigned type.
    Float columns are cast to float16 / float32 when their range fits, otherwise
    float64. Object columns become 'category'. Bool, category and every other
    dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are replaced in place.
    allow_float16 : bool, default True
        When False, float columns are never stored as float16. Only the value
        range is checked, not precision, so float16 can round values.

    Returns
    -------
    pandas.DataFrame
        The same `df` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # category and other pandas extension dtypes are kept as they are
        if not isinstance(col_type, np.dtype):
            continue
        if col_type == object:
            df[col] = df[col].astype('category')
            continue
        # bool is already one byte per value; datetimes etc. cannot be range-checked
        if col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            # stay within the signed / unsigned family so no value changes meaning;
            # bounds are inclusive because the limits themselves are representable
            candidates = unsigned_types if col_type.kind == 'u' else signed_types
            for int_type in candidates:
                limits = np.iinfo(int_type)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(int_type)
                    break
        else:
            for float_type in float_types:
                limits = np.finfo(float_type)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(float_type)
                    break
            else:
                # range too wide (or min/max are NaN): keep full precision
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # guard the percentage against a frame that reports zero bytes
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [None]:
# reduce_mem_usage casts the columns of the frame it is given in place and returns that
# same frame, so `data` and `data_2` refer to one and the same DataFrame from here on.
data = reduce_mem_usage(data_2)

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 random split of the encoded frame with a fixed seed; print both shapes
train, test = train_test_split(data, test_size=0.3, random_state=100)
for split_frame in (train, test):
    print(split_frame.shape)

In [None]:
# Print the Python type of the value stored under label 0 in every column
for col_name, column in data_2.items():
    print(type(column[0]))

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors

# Brute-force nearest-neighbour search over the training rows using cosine distance
knn_settings = {'metric': 'cosine', 'algorithm': 'brute'}
model_knn = NearestNeighbors(**knn_settings).fit(train)
model_knn

In [None]:
# Look up the nearest neighbours of a few randomly chosen training rows.
# The earlier version queried every training row one at a time and indexed the
# 6-element neighbour array with the row counter, which raises IndexError from the
# seventh row on; a small seeded sample answered in one batched call avoids both.
n_neighbors = 6
n_queries = 5

rng = np.random.default_rng(100)
# positional row numbers of the query rows, drawn without repetition
query_positions = rng.choice(train.shape[0], size=min(n_queries, train.shape[0]), replace=False)

# distances / indices hold one row per query and n_neighbors columns
distances, indices = model_knn.kneighbors(train.iloc[query_positions].values, n_neighbors=n_neighbors)

for query_position, neighbour_positions in zip(query_positions, indices):
    # kneighbors returns positions within the fitted data; map them back to index labels
    print('Recommendations for {0}:\n'.format(train.index[query_position]))
    for j, neighbour_position in enumerate(neighbour_positions):
        print('{0}: {1}'.format(j, train.index[neighbour_position]))