In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        input_path = os.path.join(folder, name)
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import zipfile
from subprocess import check_output

# Unpack every zip archive of the Instacart dataset into the working directory.
for dirname, _, filenames in os.walk('/kaggle/input/instacart-market-basket-analysis/'):
    for filename in filenames:
        archive_path = os.path.join(dirname, filename)
        # Skip anything that is not a zip archive instead of failing with BadZipFile.
        if not zipfile.is_zipfile(archive_path):
            continue
        # The context manager closes the archive even when extraction fails part-way.
        with zipfile.ZipFile(archive_path, mode='r') as archive:
            archive.extractall(path="/kaggle/working")

# Show what ended up in the working directory.
print(check_output(["ls", "../working"]).decode("utf8"))

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` to smaller dtypes to reduce memory usage.

    The frame is modified in place and the same object is returned.

    Rules applied per column:
      * object / string / category columns are converted to ``category``;
      * signed integers get the narrowest signed integer type that holds
        both the column minimum and maximum (bounds inclusive);
      * unsigned integers get the narrowest unsigned integer type;
      * floats become float16 only when every value survives the round trip
        unchanged, otherwise float32 when the range fits, otherwise float64;
      * every other dtype (bool, datetime, nullable extension types, ...) is
        left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; mutated in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        # Text-like columns are stored as categoricals ('string'/'str' cover
        # the pandas string extension dtypes).
        if col_type == object or col_type.name in ('category', 'string', 'str'):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are downcast; bool, datetime
        # and extension dtypes would be corrupted or raise below.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                bounds = np.iinfo(candidate)
                # Inclusive comparison: the boundary values themselves fit.
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
            # An empty column has NaN min/max, matches nothing and is left as is.
        else:
            half = np.finfo(np.float16)
            single = np.finfo(np.float32)
            target = np.float64
            if half.min <= c_min and c_max <= half.max:
                # float16 keeps only about 3 significant digits, so use it
                # solely when the values round-trip exactly (NaNs compare
                # equal in Series.equals); otherwise fall back to float32.
                as_half = df[col].astype(np.float16)
                if as_half.astype(col_type).equals(df[col]):
                    target = np.float16
                else:
                    target = np.float32
            elif single.min <= c_min and c_max <= single.max:
                target = np.float32
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
import numpy as np

In [None]:
# Read each extracted CSV and shrink its dtypes straight away.
orders_df = pd.read_csv('../working/orders.csv').pipe(reduce_mem_usage)
order_products__prior_df = pd.read_csv('../working/order_products__prior.csv').pipe(reduce_mem_usage)
order_products__train_df = pd.read_csv('../working/order_products__train.csv').pipe(reduce_mem_usage)
products_df = pd.read_csv('../working/products.csv').pipe(reduce_mem_usage)
department_df = pd.read_csv('../working/departments.csv').pipe(reduce_mem_usage)
aisles_df = pd.read_csv('../working/aisles.csv').pipe(reduce_mem_usage)

In [None]:
# Names of the globals that should survive a call to my_reset().
variables = [
    'variables',
    'orders_df',
    'order_products__prior_df',
    'order_products__train_df',
    'products_df',
    'department_df',
    'aisles_df',
]

Check the columns in different dataframes

In [None]:
# Number of rows in the orders table.
orders_df.shape[0]

In [None]:
# dtype of the eval_set column.
orders_df['eval_set'].dtype

In [None]:
# Total of the per-order_id occurrence counts.
orders_df['order_id'].value_counts().sum()

In [None]:
# Number of orders in each eval_set value.
orders_df['eval_set'].value_counts()

In [None]:
# Number of distinct users.
orders_df['user_id'].nunique()

In [None]:
# Distinct users per eval_set value.
orders_df.groupby('eval_set')['user_id'].agg('nunique')

In [None]:
# Distinct order count for every (user_id, eval_set) pair, as a flat frame.
temp = (
    orders_df
    .groupby(['user_id', 'eval_set'])['order_id']
    .nunique()
    .reset_index()
)

In [None]:
# First two rows of the per-user order counts.
temp.iloc[:2]

In [None]:
import seaborn as sns

# sns.distplot is deprecated (and removed in newer seaborn releases); a
# density-scaled histplot with a KDE overlay is its documented replacement.
# NOTE(review): user_id is an identifier, so this shows how IDs are spread,
# not how many orders users place - confirm this is the intended column.
sns.histplot(x=temp['user_id'], kde=True, stat='density')

In [None]:
# Smallest total order count of any user.
temp.groupby(['user_id'])['order_id'].sum().min()

In [None]:
# Largest total order count of any user.
temp.groupby(['user_id'])['order_id'].sum().max()

In [None]:
# Keep only the orders whose eval_set is 'prior'.
is_prior_order = orders_df['eval_set'] == 'prior'
prior_df = orders_df.loc[is_prior_order]

In [None]:
# Attach the purchased products to each prior order, then shrink the dtypes.
temp = prior_df.merge(order_products__prior_df, on='order_id').pipe(reduce_mem_usage)
temp.head()

In [None]:
import seaborn as sns

import matplotlib.pyplot as plt

# Histogram of the hour of day at which products were ordered.
plt.figure(figsize=(10, 6))
sns.histplot(data=temp, x='order_hour_of_day')

In [None]:
# Enrich the product table with aisle, then department, shrinking after each join.
temp2 = products_df.merge(aisles_df, on='aisle_id').pipe(reduce_mem_usage)
temp3 = temp2.merge(department_df, on='department_id').pipe(reduce_mem_usage)
temp3

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the number of products in each department.
plt.figure(figsize=(15, 6))
department_counts = temp3['department'].value_counts().sort_values(ascending=True)
department_counts.plot(kind='barh', color='darkred')

In [None]:
from wordcloud import WordCloud,ImageColorGenerator

import matplotlib.pyplot as plt

# Build the word cloud input from the product_name column.
text = " ".join(temp3.product_name.astype(str))
# Count whitespace-separated words; len(text) would report characters.
print ("There are {} words in the combination of all Product Name.".format(len(text.split())))

# Create and generate a word cloud image:
wordcloud = WordCloud(background_color="white", width=800, height=400).generate(text)

# Set the default size before the figure is created so it actually applies
# to this plot (it also stays the default for figures created afterwards).
plt.rcParams["figure.figsize"] = (15,6)
plt.figure()
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Join the prior order lines with the enriched product catalogue.
final_prior_df = temp.merge(temp3, on='product_id').pipe(reduce_mem_usage)
final_prior_df.head()

In [None]:
import seaborn as sns

import matplotlib.pyplot as plt

# Count of order lines per value of the reordered flag.
plt.figure(figsize=(8, 5))
sns.countplot(data=final_prior_df, x='reordered')


In [None]:
import matplotlib.pyplot as plt

# Bar chart of the 20 users with the most order lines.
plt.figure(figsize=(10, 7))
lines_per_user = final_prior_df['user_id'].value_counts().sort_values(ascending=False)
lines_per_user.head(20).plot(kind='bar', color='steelblue')

In [None]:
# An in-place dropna on a column selection does not remove anything from
# final_prior_df, so it only looked like a cleaning step. Keep the non-null
# values under their own name instead; the frame itself is left untouched
# because the null checks further down still inspect those rows.
days_since_prior_order_known = final_prior_df['days_since_prior_order'].dropna()

In [None]:
# Bar chart of how often each days_since_prior_order value occurs
# (value_counts skips missing values by default).
gap_counts = final_prior_df['days_since_prior_order'].value_counts()
gap_counts.sort_values(ascending=False).plot(kind='bar', color='teal')

In [None]:
# Distinct orders and distinct users per product name.
ord_count_per_prod = (
    final_prior_df[['order_id','user_id','product_name']]
    .groupby('product_name')
    .nunique()
    .reset_index()
    .pipe(reduce_mem_usage)
)
ord_count_per_prod.head()

In [None]:
# Number of distinct product names in the per-product summary.
ord_count_per_prod.product_name.nunique()

In [None]:
# Fraction of missing values in every column.
final_prior_df.isna().sum().div(len(final_prior_df))

In [None]:
# How many distinct order_number values occur on rows without a days_since_prior_order.
gap_is_missing = final_prior_df['days_since_prior_order'].isnull()
final_prior_df.loc[gap_is_missing, 'order_number'].nunique()

In [None]:
# Register final_prior_df as a name to keep across resets; the guard makes
# the cell safe to re-run without adding a duplicate entry.
if 'final_prior_df' not in variables:
    variables.append('final_prior_df')

In [None]:
def my_reset(varnames, keep_definitions=True):
    """Clear the interactive namespace while keeping selected names.

    Parameters
    ----------
    varnames : iterable of str
        Names of the globals you want to keep.
    keep_definitions : bool, default True
        Also keep imported modules and functions defined in this namespace
        (names starting with an underscore are skipped). These are tiny
        compared to the data being released, and dropping them makes later
        cells fail with NameError on helpers such as ``reduce_mem_usage``.

    Raises
    ------
    KeyError
        If a requested name is not defined; raised before anything is reset.
    """
    import types  # local import: module-level names are wiped by the reset

    globals_ = globals()
    missing = [v for v in varnames if v not in globals_]
    if missing:
        raise KeyError('my_reset cannot keep undefined name(s): {}'.format(
            ', '.join(map(str, missing))))
    to_save = {v: globals_[v] for v in varnames}

    if keep_definitions:
        this_module = globals_.get('__name__')
        for name, value in list(globals_.items()):
            if name.startswith('_') or name in to_save:
                continue
            defined_here = (isinstance(value, types.FunctionType)
                            and value.__module__ == this_module)
            if defined_here or isinstance(value, types.ModuleType):
                to_save[name] = value

    to_save['my_reset'] = my_reset  # lets keep this function by default
    del globals_
    # run_line_magic replaces the deprecated .magic(); -f skips the
    # interactive confirmation so the cell also works under "Run All".
    get_ipython().run_line_magic("reset", "-f")
    globals().update(to_save)

In [None]:
# Drop every global except the names listed in `variables`.
my_reset(varnames=variables)

In [None]:
# First five rows of the merged prior-orders frame.
final_prior_df.head(5)

In [None]:
# product_name is categorical after reduce_mem_usage. observed=True limits the
# grouping to key combinations that actually occur, instead of letting pandas
# consider every category level for every (product_id, user_id) pair.
(
    final_prior_df
    .groupby(['product_id', 'user_id', 'product_name'], observed=True)['order_id']
    .value_counts()
    .sort_values(ascending=False)
)

In [None]:
# The aisle names.
aisles_df['aisle']

In [None]:
from wordcloud import WordCloud,ImageColorGenerator

import matplotlib.pyplot as plt

# Build the word cloud input from the aisle column.
text = " ".join(aisles_df.aisle.astype(str))
# Count whitespace-separated words; len(text) would report characters.
print ("There are {} words in the combination of all Aisles.".format(len(text.split())))

# Create and generate a word cloud image:
wordcloud = WordCloud(background_color="white", width=800, height=400).generate(text)

# Set the default size before the figure is created so it actually applies
# to this plot (it also stays the default for figures created afterwards).
plt.rcParams["figure.figsize"] = (15,6)
plt.figure()
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Drop every global except the names listed in `variables`.
my_reset(varnames=variables)

In [None]:
# Column labels of the merged prior-orders frame.
final_prior_df.columns

In [None]:
# Product name of every prior order line.
products = final_prior_df.product_name

In [None]:
# Bar chart of order lines per day of week, most frequent first.
lines_per_dow = final_prior_df['order_dow'].value_counts()
lines_per_dow.sort_values(ascending=False).plot(kind='bar', color='darkcyan')

In [None]:
import pandas as pd
# One-hot encode day of week and hour of day; drop_first removes the first
# level of each, leaving Day_* and Hour_* indicator columns.
dummies_df = pd.get_dummies(
    final_prior_df,
    columns=['order_dow', 'order_hour_of_day'],
    prefix=['Day', 'Hour'],
    drop_first=True,
)
dummies_df.head(10)

In [None]:
# Column labels after one-hot encoding, including the Day_* / Hour_* indicators.
dummies_df.columns

In [None]:
# Per (user, product) features: distinct orders, mean gap between orders,
# whether the product was ever reordered, and purchase counts per
# day-of-week / hour-of-day indicator column.
time_slot_cols = ['Day_{}'.format(d) for d in range(1, 7)] + ['Hour_{}'.format(h) for h in range(1, 24)]
user_prod_agg = {
    'order_id': 'nunique',
    'days_since_prior_order': 'mean',
    'reordered': 'max',
}
user_prod_agg.update({col: 'sum' for col in time_slot_cols})

user_prod_df = dummies_df.groupby(['user_id', 'product_id']).agg(user_prod_agg).reset_index()
user_prod_df.head(10)

In [None]:
# Per-user features: distinct orders and products, mean gap between orders,
# number of reordered lines, and purchase counts per day-of-week /
# hour-of-day indicator column.
time_slot_cols = ['Day_{}'.format(d) for d in range(1, 7)] + ['Hour_{}'.format(h) for h in range(1, 24)]
user_purchase_agg = {
    'order_id': 'nunique',
    'product_id': 'nunique',
    'days_since_prior_order': 'mean',
    'reordered': 'sum',
}
user_purchase_agg.update({col: 'sum' for col in time_slot_cols})

user_purchase_df = dummies_df.groupby(['user_id']).agg(user_purchase_agg).reset_index()
user_purchase_df.head(10)

In [None]:
# Per-product features: distinct orders and users, mean gap between orders,
# number of reordered lines, and purchase counts per day-of-week /
# hour-of-day indicator column.
time_slot_cols = ['Day_{}'.format(d) for d in range(1, 7)] + ['Hour_{}'.format(h) for h in range(1, 24)]
product_purchase_agg = {
    'order_id': 'nunique',
    'user_id': 'nunique',
    'days_since_prior_order': 'mean',
    'reordered': 'sum',
}
product_purchase_agg.update({col: 'sum' for col in time_slot_cols})

product_purchase_df = dummies_df.groupby(['product_id']).agg(product_purchase_agg).reset_index()
product_purchase_df.head(10)

In [None]:
# Add the per-user features; clashing column names from the right side get a '_user' suffix.
temp = user_prod_df.merge(user_purchase_df, on='user_id', suffixes=('', '_user'))
temp.head(10)

In [None]:
# Add the per-product features; clashing column names from the right side get a '_prod' suffix.
features_df = temp.merge(product_purchase_df, on='product_id', suffixes=('', '_prod'))
features_df.head(10)

In [None]:
# (rows, columns) of the combined feature table.
features_df.shape

In [None]:
# Per-column dtype and memory summary of the combined feature table.
features_df.info()

In [None]:
# From here on only the feature table needs to stay in memory.
variables = ['features_df']
my_reset(varnames=variables)

In [None]:
import numpy as np
# Shrink the dtypes of the feature table (reduce_mem_usage works in place and
# returns the same frame). Requires reduce_mem_usage to have survived the
# preceding namespace resets.
reduce_features_df = features_df.pipe(reduce_mem_usage)

In [None]:
# Keep only the downcast feature table.
variables = ['reduce_features_df']
my_reset(varnames=variables)

In [None]:
# Missing-value count per feature column.
reduce_features_df.isna().sum()

In [None]:
# Drop the per (user, product) mean gap column. Rebinding the name (no
# inplace=True) together with errors='ignore' keeps the cell re-runnable:
# a second run is a no-op instead of a KeyError.
reduce_features_df = reduce_features_df.drop(columns=['days_since_prior_order'], errors='ignore')

In [None]:
# Work on the first 1000 feature rows only.
reduced_feature = reduce_features_df.iloc[:1000]

In [None]:
# First row of the sampled feature table.
reduced_feature.iloc[:1]

In [None]:
from sklearn.model_selection import train_test_split
# 70/30 split of the sampled rows with a fixed seed; print both shapes.
X_train, X_test = train_test_split(reduced_feature, random_state=100, test_size=0.3)
for split in (X_train, X_test):
    print(split.shape)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
from sklearn.neighbors import NearestNeighbors

# Brute-force nearest-neighbour index over the training rows, using cosine distance.
model_knn = NearestNeighbors(algorithm='brute', metric='cosine')
model_knn.fit(X=X_train)


In [None]:
import numpy as np
# Seeded generator so the same query row is picked on every run.
rng = np.random.default_rng(100)
query_index = rng.integers(X_train.shape[0])
# Query with a one-row DataFrame so the feature names the model was fitted
# with are preserved.
query_row = X_train.iloc[[query_index]]
distances, indices = model_knn.kneighbors(query_row, n_neighbors = 6)

# `indices` holds positions into X_train; translate them into the user and
# product ids stored on those rows rather than printing row index labels.
neighbour_rows = X_train.iloc[indices.flatten()]
neighbour_ids = zip(neighbour_rows['user_id'], neighbour_rows['product_id'], distances.flatten())
for i, (user_id, product_id, distance) in enumerate(neighbour_ids):
    if i == 0:
        # The closest row is normally the query row itself.
        print('Recommendations for user {0}, product {1}:\n'.format(user_id, product_id))
    else:
        print('{0}: user {1}, product {2} (cosine distance {3:.4f})'.format(i, user_id, product_id, distance))

In [None]:
import pandas as pd

# Show the names of the products on the rows returned by the neighbour query.
# The ids are taken from the query result instead of being hard-coded, and the
# product table is reloaded when the namespace resets have removed it.
if 'products_df' not in globals():
    products_df = pd.read_csv('../working/products.csv')

neighbour_product_ids = X_train.iloc[indices.flatten()]['product_id'].unique()
products_df.loc[products_df['product_id'].isin(neighbour_product_ids), ['product_id', 'product_name']]