In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Examining the data

In [None]:
# Load the e-commerce CSV from the Kaggle input directory into `df`.
# NOTE(review): no `encoding` is passed, so pandas' default decoding applies —
# confirm the file reads cleanly with it.
df = pd.read_csv('/kaggle/input/ecommerce-data/data.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# (rows, columns) of the frame as loaded, before any cleaning.
df.shape

**Handling NaN values**

In [None]:
# Per-column missing-value summary.
# A notebook cell only renders its final expression, so listing several bare
# expressions would compute the earlier ones and silently discard them. Both
# views (any missing? / how many missing?) are therefore combined into a single
# table. isnull() and isna() are aliases in pandas, so one call is enough.
null_mask = df.isnull()
pd.DataFrame({'has_null': null_mask.any(), 'null_count': null_mask.sum()})

In [None]:
# Discard every row that has at least one missing value
# (axis=0 / how='any' are the defaults, spelled out for the reader).
df = df.dropna(axis=0, how='any')

In [None]:
# Preview the frame again now that rows containing NaN values have been dropped.

df.head()

In [None]:
# (rows, columns) after dropping NaN rows — compare with the shape shown right after loading.
df.shape

In [None]:
# Most frequent CustomerID: count the rows per customer, then take the label
# that carries the largest count.
rows_per_customer = df['CustomerID'].value_counts()
cus_id_1 = rows_per_customer.idxmax()
cus_id_1

In [None]:
# Number of rows that belong to the most frequent CustomerID.
rows_per_customer = df['CustomerID'].value_counts()
cus_id_2 = rows_per_customer.max()
cus_id_2

**Unique values in df**

In [None]:
# Number of distinct values in each column (nunique returns the counts, not the values themselves)
df.nunique()

**Indexing**

In [None]:
# Rows whose InvoiceDate equals the literal '12/1/2010 8:26' (an equality test
# against the raw column value; no date parsing happens here).
# NOTE(review): the variable name says "mar", but the literal presumably denotes
# 1 December 2010 in m/d/yyyy form — confirm and consider renaming.
timestamp_mask = df['InvoiceDate'] == '12/1/2010 8:26'
bought_mar_2010 = df.loc[timestamp_mask]
bought_mar_2010.shape

In [None]:
# Show every row that belongs to invoice 'C569944'.
invoice_mask = df['InvoiceNo'].eq('C569944')
df[invoice_mask]

**Correlation**

In [None]:
# Pairwise correlation between the numeric columns.
# The frame also holds string columns (e.g. InvoiceNo values such as 'C569944'),
# and pandas >= 2.0 raises on them instead of silently skipping them, so the
# frame is restricted to numeric dtypes first. On older pandas versions this
# yields the same table that a bare df.corr() produced.
df.select_dtypes(include='number').corr()  # Not really any correlation

***TotalPrice - new column***

In [None]:
# Line total (unit price x quantity), stored on the frame as a new 'TotalPrice' column.
line_totals = df['UnitPrice'].mul(df['Quantity'])
df['TotalPrice'] = line_totals

df['TotalPrice']

In [None]:
# Remove rows that are identical across all columns, keeping the first
# occurrence of each, then report the resulting shape.
df = df.drop_duplicates(subset=None, keep='first')
df.shape

In [None]:
# Total value per invoice: sum TotalPrice over all rows sharing an InvoiceNo.
rows_by_invoice = df.groupby(['InvoiceNo'])
cost_p_trans = rows_by_invoice['TotalPrice'].sum()
cost_p_trans

# **Data Visualization**

In [None]:
import matplotlib.pyplot as plt

**Frequency of the most-bought items, by item code (StockCode)**

In [None]:
# Bar chart of the 40 most frequent StockCode values.
# Optional: enlarge the figure beforehand with
#   plt.rcParams['figure.figsize'] = (18, 7)
top_n = 40  # number of bars; also the number of steps in the colour gradient
top_codes = df['StockCode'].value_counts().head(top_n)
bar_colors = plt.cm.copper(np.linspace(0, 1, top_n))  # one copper shade per bar
top_codes.plot.bar(color=bar_colors)
plt.title('Frequency of most bought items itemcode wise', fontsize=25)
plt.show()

# Top 5 countries: based on transactions

In [None]:
# The five countries with the most rows in the frame.
# Note: 'EIRE' is Ireland.
df['Country'].value_counts().head(5)

In [None]:
# Pie chart of the five most frequent countries.
# Plotting through pandas keeps the country names as wedge labels; calling
# plt.pie(...) on the same counts would draw the wedges without them.
plt.figure()
top_countries = df['Country'].value_counts().head(5)
top_countries.plot.pie(autopct='%1.1f%%')  # autopct prints each wedge's percentage
plt.title('Top 5 countries based on transactions', weight='bold')  # bold title
plt.show()

# Machine Learning - Predicting Quantity

In [None]:
# Re-inspect the frame (it now carries the TotalPrice column) before choosing model features.
df.head()

In [None]:
#Import necessary modules
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Features (X), target (y) and the hold-out split.
# NOTE(review): 'TotalPrice' was built as UnitPrice * Quantity, so together with
# 'UnitPrice' it effectively encodes the target (target leakage). Scores from
# models trained on X will look better than they would without that column —
# consider dropping 'TotalPrice' from the features.
X = df[['UnitPrice','TotalPrice','CustomerID']]
y = df['Quantity']

# Fixed seed so the 90/10 split, and every score derived from it, is identical
# on each Restart & Run All (same random_state=0 convention as get_mae).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=0)

In [None]:
# Average of the Quantity column — presumably shown as a yardstick for judging
# the size of the MAE values reported for the models (confirm intent).

df['Quantity'].mean() 

**Linear Regression**

In [None]:
# Fit a linear regression on the training split, predict the held-out split and
# print the model's score on it, multiplied by 100 for display.
linreg = LinearRegression().fit(X_train, y_train)
pred_reg = linreg.predict(X_test)

score_linreg_per = 100 * linreg.score(X_test, y_test)
print(f'Linear Regression Model Score: {score_linreg_per}%')

In [None]:
# Mean Absolute Error: LINREG
# Arguments are passed in scikit-learn's documented (y_true, y_pred) order,
# the same order get_mae uses. MAE is symmetric, so the value is unaffected,
# but the conventional order keeps the call safe to copy for other metrics.

val_mae_linreg = mean_absolute_error(y_test, pred_reg)
print(f'The quantity is off by: {val_mae_linreg} (MAE)')  # average absolute gap between predicted and actual quantity

In [None]:
# Cross validation: LINREG
# Score the model on 5 folds of the full X / y so the estimate does not hinge on
# the single test size picked for train_test_split; the cell displays the mean
# of the fold scores.
cv_results_linreg = cross_val_score(linreg, X, y, cv=5)
cv_results_linreg.mean()

**DecisionTreeRegressor**

In [None]:
def get_mae(max_leaf_nodes, X_train, X_test, y_train, y_test):
    """Return the hold-out MAE of a decision tree capped at ``max_leaf_nodes``.

    A DecisionTreeRegressor (random_state=0) is fitted on ``X_train`` /
    ``y_train``; its predictions for ``X_test`` are compared with ``y_test``
    using mean_absolute_error, and that error is returned.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(X_train, y_train)
    return mean_absolute_error(y_test, tree.predict(X_test))

candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]

# Compare tree sizes by their hold-out MAE.
# The MAE is a float, so it is printed with %.4f: %d would truncate it to an
# integer and hide the differences between candidates.
for max_leaf_nodes in candidate_max_leaf_nodes:
    my_mae = get_mae(max_leaf_nodes, X_train, X_test, y_train, y_test)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %.4f" %(max_leaf_nodes, my_mae))

In [None]:
# Set max_leaf_nodes to the best candidate from the leaf-node comparison.
# random_state=0 makes the fitted tree reproducible across runs and matches the
# seed used inside get_mae, so the score below lines up with that comparison.

dtr = DecisionTreeRegressor(max_leaf_nodes=500, random_state=0) # <- default number of leaf nodes is unlimited

dtr.fit(X_train, y_train)
pred_dtr = dtr.predict(X_test)
score_dtr_per = dtr.score(X_test, y_test) * 100  # model score on the held-out split, scaled by 100 for display
print(f'Decision Tree Regressor Model Score: {score_dtr_per}%')

In [None]:
# MAE: DTR
# Arguments are passed in scikit-learn's documented (y_true, y_pred) order,
# the same order get_mae uses. MAE is symmetric, so the value is unaffected.

val_mae_dtr = mean_absolute_error(y_test, pred_dtr)
print(f'The quantity is off by: {val_mae_dtr} (MAE)')

In [None]:
# CV: DTR
# Score the decision tree on 5 folds of the full X / y so the estimate does not
# hinge on the single test size picked for train_test_split; the cell displays
# the mean of the fold scores.
cv_results_dtr = cross_val_score(dtr, X, y, cv=5)
cv_results_dtr.mean()

**RandomForestRegressor**

In [None]:
# Fit a random forest on the training split and print its score on the held-out
# split. A random forest is stochastic, so the seed is fixed
# (random_state=0, as in get_mae) to give the same forest — and the same
# printed numbers — on every Restart & Run All.
rfg = RandomForestRegressor(random_state=0)

rfg.fit(X_train, y_train)
pred_rfg = rfg.predict(X_test)
score_rfg_per = rfg.score(X_test, y_test) * 100  # scaled by 100 for display
print(f'Random Forest Regressor Model Score: {score_rfg_per}%')

In [None]:
# MAE: RFG
# Arguments are passed in scikit-learn's documented (y_true, y_pred) order,
# the same order get_mae uses. MAE is symmetric, so the value is unaffected.

val_mae_rfg = mean_absolute_error(y_test, pred_rfg)
print(f'The quantity is off by: {val_mae_rfg} (MAE)')

In [None]:
# CV: RFG
# Cross-validate the random forest itself (`rfg`). Passing the decision tree
# `dtr` here would merely repeat the tree's cross-validation under the
# random-forest label. Five folds over the full X / y keep the estimate
# independent of the single test size picked for train_test_split.
cv_results_rfg = cross_val_score(rfg, X, y, cv=5)
cv_results_rfg
np.mean(cv_results_rfg)

I took inspiration from the following notebook: https://www.kaggle.com/srinandhini/eda-on-market-basket-analysis