In [None]:
# Import basic libraries
import pandas as pd
import numpy as np
import random as rnd
import csv

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from scipy import stats
from sklearn.metrics import accuracy_score


In [None]:
# 1. Acquire Data
#=================
# Both splits are carved out of the same CSV: the first N_ROWS data rows form
# the training split and the next N_ROWS data rows form the test split.
DATA_PATH = '../input/pakistans-largest-ecommerce-dataset/Pakistan Largest Ecommerce Dataset.csv'
N_ROWS = 1000

trainData = pd.read_csv(DATA_PATH, nrows=N_ROWS)
# skiprows uses 0-based file line numbers with the header on line 0, so the
# training rows occupy lines 1..N_ROWS. Skip all of them (inclusive); stopping
# at N_ROWS - 1 would re-read the last training row as the first test row.
testData = pd.read_csv(DATA_PATH, skiprows=range(1, N_ROWS + 1), nrows=N_ROWS)
combine = [trainData, testData]
print(trainData.shape)
print(testData.shape)

In [None]:
# Peek at the first ten rows of the test split.
testData.iloc[:10]

In [None]:
# List every column label present in the training split.
column_labels = trainData.columns.values
print(column_labels)

In [None]:
# Give the padded " MV " header and "category_name_1" cleaner names.
# The same renames are applied, in place, to both splits.
column_renames = {" MV ": "MV", "category_name_1": "category_name"}
for frame in (trainData, testData):
    frame.rename(columns=column_renames, inplace=True)

In [None]:
# Drop, in place, every column whose label contains "unnamed"
# (case-insensitive) from both splits.
for frame in (trainData, testData):
    is_unnamed = frame.columns.str.contains('unnamed', case=False)
    frame.drop(columns=frame.columns[is_unnamed], inplace=True)

In [None]:
# Show the (rows, columns) shape of each split after the column clean-up.
for frame in (trainData, testData):
    print(frame.shape)

In [None]:
# Inspect dtypes and non-null counts of every feature in both splits,
# with a ruler line printed between the two reports.
separator = '_' * 40
trainData.info()
print(separator)
testData.info()

In [None]:
# Summary statistics for the numerical features of the training split.
numeric_summary = trainData.describe()
numeric_summary

In [None]:
# Non-null count per column, ascending, so the columns with the most
# missing values are listed first.
non_null_counts = trainData.count()
non_null_counts.sort_values()

In [None]:
# Summary (count / unique / top / freq) of the object-typed features.
categorical_summary = trainData.describe(include=['O'])
categorical_summary

In [None]:
# Wrangle data
# Discard the features that are not used in the rest of the analysis, then
# rebuild `combine` so it points at the new, slimmer frames.
unwanted_columns = ['sales_commission_code', 'MV', 'increment_id', 'sku', 'FY']
trainData, testData = (
    frame.drop(columns=unwanted_columns) for frame in (trainData, testData)
)
combine = [trainData, testData]
# Preview the result.
trainData.head(10)

In [None]:
# Cross-tabulate order 'status' within each 'BI Status' group to see
# whether the two columns are related.
status_by_bi = trainData.groupby('BI Status')['status']
status_by_bi.value_counts()

In [None]:
# Replace the spreadsheet error marker '#REF!' with 'Unk' (unknown).
# This is done for BOTH splits: the later ordinal encoding of 'BI Status'
# only knows 'Unk', so a '#REF!' left behind in the test split would become
# NaN there and break the integer cast.
for dataset in (trainData, testData):
    dataset['BI Status'] = dataset['BI Status'].replace('#REF!', 'Unk')

In [None]:
# Frequency of each category label. Placeholder labels show up here;
# note that value_counts() leaves NaN out by default.
category_counts = trainData['category_name'].value_counts()
category_counts

In [None]:
# Replace the literal '\N' placeholder and missing values in 'category_name'
# with the label 'Unknown'.
# Applied to both splits so the later ordinal encoding treats them the same
# way (otherwise test rows with '\N'/NaN end up as code 0 instead of the
# 'Unknown' code). The result is assigned back instead of using
# fillna(inplace=True) on a column selection, which is a chained assignment
# and does not update the frame under pandas copy-on-write.
for dataset in (trainData, testData):
    dataset['category_name'] = (
        dataset['category_name']
        .replace(r'\\N', 'Unknown', regex=True)
        .fillna("Unknown")
    )

In [None]:
# Count order statuses within each category to see whether 'category_name'
# and 'status' are related.
status_by_category = trainData.groupby('category_name')['status']
status_by_category.value_counts()

In [None]:
# Mean item price per order status, highest first.
# Original author's observation: status "canceled" carries a very high
# average amount, i.e. money paid back to customers.
mean_price_by_status = trainData[["status", "price"]].groupby(['status'], as_index=False).mean()
mean_price_by_status.sort_values(by='price', ascending=False)

In [None]:
# Encode the categorical 'status' column as numeric codes in both splits.
# Any status missing from the mapping is encoded as 0.
title_mapping = {
    "complete": 1,
    "canceled": 2,
    "order_refunded": 3,
    "refund": 4,
    "received": 5,
}
for frame in combine:
    frame['status'] = frame['status'].map(title_mapping).fillna(0)

trainData.head()

In [None]:
# Number of training rows per encoded status value.
status_counts = trainData["status"].value_counts()
status_counts

In [None]:
# What is the best-selling category?
# Keep only rows with status code 1 and count them per category, sorted
# ascending so the best seller is listed last.
# (Original author's finding: 'Beauty & Grooming' has the most such rows.)
complete_dt = trainData[trainData['status'] == 1]
completed_by_category = complete_dt.groupby('category_name')['status'].value_counts()
completed_by_category.sort_values()

In [None]:
# View payment method records.
# Printed explicitly: a notebook cell only displays its LAST expression, so
# a bare value_counts() here would be computed and silently thrown away.
print(trainData["payment_method"].value_counts())
# Order status frequency within each payment method.
trainData.groupby("payment_method")["status"].value_counts()

In [None]:
# One histogram of the encoded 'status' values per payment method
# (one facet column for each payment method).
histogram_options = {'bins': 10}
g = sns.FacetGrid(data=trainData, col='payment_method')
g.map(plt.hist, 'status', **histogram_options)

In [None]:
# Correlating categorical and numerical features:
# one facet row per status, payment method on x, 'Customer ID' on y,
# coloured by category.
# FacetGrid.map() draws every facet from its own subset of rows, so a
# categorical plot must be given explicit `order` / `hue_order`; without
# them the same category can land at a different position or colour in
# different facets (seaborn warns about exactly this).
payment_order = sorted(trainData['payment_method'].dropna().unique())
category_order = sorted(trainData['category_name'].dropna().unique())
grid = sns.FacetGrid(trainData, row='status', height=3, aspect=2)
grid.map(sns.pointplot, 'payment_method', 'Customer ID', 'category_name',
         order=payment_order, hue_order=category_order, palette='deep')
grid.add_legend()

In [None]:
# Encode the categorical 'payment_method' column as integer codes.
# NOTE(review): the original literal listed "customercredit" twice (5, then
# 6); in a dict literal the later entry wins, so the effective code has
# always been 6 and 5 was never assigned. The duplicate is removed and 6 is
# kept - confirm whether another payment method was meant to receive 5.
title_mapping = {"cod": 1, "mygateway": 2, "ublcreditcard": 3, "cashatdoorstep": 4
                 , "customercredit": 6
                 , "mcblite": 7, "internetbanking": 8
                 , "productcredit": 9, "marketingexpense": 10}
for dataset in combine:
    # Methods missing from the mapping (or NaN) become 0, like the other
    # encoders in this notebook; casting NaN straight to int would raise.
    dataset['payment_method'] = dataset['payment_method'].map(title_mapping).fillna(0).astype(int)

# view data
trainData["payment_method"].value_counts()

In [None]:
# Encode 'category_name' as numeric codes in both splits.
# Codes are the 1-based positions in the list below; any label not in the
# list is encoded as 0.
category_labels = [
    "Beauty & Grooming", "Soghaat", "Men's Fashion", "Women's Fashion",
    "Mobiles & Tablets", "Home & Living", "Appliances", "Unknown",
    "Kids & Baby", "Computing", "Health & Sports", "Others",
    "Entertainment", "Books", "Superstore",
]
title_mapping = {label: code for code, label in enumerate(category_labels, start=1)}
for frame in combine:
    frame['category_name'] = frame['category_name'].map(title_mapping).fillna(0)

# view data
testData["category_name"].value_counts()

In [None]:
# Create new feature combining existing features
# TotalPrice is the line total: unit price times quantity ordered.
# This will enable us to drop price and qty_ordered from our datasets.
# (The earlier `price + qty_ordered + 1` added a count to a money amount,
# which is not a total price.)

for dataset in combine:
    dataset['TotalPrice'] = dataset['price'] * dataset['qty_ordered']

In [None]:
# Mean TotalPrice per payment method, highest first.
payment_totals = trainData[['payment_method', 'TotalPrice']]
mean_total_by_payment = payment_totals.groupby(['payment_method'], as_index=False).mean()
mean_total_by_payment.sort_values(by='TotalPrice', ascending=False)

In [None]:
# Mean TotalPrice per order status, highest first.
status_totals = trainData[['status', 'TotalPrice']]
mean_total_by_status = status_totals.groupby(['status'], as_index=False).mean()
mean_total_by_status.sort_values(by='TotalPrice', ascending=False)

In [None]:
# 'price' and 'qty_ordered' are now represented by TotalPrice, so remove
# them from both splits and refresh `combine` to reference the new frames.
spent_columns = ['price', 'qty_ordered']
trainData, testData = (
    frame.drop(columns=spent_columns) for frame in (trainData, testData)
)
combine = [trainData, testData]

trainData.head()

In [None]:
# Frequency of each 'BI Status' label in the training split.
bi_status_counts = trainData['BI Status'].value_counts()
bi_status_counts

In [None]:
# Encode the categorical 'BI Status' column as integer codes.
# '#REF!' is the raw spelling of the unknown marker, so it shares the code
# of 'Unk'; this keeps a split that was never relabelled consistent.
bi_status_codes = {"Net": 1, "Gross": 2, "Valid": 3, "Unk": 4, "#REF!": 4}
for dataset in combine:
    # Anything else (including NaN) becomes 0 before the cast; casting NaN
    # straight to int would raise.
    dataset['BI Status'] = dataset['BI Status'].map(bi_status_codes).fillna(0).astype(int)

In [None]:
# Total TotalPrice per customer over every row of the training split.
# NOTE(review): no status filter is applied here, so this is not limited to
# completed transactions - confirm whether that was intended.
df_sales_segment = trainData.groupby('Customer ID')['TotalPrice'].sum().reset_index()
# Customers whose total exceeds 5,000,000. Printed explicitly because a cell
# only displays its last expression; the bare filter was being discarded.
print(df_sales_segment.loc[df_sales_segment['TotalPrice'] > 5000000, :])
df_sales_segment

In [None]:
# Segment customers by the total amount they spent on the store:
#   <= 1000 'very low' | (1000, 10000] 'low' | (10000, 50000] 'medium' | > 50000 'high'
# Rows matching none of the rules keep the empty-string default.
df_sales_segment['sales_segment'] = ''
spend = df_sales_segment['TotalPrice']
segment_rules = [
    (spend <= 1000, 'very low'),
    ((spend > 1000) & (spend <= 10000), 'low'),
    ((spend > 10000) & (spend <= 50000), 'medium'),
    (spend > 50000, 'high'),
]
for in_segment, label in segment_rules:
    df_sales_segment.loc[in_segment, 'sales_segment'] = label
df_sales_segment

In [None]:
# Total amount spent within each sales segment.
sales_segment_total = (
    df_sales_segment
    .groupby('sales_segment')['TotalPrice']
    .sum()
    .reset_index()
)
sales_segment_total

In [None]:
# Number of customers in each sales segment, ordered from lowest to
# highest spend.
segment_order = ['very low', 'low', 'medium', 'high']
plt.figure(figsize=(15, 6))
sns.countplot(data=df_sales_segment, x='sales_segment', order=segment_order)

In [None]:
# Re-check dtypes and non-null counts of both splits after the encoding steps.
for frame in (trainData, testData):
    frame.info()

In [None]:
# Now, lets do modeling for prediction
# Based on supervised learning plus classification and regression, 
# we narrow down our choice of models to a few. These include:
#    Logistic Regression
#    KNN or k-Nearest Neighbors
#    Support Vector Machines
#    Naive Bayes classifier
#    Decision Tree
#    Random Forest
#    Perceptron
#    Artificial neural network
#    RVM or Relevance Vector Machine

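# TODO: the template below still uses the Titanic column names ("Survived",
# "PassengerId"); replace them with this dataset's target and ID columns
# before enabling it.
# X_train = trainData.drop("Survived", axis=1)
# Y_train = trainData["Survived"]
# X_test  = testData.drop("PassengerId", axis=1).copy()
# X_train.shape, Y_train.shape, X_test.shape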

In [None]:
# Print the column dtypes and non-null counts of the training split.
trainData.info()