In [1]:
import os

import numpy as np
import pandas as pd

# Show every column when a DataFrame is rendered (None removes the column limit).
pd.options.display.max_columns = None

In [2]:
# Location of the raw purchases CSV. Set the ECOMM_PURCHASES_CSV environment
# variable to run the notebook on another machine; when it is unset the
# original local path is used, so existing behaviour is unchanged.
ECOMM_PURCHASES_CSV = os.environ.get(
    "ECOMM_PURCHASES_CSV",
    "C:\\Users\\Karthik.Iyer\\Downloads\\AccelerateAI\\Python Programming\\EcommercePurchases.csv",
)

# Load the dataset and preview the first five rows.
ecomm_purchases = pd.read_csv(ECOMM_PURCHASES_CSV)
ecomm_purchases.head()

Unnamed: 0,Address,Lot,AM or PM,Browser Info,Company,Credit Card,CC Exp Date,CC Security Code,CC Provider,Email,Job,IP Address,Language,Purchase Price
0,"16629 Pace Camp Apt. 448\nAlexisborough, NE 77...",46 in,PM,Opera/9.56.(X11; Linux x86_64; sl-SI) Presto/2...,Martinez-Herman,6011930000000000.0,Feb-20,900,JCB 16 digit,pdunlap@yahoo.com,"Scientist, product/process development",149.146.147.205,el,98.14
1,"9374 Jasmine Spurs Suite 508\nSouth John, TN 8...",28 rn,PM,Opera/8.93.(Windows 98; Win 9x 4.90; en-US) Pr...,"Fletcher, Richards and Whitaker",3337760000000000.0,Nov-18,561,Mastercard,anthony41@reed.com,Drilling engineer,15.160.41.51,fr,70.73
2,Unit 0065 Box 5052\nDPO AP 27450,94 vE,PM,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...,"Simpson, Williams and Pham",675958000000.0,Aug-19,699,JCB 16 digit,amymiller@morales-harrison.com,Customer service manager,132.207.160.22,de,0.95
3,"7780 Julia Fords\nNew Stacy, WA 45798",36 vm,PM,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0 ...,"Williams, Marshall and Buchanan",6011580000000000.0,Feb-24,384,Discover,brent16@olson-robinson.info,Drilling engineer,30.250.74.19,es,78.04
4,"23012 Munoz Drive Suite 337\nNew Cynthia, TX 5...",20 IE,AM,Opera/9.58.(X11; Linux x86_64; it-IT) Presto/2...,"Brown, Watson and Andrews",6011460000000000.0,Oct-25,678,Diners Club / Carte Blanche,christopherwright@gmail.com,Fine artist,24.140.33.94,es,77.82


In [3]:
# Inspect the dtype pandas inferred for each column when reading the CSV.
column_dtypes = ecomm_purchases.dtypes
column_dtypes

Address              object
Lot                  object
AM or PM             object
Browser Info         object
Company              object
Credit Card         float64
CC Exp Date          object
CC Security Code      int64
CC Provider          object
Email                object
Job                  object
IP Address           object
Language             object
Purchase Price      float64
dtype: object

The data types above look appropriate at first glance. We can revisit specific columns later if needed.

In [4]:
# Per-column summary: non-null count and dtype for every column, plus the index
# range and memory usage of the frame. A non-null count equal to the number of
# entries means that column has no missing values.
ecomm_purchases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Address           10000 non-null  object 
 1   Lot               10000 non-null  object 
 2   AM or PM          10000 non-null  object 
 3   Browser Info      10000 non-null  object 
 4   Company           10000 non-null  object 
 5   Credit Card       10000 non-null  float64
 6   CC Exp Date       10000 non-null  object 
 7   CC Security Code  10000 non-null  int64  
 8   CC Provider       10000 non-null  object 
 9   Email             10000 non-null  object 
 10  Job               10000 non-null  object 
 11  IP Address        10000 non-null  object 
 12  Language          10000 non-null  object 
 13  Purchase Price    10000 non-null  float64
dtypes: float64(2), int64(1), object(11)
memory usage: 1.1+ MB


In [5]:
# Count the missing entries in each column; a zero means the column is complete.
ecomm_purchases.isna().sum()

Address             0
Lot                 0
AM or PM            0
Browser Info        0
Company             0
Credit Card         0
CC Exp Date         0
CC Security Code    0
CC Provider         0
Email               0
Job                 0
IP Address          0
Language            0
Purchase Price      0
dtype: int64

There are no missing values in the given dataset.

In [6]:
# Total rows and columns. The sentence is built from the frame's actual shape
# instead of hard-coded numbers, so it stays correct if the data changes.
n_rows, n_cols = ecomm_purchases.shape
print(ecomm_purchases.shape)
print(f"There are {n_rows} rows and {n_cols} columns")

(10000, 14)
There are 10000 rows and 14 columns


In [7]:
# Smallest and largest values in the "Purchase Price" column.
purchase_prices = ecomm_purchases["Purchase Price"]
lowest_price, highest_price = purchase_prices.min(), purchase_prices.max()
print("Minimum Purchase Price:", lowest_price)
print("Maximum Purchase Price:", highest_price)

Minimum Purchase Price: 0.0
Maximum Purchase Price: 99.99


In [8]:
# Average purchase price. This is a single mean over all rows, not a trend over
# time: nothing here groups or orders the purchases by date.
average_price = ecomm_purchases["Purchase Price"].mean()
print("Average Purchase Price:", average_price)

Average Purchase Price: 50.34730200000025


In [9]:
# Number of purchases per language code, most frequent first
# (the 'en' row answers the question about English).
language_counts = ecomm_purchases["Language"].value_counts()
language_counts

de    1155
ru    1155
el    1137
pt    1118
en    1098
fr    1097
es    1095
it    1086
zh    1059
Name: Language, dtype: int64

In [10]:
# Number of rows whose language of choice is 'en'.
prefers_english = ecomm_purchases["Language"] == "en"
len(ecomm_purchases[prefers_english])

1098

In [11]:
# Most popular credit card provider: the label with the highest row count.
cc_provider_counts = ecomm_purchases["CC Provider"].value_counts()
cc_provider_counts.idxmax()

'JCB 16 digit'

In [12]:
# Number of purchases made in the AM versus the PM.
am_pm_counts = ecomm_purchases["AM or PM"].value_counts()
am_pm_counts

PM    5068
AM    4932
Name: AM or PM, dtype: int64

Slightly more purchases were made in the PM (5,068) than in the AM (4,932).

In [13]:
# List every credit card provider label with its row count. This shows the
# exact label used for American Express, which the next cell filters on.
provider_frequency = ecomm_purchases["CC Provider"].value_counts()
provider_frequency

JCB 16 digit                   1716
VISA 16 digit                  1715
JCB 15 digit                    868
American Express                849
Maestro                         846
Voyager                         829
Discover                        817
Mastercard                      816
VISA 13 digit                   777
Diners Club / Carte Blanche     767
Name: CC Provider, dtype: int64

In [14]:
# Number of American Express cards expiring in 2025.
# "CC Exp Date" values look like "Feb-20", so the text after the hyphen is the
# two-digit year. The vectorised .str accessor replaces the row-wise
# apply/lambda; a value without a hyphen becomes NaN (no match) instead of
# raising IndexError.
is_amex = ecomm_purchases["CC Provider"] == "American Express"
expiry_year = ecomm_purchases["CC Exp Date"].str.split("-").str[1]
len(ecomm_purchases[is_amex & (expiry_year == "25")])

70

In [15]:
#Top 5 job titles of customers who purchased in 2020

The only date field in the dataset is `CC Exp Date`, the credit card expiry date; there is no purchase date. As a proxy, let's check whether there are purchases made with credit cards expiring in 2020.

In [16]:
# Purchases made with a card expiring in 2020. This filters on "CC Exp Date"
# (values like "Feb-20"), which is a card expiry date, not a purchase date.
# The .str accessor replaces the row-wise apply/lambda and yields NaN (no
# match) rather than raising for a value without a hyphen.
expires_in_2020 = ecomm_purchases["CC Exp Date"].str.split("-").str[1] == "20"
ecomm_purchases[expires_in_2020]

Unnamed: 0,Address,Lot,AM or PM,Browser Info,Company,Credit Card,CC Exp Date,CC Security Code,CC Provider,Email,Job,IP Address,Language,Purchase Price
0,"16629 Pace Camp Apt. 448\nAlexisborough, NE 77...",46 in,PM,Opera/9.56.(X11; Linux x86_64; sl-SI) Presto/2...,Martinez-Herman,6.011930e+15,Feb-20,900,JCB 16 digit,pdunlap@yahoo.com,"Scientist, product/process development",149.146.147.205,el,98.14
19,"125 Hall Summit\nBoothton, IL 41721",99 CU,PM,Mozilla/5.0 (compatible; MSIE 7.0; Windows NT ...,Turner-Mckinney,6.763440e+11,Feb-20,440,VISA 16 digit,ruiznicole@gmail.com,"Designer, interior/spatial",25.105.209.214,fr,58.39
32,Unit 3628 Box 6778\nDPO AE 72362,39 Qm,PM,Mozilla/5.0 (Windows 98; Win 9x 4.90) AppleWeb...,Martinez-Wilson,4.942280e+15,Jan-20,8360,JCB 16 digit,shane21@atkinson.com,Civil Service fast streamer,196.37.134.217,pt,56.63
36,"9374 Skinner Common Apt. 254\nChristopherfort,...",80 Fq,PM,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...,Hanna-Grant,1.800420e+14,Sep-20,912,VISA 16 digit,saundersernest@walsh.com,Animal technologist,85.134.58.250,zh,9.77
38,"9671 Riley Drives Apt. 746\nPort Davidtown, TN...",15 vj,AM,Mozilla/5.0 (X11; Linux i686; rv:1.9.6.20) Gec...,"Bryant, Hubbard and Gonzales",2.100950e+14,Dec-20,248,Voyager,djennings@boyd-english.org,Music therapist,143.138.65.219,en,30.07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9883,"751 Petersen Groves\nLake Rebecca, NM 55875",61 tE,PM,Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_8_7...,Perez and Sons,4.939810e+15,May-20,238,Mastercard,lisa25@hotmail.com,Therapeutic radiographer,215.85.159.70,pt,67.47
9901,"908 Norman Burgs\nPhillipsberg, VT 92023-9956",90 Ng,PM,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...,Hughes Inc,3.014880e+13,Sep-20,483,Voyager,james32@gmail.com,Meteorologist,250.66.85.29,en,57.25
9911,"44320 Gray Centers Suite 325\nJenniferfurt, OK...",19 Bf,PM,Mozilla/5.0 (X11; Linux i686; rv:1.9.7.20) Gec...,Jones-Gray,4.058070e+12,Dec-20,921,Diners Club / Carte Blanche,hensonlindsay@graves-fox.com,"Optician, dispensing",84.118.30.56,en,20.63
9957,"73556 Chase Pine\nBrucemouth, MP 73473",25 rU,PM,Opera/9.49.(Windows 95; en-US) Presto/2.9.189 ...,"Martin, Clayton and Jarvis",3.337740e+15,May-20,798,VISA 13 digit,latashaochoa@gmail.com,Therapeutic radiographer,239.219.241.162,ru,71.85


We can see that there are purchases made with credit cards expiring in 2020; we use these as a proxy for purchases made in 2020.

In [17]:
# Top 5 job titles among purchases made with a card expiring in 2020.
# .loc selects the rows and the column in one step (no chained indexing), and
# the .str accessor replaces the row-wise apply/lambda.
# NOTE: the five titles shown all have the same count, so other titles may tie
# with them; head() only keeps the first five in value_counts order.
expires_in_2020 = ecomm_purchases["CC Exp Date"].str.split("-").str[1] == "20"
ecomm_purchases.loc[expires_in_2020, "Job"].value_counts().head()

Designer, fashion/clothing    6
Therapist, art                6
Aid worker                    6
Risk analyst                  6
Forest/woodland manager       6
Name: Job, dtype: int64

In [18]:
# Split of Mozilla to Opera: count rows by the text before the first "/" in
# the "Browser Info" user-agent string (e.g. "Opera/9.56..." -> "Opera").
# The .str accessor replaces the row-wise apply/lambda; n=1 stops splitting
# after the first "/" because only the leading token is needed.
browser_prefix = ecomm_purchases["Browser Info"].str.split("/", n=1).str[0]
browser_prefix.value_counts()

Mozilla    7924
Opera      2076
Name: Browser Info, dtype: int64

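More purchasers' user agents start with `Mozilla` (7,924) than with `Opera` (2,076). Note that the `Mozilla/5.0` prefix is also sent by other browsers (for example, the MSIE entries in the sample rows above), so this split reflects the user-agent prefix rather than the actual browser.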

In [19]:
# Top 10 first-level IPv4 values (IPv4: ABC.DEF.GHI.KLM, first level = ABC).
# The .str accessor replaces the row-wise apply/lambda; n=1 stops splitting
# after the first "." because only the leading octet is needed.
first_octet = ecomm_purchases["IP Address"].str.split(".", n=1).str[0]
first_octet.value_counts().head(10)

197    57
25     56
89     56
148    55
102    53
56     53
108    52
98     52
40     51
156    51
Name: IP Address, dtype: int64

In [20]:
# Top 5 email providers of clients: the text after "@" in "Email"
# (e.g. "pdunlap@yahoo.com" -> "yahoo.com").
# The .str accessor replaces the row-wise apply/lambda; an address without
# "@" becomes NaN and is left out of the counts instead of raising IndexError.
email_domain = ecomm_purchases["Email"].str.split("@").str[1]
email_domain.value_counts().head()

hotmail.com     1638
yahoo.com       1616
gmail.com       1605
smith.com         42
williams.com      37
Name: Email, dtype: int64