## Libraries / Setup

In [2]:
pip install plotly

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

## Init Dataframe

In [4]:
df = pd.read_csv("starwars_ebay/starwars_ebay.csv")
df.head()

Unnamed: 0,product_name,price,country,condition,shipping_cost,demand_status,url
0,🔥Star Wars Vintage Collection ARC Commander Ha...,$32.90,from Malaysia,Brand New,+$18.00 shipping,Last one,https://www.ebay.com/itm/144937682104?hash=ite...
1,Star Wars Vintage Collection Mandalorian Super...,$19.99,from United States,Brand New,+$13.35 shipping,85 sold,https://www.ebay.com/itm/394251731939?epid=190...
2,Star Wars Vintage Collection Dark Trooper (The...,$28.99,from United States,Brand New,+$18.70 shipping,71 watchers,https://www.ebay.com/itm/125700558379?hash=ite...
3,Star Wars Vintage Collection Deathwatch Mandal...,$15.49,from United States,Brand New,+$13.28 shipping,Almost gone,https://www.ebay.com/itm/125636022875?hash=ite...
4,STAR WARS Vintage Collection VC34 Jango Fett A...,$141.50,from United States,Brand New,+$25.17 shipping,,https://www.ebay.com/itm/295529080278?hash=ite...


### Fields/Columns

In [5]:
df.columns

Index(['product_name', 'price', 'country', 'condition', 'shipping_cost',
       'demand_status', 'url'],
      dtype='object')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10020 entries, 0 to 10019
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   product_name   10020 non-null  object
 1   price          10020 non-null  object
 2   country        10020 non-null  object
 3   condition      10020 non-null  object
 4   shipping_cost  10019 non-null  object
 5   demand_status  1842 non-null   object
 6   url            10020 non-null  object
dtypes: object(7)
memory usage: 548.1+ KB


### Null Values

In [7]:
df.isna().sum()

product_name        0
price               0
country             0
condition           0
shipping_cost       1
demand_status    8178
url                 0
dtype: int64

In [8]:
n_df = len(df["product_name"])
print(f"Total records in df: {n_df}")

Total records in df: 10020


In [9]:
demand_status_nan_perc = df["demand_status"].isna().sum() / n_df
print(f"demand_status has the highest NaNs in dataset ({round(100*demand_status_nan_perc, 2)}%)")

demand_status has the highest NaNs in dataset (81.62%)


_We can see that **demand_status** has the highest NaNs in the dataset but we'll leave it like that to see if we can get some extra insights for when we do have demand statuses available_

### Field check & cleanup

### Categoricals

#### Country

In [10]:
# check if all columns have the "from" str
country_len = len(df["country"])
print(f"total records in country: {country_len}")

total records in country: 10020


In [12]:
from_counter = 0
for index, row in df.iterrows():
    if "from" in row["country"]:
        from_counter += 1

print(f"total rows with 'from' string in country columb: {from_counter}")

total rows with 'from' string in country columb: 10020


In [14]:
df["country"] = df["country"].map(lambda x: x.strip("from "))
df.head()

Unnamed: 0,product_name,price,country,condition,shipping_cost,demand_status,url
0,🔥Star Wars Vintage Collection ARC Commander Ha...,$32.90,Malaysia,Brand New,+$18.00 shipping,Last one,https://www.ebay.com/itm/144937682104?hash=ite...
1,Star Wars Vintage Collection Mandalorian Super...,$19.99,United States,Brand New,+$13.35 shipping,85 sold,https://www.ebay.com/itm/394251731939?epid=190...
2,Star Wars Vintage Collection Dark Trooper (The...,$28.99,United States,Brand New,+$18.70 shipping,71 watchers,https://www.ebay.com/itm/125700558379?hash=ite...
3,Star Wars Vintage Collection Deathwatch Mandal...,$15.49,United States,Brand New,+$13.28 shipping,Almost gone,https://www.ebay.com/itm/125636022875?hash=ite...
4,STAR WARS Vintage Collection VC34 Jango Fett A...,$141.50,United States,Brand New,+$25.17 shipping,,https://www.ebay.com/itm/295529080278?hash=ite...


In [15]:
country_freqs = df.country.value_counts()
df_country_freqs = pd.DataFrame(country_freqs).reset_index()
df_country_freqs

Unnamed: 0,index,country
0,United States,9031
1,Australia,429
2,Taiwan,208
3,Canada,166
4,Japan,59
5,Malaysia,28
6,United Kingd,23
7,Greece,13
8,Singapore,12
9,Hong Kong,11


#### Condition

In [11]:
# get frequency of conditions
condition_freqs = df.condition.value_counts()
df_condition_freqs = pd.DataFrame(condition_freqs).reset_index()
df_condition_freqs

Unnamed: 0,index,condition
0,Brand New,8671
1,Pre-Owned,1349


### Numericals

#### Price

#### Shipping Cost