In [97]:
# import dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn import preprocessing
import tensorflow as tf
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder

In [98]:
# Import and read the file:
df = pd.read_csv("resources/cleaned_superstore_data.csv", index_col=[0])
df.head(3)

Unnamed: 0,Order_Date,Ship_Date,Ship_Mode,Segment,City,State,Country,Region,Market,Category,Sub_Category,Product_Name,Sales,Quantity,Discount,Profit,Shipping_Cost,Order Priority
0,2014-11-11,2014-11-13,First Class,Consumer,Oklahoma City,Oklahoma,United States,Central US,USCA,Technology,Phones,Samsung Convoy 3,221.98,2,0.0,62.15,40.77,High
1,2014-02-05,2014-02-07,Second Class,Corporate,Wollongong,New South Wales,Australia,Oceania,Asia Pacific,Furniture,Chairs,"Novimex Executive Leather Armchair, Black",3709.4,9,0.1,-288.77,923.63,Critical
2,2014-10-17,2014-10-18,First Class,Consumer,Brisbane,Queensland,Australia,Oceania,Asia Pacific,Technology,Phones,"Nokia Smart Phone, with Caller ID",5175.17,9,0.1,919.97,915.49,Medium


In [99]:
# create a group using groupby for country
countries = df.groupby("Country")
# size of group to count observations
countries = countries.size()
# make a column name 
country_df = countries.reset_index(name='Observations')
country_df

Unnamed: 0,Country,Observations
0,Afghanistan,55
1,Albania,16
2,Algeria,196
3,Angola,122
4,Argentina,390
...,...,...
160,Vietnam,263
161,Western Sahara,2
162,Yemen,30
163,Zambia,102


In [100]:
# sort by top 20 country observations
country_df = country_df.sort_values('Observations', ascending=False).head(20)
country_df.reset_index(drop=True, inplace=True)

In [101]:
# check new dataframe
country_df.head(20)

Unnamed: 0,Country,Observations
0,United States,9994
1,Australia,2837
2,France,2827
3,Mexico,2635
4,Germany,2063
5,China,1880
6,United Kingdom,1633
7,Brazil,1593
8,India,1554
9,Indonesia,1390


In [102]:
# make the new country csv into resources folder
country_df.to_csv("resources/country_superstore_observations.csv")

In [103]:
# create a group using groupby for city
cities = df.groupby("City")
# size of group to count observations
cities = cities.size()
# make a column name 
cities_df = cities.reset_index(name='Observations')
cities_df

Unnamed: 0,City,Observations
0,Aachen,17
1,Aalen,1
2,Aalst,4
3,Aba,25
4,Abadan,11
...,...,...
3645,Zwedru,1
3646,Zwickau,3
3647,Zwolle,2
3648,eMbalenhle,2


In [104]:
# sort observations per city to top 20
cities_df = cities_df.sort_values('Observations', ascending=False).head(20)
cities_df.reset_index(drop=True, inplace=True)

In [105]:
# check new dataframe
cities_df.head(20)

Unnamed: 0,City,Observations
0,New York City,915
1,Los Angeles,747
2,Philadelphia,537
3,San Francisco,510
4,Santo Domingo,443
5,Manila,432
6,Seattle,428
7,Houston,377
8,Tegucigalpa,362
9,Jakarta,337


In [106]:
# save csv of cities observations
cities_df.to_csv("resources/cities_superstore_observations.csv")

In [107]:
# create a group using groupby for city
states = df.groupby("State")
# size of group to count observations
states = states.size()
states_df = states.reset_index(name='Observations')
states_df

Unnamed: 0,State,Observations
0,'Ajman,4
1,'Amman,40
2,'Asir,8
3,Abia,25
4,Abruzzi,18
...,...,...
1097,Zinder,4
1098,Zulia,36
1099,Zürich,34
1100,Šiauliai,4


In [108]:
# sort observations per state to top 20
states_df = cities_df.sort_values('Observations', ascending=False).head(20)
states_df.reset_index(drop=True, inplace=True)

In [109]:
# check new dataframe
states_df.head(20)

Unnamed: 0,City,Observations
0,New York City,915
1,Los Angeles,747
2,Philadelphia,537
3,San Francisco,510
4,Santo Domingo,443
5,Manila,432
6,Seattle,428
7,Houston,377
8,Tegucigalpa,362
9,Jakarta,337


In [110]:
# create csv of cities observations
states_df.to_csv("resources/states_superstore_observations.csv")

In [112]:
# create a group using groupby for region
regions = df.groupby("Region")
# size of group to count observations
regions = regions.size()
regions_df = regions.reset_index(name='Observations')
regions_df

Unnamed: 0,Region,Observations
0,Canada,384
1,Caribbean,1690
2,Central Africa,643
3,Central America,5616
4,Central Asia,217
5,Central US,2323
6,Eastern Africa,728
7,Eastern Asia,2374
8,Eastern Europe,1529
9,Eastern US,2848


In [113]:
# sort observations per region to top 20
regions_df = regions_df.sort_values('Observations', ascending=False).head(20)
regions_df.reset_index(drop=True, inplace=True)

In [114]:
# check new dataframe
regions_df.head(20)

Unnamed: 0,Region,Observations
0,Western Europe,5883
1,Central America,5616
2,Oceania,3487
3,Western US,3203
4,Southeastern Asia,3129
5,South America,2988
6,Eastern US,2848
7,Southern Asia,2655
8,Western Asia,2440
9,Eastern Asia,2374


In [115]:
# create csv of regions observations
regions_df.to_csv("resources/region_superstore_observations.csv")

In [116]:
# create a group using groupby for market
market = df.groupby("Market")
# size of group to count observations
market = market.size()
market_df = market.reset_index(name='Observations')
market_df

Unnamed: 0,Market,Observations
0,Africa,4587
1,Asia Pacific,14302
2,Europe,11729
3,LATAM,10294
4,USCA,10378


In [117]:
# sort observations per region to top 5(only 5 markets)
market_df = market_df.sort_values('Observations', ascending=False).head(20)
market_df.reset_index(drop=True, inplace=True)

In [119]:
# check new dataframe
market_df.head()

Unnamed: 0,Market,Observations
0,Asia Pacific,14302
1,Europe,11729
2,USCA,10378
3,LATAM,10294
4,Africa,4587


In [120]:
# save new csv to resources folder
market_df.to_csv("resources/market_superstore_observations.csv")