In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 81)

In [2]:
input_file = "../data/Kickstarter.pkl"
df_kickstarter = pd.read_pickle(input_file)

**Show head**

In [4]:
df_kickstarter.head()

Unnamed: 0,slug,name,blurb,category_lvl1,category_lvl2,country,created_at,deadline,launched_at,usd_goal,usd_pledged,state,state_changed_at,backers_count,user_id,project_url
0,louli-a-lecole,Louli à l’école,Un livre enfant pour l’apprentissage des émoti...,publishing,children's books,FR,2019-06-26 13:14:09,2019-08-25 14:46:37,2019-06-26 14:46:37,684.396336,719.756813,successful,2019-08-25 14:46:37,31,469036700,https://www.kickstarter.com/projects/469036700...
1,strange-wit-an-original-graphic-novel-about-ja...,"Strange Wit, an original graphic novel about J...","The true biography of the historical figure, w...",comics,graphic novels,US,2015-08-05 02:11:53,2015-09-14 04:19:27,2015-08-15 04:19:27,12000.0,14740.63,successful,2015-09-14 04:19:28,403,1695121020,https://www.kickstarter.com/projects/katyrex/s...
2,fam-find-a-motive-mobile-app,FAM - FIND A MOTIVE MOBILE APP,FAM is the new mobile app which combines event...,technology,apps,GB,2018-06-07 19:53:22,2018-08-18 15:43:54,2018-06-19 15:43:54,132713.121,14.598443,failed,2018-08-18 15:43:54,2,67455724,https://www.kickstarter.com/projects/findamoti...
3,destiny-ny-a-graphic-novel,"Destiny, NY - FINAL HOURS!",A graphic novel about two magical ladies in love.,comics,graphic novels,US,2016-09-06 08:33:56,2016-11-03 00:00:00,2016-10-04 07:22:10,20000.0,21799.0,successful,2016-11-03 00:00:00,406,248241887,https://www.kickstarter.com/projects/patshand/...
4,publishing-magus-magazine,Publishing Magus Magazine,We are publishing a magazine that focuses on t...,publishing,periodicals,US,2011-07-15 03:55:33,2011-10-04 17:04:28,2011-09-04 17:04:28,5000.0,10.0,failed,2011-10-04 17:04:28,1,1345074053,https://www.kickstarter.com/projects/134507405...


**Find the total number of rows and columns with the shape attribute**

In [20]:
df_kickstarter.shape

(1269029, 16)

**Show data types**

In [21]:
df_kickstarter.dtypes

slug                        object
name                        object
blurb                       object
category_lvl1               object
category_lvl2               object
country                     object
created_at          datetime64[ns]
deadline            datetime64[ns]
launched_at         datetime64[ns]
usd_goal                   float64
usd_pledged                float64
state                       object
state_changed_at    datetime64[ns]
backers_count                int64
user_id                     object
project_url                 object
dtype: object

**Show # of missing values per column**

In [22]:
df_kickstarter.isna().sum()

slug                    0
name                    0
blurb                  48
category_lvl1           0
category_lvl2       51339
country                 0
created_at              0
deadline                0
launched_at             0
usd_goal                0
usd_pledged             0
state                   0
state_changed_at        0
backers_count           0
user_id              3466
project_url             0
dtype: int64

In [23]:
df_kickstarter.isna().describe()

Unnamed: 0,slug,name,blurb,category_lvl1,category_lvl2,country,created_at,deadline,launched_at,usd_goal,usd_pledged,state,state_changed_at,backers_count,user_id,project_url
count,1269029,1269029,1269029,1269029,1269029,1269029,1269029,1269029,1269029,1269029,1269029,1269029,1269029,1269029,1269029,1269029
unique,1,1,2,1,2,1,1,1,1,1,1,1,1,1,2,1
top,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
freq,1269029,1269029,1268981,1269029,1217690,1269029,1269029,1269029,1269029,1269029,1269029,1269029,1269029,1269029,1265563,1269029


**Show only columns with missing values**

In [26]:
null_columns = df_kickstarter.isna().sum()
null_columns[null_columns.gt(0)]

blurb               48
category_lvl2    51339
user_id           3466
dtype: int64

**Calculating percentage of missing values**

In [28]:
null_prop = null_columns[null_columns.gt(0)] / len(df_kickstarter)
null_columns[null_columns.gt(0)] / len(df_kickstarter)

blurb            0.000038
category_lvl2    0.040455
user_id          0.002731
dtype: float64

## Analysis of Categories (lvl1/lvl2)

**Show absolute/relativ value counts**

In [33]:
#lvl1 - absolute value counts
df_kickstarter.category_lvl1.value_counts()

film & video    166097
music           161979
technology      129804
art             125630
publishing      124447
food             98273
games            82587
fashion          74782
comics           54126
design           52022
photography      50405
crafts           45101
theater          43140
journalism       35927
dance            24709
Name: category_lvl1, dtype: int64

In [32]:
#lvl1 - relative value counts
df_kickstarter.category_lvl1.value_counts(normalize=True)

film & video    0.130885
music           0.127640
technology      0.102286
art             0.098997
publishing      0.098065
food            0.077440
games           0.065079
fashion         0.058929
comics          0.042652
design          0.040994
photography     0.039719
crafts          0.035540
theater         0.033994
journalism      0.028311
dance           0.019471
Name: category_lvl1, dtype: float64

In [35]:
#lvl2 - absolute value counts
df_kickstarter.category_lvl2.value_counts()

web                27811
product design     24413
tabletop games     24279
accessories        21831
comic books        21215
                   ...  
quilts               662
letterpress          355
chiptune             290
social practice      134
taxidermy            110
Name: category_lvl2, Length: 145, dtype: int64

In [36]:
#lvl2 - relative value counts
df_kickstarter.category_lvl2.value_counts(normalize=True)

web                0.022839
product design     0.020049
tabletop games     0.019939
accessories        0.017928
comic books        0.017422
                     ...   
quilts             0.000544
letterpress        0.000292
chiptune           0.000238
social practice    0.000110
taxidermy          0.000090
Name: category_lvl2, Length: 145, dtype: float64

In [17]:
df_kickstarter.groupby(['category_lvl1']).count()

Unnamed: 0_level_0,slug,name,blurb,category_lvl2,country,created_at,deadline,launched_at,usd_goal,usd_pledged,state,state_changed_at,backers_count,user_id,project_url
category_lvl1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
art,125630,125630,125612,119514,125630,125630,125630,125630,125630,125630,125630,125630,125630,125314,125630
comics,54126,54126,54126,52097,54126,54126,54126,54126,54126,54126,54126,54126,54126,53932,54126
crafts,45101,45101,45095,37944,45101,45101,45101,45101,45101,45101,45101,45101,45101,45025,45101
dance,24709,24709,24709,16147,24709,24709,24709,24709,24709,24709,24709,24709,24709,24613,24709
design,52022,52022,52022,50768,52022,52022,52022,52022,52022,52022,52022,52022,52022,51928,52022
fashion,74782,74782,74776,73621,74782,74782,74782,74782,74782,74782,74782,74782,74782,74636,74782
film & video,166097,166097,166097,164515,166097,166097,166097,166097,166097,166097,166097,166097,166097,165595,166097
food,98273,98273,98267,95379,98273,98273,98273,98273,98273,98273,98273,98273,98273,98007,98273
games,82587,82587,82587,82021,82587,82587,82587,82587,82587,82587,82587,82587,82587,82306,82587
journalism,35927,35927,35927,31581,35927,35927,35927,35927,35927,35927,35927,35927,35927,35795,35927


In [56]:
df_kickstarter[['category_lvl1','category_lvl2']].groupby(['category_lvl1','category_lvl2']).count()

category_lvl1,category_lvl2
art,ceramics
art,conceptual art
art,digital art
art,illustration
art,installations
...,...
theater,festivals
theater,immersive
theater,musical
theater,plays


In [64]:
df_kickstarter.groupby(['category_lvl1']).describe()

Unnamed: 0_level_0,usd_goal,usd_goal,usd_goal,usd_goal,usd_goal,usd_goal,usd_goal,usd_goal,usd_pledged,usd_pledged,usd_pledged,usd_pledged,usd_pledged,usd_pledged,usd_pledged,usd_pledged,backers_count,backers_count,backers_count,backers_count,backers_count,backers_count,backers_count,backers_count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
category_lvl1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
art,125630.0,29806.739584,1096508.0,0.01,600.0,2100.0,6000.0,100000000.0,125630.0,4187.7412,25743.506722,0.0,103.0,817.0,3228.309883,1924018.0,125630.0,57.56348,260.191623,0.0,3.0,19.0,54.0,22840.0
comics,54126.0,6365.394218,20617.56,0.511428,1000.0,2700.0,6000.0,1000000.0,54126.0,9034.325066,24257.830259,0.0,1045.0,2947.858225,7082.0,665725.9,54126.0,189.566438,367.302487,0.0,33.0,82.0,188.0,7199.0
crafts,45101.0,8779.383529,119151.9,1.0,500.0,1775.347524,5422.917885,10000000.0,45101.0,3042.292861,10668.101436,0.0,25.0,327.270581,2002.0,454717.5,45101.0,45.898007,190.099472,0.0,2.0,9.0,37.0,12705.0
dance,24709.0,7732.044572,42505.93,5.0,1800.0,3500.0,6500.0,2000000.0,24709.0,4536.244355,6380.06346,0.0,1050.0,2888.0,5525.0,146076.0,24709.0,53.569064,88.586481,0.0,17.0,37.0,67.0,4133.0
design,52022.0,59198.88533,1444507.0,0.793289,1500.0,5222.81536,15000.0,100000000.0,52022.0,37985.750126,221709.902182,0.0,300.701072,3346.0,19133.0,12143440.0,52022.0,342.437988,1311.502158,0.0,8.0,49.0,194.0,38443.0
fashion,74782.0,13791.299831,180231.7,0.764666,744.41429,4000.0,11658.2591,17917507.8,74782.0,11854.707246,71027.669859,0.0,226.602258,1474.0,8156.103134,3948547.0,74782.0,118.028884,499.663011,0.0,6.0,31.0,93.0,21409.0
film & video,166097.0,109096.320117,2251055.0,0.766114,2000.0,5665.5747,17630.6364,152350076.0,166097.0,10471.951723,113224.613449,0.0,101.0,1793.0,7632.0,11385450.0,166097.0,109.42421,1079.362355,0.0,3.0,24.0,76.0,91585.0
food,98273.0,47295.036228,929596.1,1.0,4000.0,10000.0,25000.0,102344508.0,98273.0,6660.24701,28546.190763,0.0,20.0,401.37123,5580.0,1927217.0,98273.0,67.992633,219.522045,0.0,2.0,8.0,59.0,10308.0
games,82587.0,32250.480083,873716.0,0.761533,1447.703387,5000.0,15000.0,100000000.0,82587.0,31879.955136,188567.212431,0.0,282.379863,3095.093893,13498.843434,8596475.0,82587.0,503.1492,2220.290179,0.0,9.0,72.0,305.0,73206.0
journalism,35927.0,78497.842608,2257801.0,1.0,1488.62024,5000.0,13197.49875,100000000.0,35927.0,4212.782231,17185.592134,0.0,1.311388,100.0,1950.742128,706489.2,35927.0,60.519943,218.441064,0.0,1.0,4.0,32.0,3143.0


In [63]:
df_kickstarter.groupby(['category_lvl1']).describe().transpose()

Unnamed: 0,category_lvl1,art,comics,crafts,dance,design,fashion,film & video,food,games,journalism,music,photography,publishing,technology,theater
usd_goal,count,125630.0,54126.0,45101.0,24709.0,52022.0,74782.0,166097.0,98273.0,82587.0,35927.0,161979.0,50405.0,124447.0,129804.0,43140.0
usd_goal,mean,29806.74,6365.394218,8779.384,7732.045,59198.89,13791.3,109096.3,47295.04,32250.48,78497.84,11208.88,11835.07,12863.23,72031.53,32658.9
usd_goal,std,1096508.0,20617.557751,119151.9,42505.93,1444507.0,180231.7,2251055.0,929596.1,873716.0,2257801.0,136859.4,112198.6,285621.2,1128264.0,650837.7
usd_goal,min,0.01,0.511428,1.0,5.0,0.7932894,0.7646657,0.7661145,1.0,0.7615334,1.0,0.6436,1.0,1.0,0.7022768,1.0
usd_goal,25%,600.0,1000.0,500.0,1800.0,1500.0,744.4143,2000.0,4000.0,1447.703,1488.62,1672.044,1200.0,1200.0,5000.0,1458.396
usd_goal,50%,2100.0,2700.0,1775.348,3500.0,5222.815,4000.0,5665.575,10000.0,5000.0,5000.0,4000.0,4000.0,3500.0,15000.0,3301.088
usd_goal,75%,6000.0,6000.0,5422.918,6500.0,15000.0,11658.26,17630.64,25000.0,15000.0,13197.5,8000.0,10000.0,8000.0,44749.37,7771.976
usd_goal,max,100000000.0,1000000.0,10000000.0,2000000.0,100000000.0,17917510.0,152350100.0,102344500.0,100000000.0,100000000.0,21000000.0,7300000.0,25000000.0,100000000.0,40000000.0
usd_pledged,count,125630.0,54126.0,45101.0,24709.0,52022.0,74782.0,166097.0,98273.0,82587.0,35927.0,161979.0,50405.0,124447.0,129804.0,43140.0
usd_pledged,mean,4187.741,9034.325066,3042.293,4536.244,37985.75,11854.71,10471.95,6660.247,31879.96,4212.782,5228.147,6603.051,6927.266,39217.71,5027.712
