In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
df = pd.read_csv('../input/kickstarter-campaigns/Kickstarter_projects_Feb19.csv')

In [None]:
df.head()

In [None]:
df.info()

# Data Cleaning

In [None]:
# first and foremost is to check whether there are any null values
df.isnull().sum()

In [None]:
# Looks like our dataset has no null values

In [None]:
#next step would be to check if our dataset has any duplicated values

#The first thing to check for duplication would if the ids and campaign names are repeated again and again

In [None]:
# Boolean mask: True for every row whose id has already appeared earlier in the frame.
dup_id = df['id'].duplicated()

In [None]:
dup_id.sum()

In [None]:
#looks like we have entries with duplicated ids, which means they are repeated

In [None]:
#lets check using campaign names 

In [None]:
# Boolean mask: True for every row whose campaign name has already appeared earlier.
dup_names = df['name'].duplicated()

In [None]:
dup_names.sum()

In [None]:
#So we also have duplicate entries based of names 

In [None]:
#But when considering both names and ids: the id is the unique identifier of a campaign, whereas the same name can appear under different ids

# So I'll remove the entries with duplicate ids

In [None]:
# Keep the first row seen for each id and discard later repeats.
df = df.drop_duplicates(subset='id', keep='first')

In [None]:
df.info()

In [None]:
# Rows whose id is still duplicated; expected to be empty after de-duplication.
df.loc[df['id'].duplicated()]

In [None]:
# Rows whose campaign name repeats a name seen earlier in the frame.
df.loc[df['name'].duplicated()]

In [None]:
# We can see that we have 514 records that have duplicate names as of other campaigns

# Let us confirm that although they have the same name, they have different ids.

In [None]:
df[df['name'] == 'Animal Crossing Enamel Pins']

In [None]:
df[df['name'] == 'XOX']

### Thus we have removed the rows with duplicate ids. There are still campaigns that share the same name, but they have different ids, which tells us that campaign names are not required to be unique. So as of now we can say that we have unique records, and maybe the campaigns with the same name were inspired by the campaign from which they took the name.

In [None]:
df.info()

In [None]:
# The date columns also contain time information; we can keep just the date part and then convert it to datetime format

In [None]:
# Keep only the date part (the text before the first space) of each timestamp string.
# `.str.split(...).str[0]` yields a single Series per column; assigning the
# multi-column frame produced by `expand=True` to one column raises a ValueError
# on recent pandas releases.
df["launched_at"] = df["launched_at"].str.split(" ", n=1).str[0]
df["deadline"] = df["deadline"].str.split(" ", n=1).str[0]

In [None]:
df.head()

In [None]:
# Parse both date columns from strings into datetime values.
for date_col in ('deadline', 'launched_at'):
    df[date_col] = pd.to_datetime(df[date_col])

In [None]:
# Another thing here is that we can seperate out the date and extract year and month which will help us in analysis

In [None]:
# Also we can have a seperate column telling us the length of the campaigns in days which will help us in analysis

In [None]:
# Campaign length in whole days, measured from launch date to deadline.
campaign_length = df['deadline'] - df['launched_at']
df['campaign span days'] = campaign_length.dt.days

In [None]:
df.head()

In [None]:
df['launched_at'].dt.year

In [None]:
df['launched year'] = df['launched_at'].dt.year

In [None]:
df.head()

In [None]:
# Round both monetary columns to two decimal places.
df['goal_usd'] = df['goal_usd'].round(2)
df['usd_pledged'] = df['usd_pledged'].round(2)

In [None]:
df.head()

In [None]:
df.shape

## Now we have a cleaned dataset with no null values and no duplicated records, so we can start analysing our data and visualizing the results

In [None]:
main_cat_values = df['main_category'].value_counts()
main_cat_values

In [None]:
#Lets see the counts of main category of projects

In [None]:
# Horizontal bar chart of the number of campaigns in each main category.
sns.set_style('whitegrid')
plt.figure(figsize=(9,6),dpi=100)
main_cat_values = df['main_category'].value_counts()

# x and y are passed as ready-made vectors, so no `data=` frame is given:
# the full `df` has a different length than the per-category counts.
sns.barplot(x=main_cat_values.values,y=main_cat_values.index)

plt.xlabel('Count')
plt.ylabel('Main Campaign category')
plt.title('Campaign - Main category');

# Music seems to be the most popular Kickstarter campaign category

In [None]:
#Lets have a look on the sub categories also

In [None]:
# The 15 sub categories with the most campaigns.
sub_cat = df['sub_category'].value_counts().nlargest(15)

In [None]:
plt.figure(figsize=(9,6),dpi=100)

sns.barplot(x=sub_cat.values,y=sub_cat.index)

In [None]:
df[df['sub_category'] == 'Web']

#### The Web sub category from technology turns out to be the one with the most Kickstarter campaigns, with 3600 campaigns

In [None]:
df.head()

In [None]:
df['launched year'].value_counts()

In [None]:
# lets have a look at the country distribution

countries = df['country'].value_counts()
plt.figure(figsize=(9,7),dpi=90)

sns.barplot(y=countries.values,x=countries.index)

print('The country having the most no of kickstarter campaings is',countries.index[0],'with',countries.values[0],'campaigns')

In [None]:
#lets have a look on the currency distribution
curr = df['currency'].value_counts()
plt.figure(figsize=(9,7),dpi=90)
sns.barplot(y=curr.values,x=curr.index,palette='viridis')

print('The most used currency for pledging is:-',curr.index[0])

In [None]:
df.head()

In [None]:
# Success rate of campaigns: pie chart of the share of each campaign status.
result=df['status'].value_counts()
fig = plt.figure(dpi=120)
# Take the slice labels from the counts themselves so every label always matches
# its slice, whichever status happens to be the more frequent one.
label = list(result.index.str.capitalize())
sizes = result.values

plt.pie(sizes,labels = label,autopct='%1.1f%%');
plt.title('Success rate of Kickstarter campaings')

print('More than half of the projects have succeded in raising the funds for their projects')

In [None]:
plt.figure(figsize=(9,7),dpi=100)

sns.countplot(data=df,y='main_category',hue='status')

### All over the world music is the most successful category, and the food and film & video categories seem to have the most failures. But the above graph cannot be taken at face value, because more than half of the data belongs to the US; thus, to get a clear view, we will have to analyze the US data and the data of the rest of the countries separately

In [None]:
# Successful vs. failed campaign counts for each launch quarter.
plt.figure(figsize=(9,7),dpi=100)

sns.countplot(data = df, x='start_Q',hue='status')

# The x axis is the launch quarter, so the title names the quarter.
plt.title('Successful and failed campaigns by start quarter')

# Place the legend outside the axes so it does not cover the bars.
plt.legend(bbox_to_anchor=(1.25,1));

### Looking at the quarter in which the campaigns were launched, in every quarter the successful campaigns outnumber the failed ones

In [None]:
#lets see the top 5 successful main categories which have been successful in raising the goal funding

success_projects = df[df['status'] == 'successful']


In [None]:
# Total USD pledged to successful campaigns per main category, largest first.
# The sorted result is assigned to `success_cat`, so the table displayed here
# (and anything plotted from it) is the ranked one.
pd.set_option('display.float_format', lambda x: '%.3f' % x)
success_cat = (
    success_projects.groupby('main_category')['usd_pledged']
    .sum()
    .sort_values(ascending=False)
    .head(30)
)
success_cat

In [None]:
plt.figure(figsize=(9,7),dpi=100)

sns.barplot(y=success_cat.index,x=success_cat.values)

In [None]:
# Successful campaigns only, then total USD pledged per main category
# as a one-column frame ordered from the largest total downwards.
is_successful = df['status'] == 'successful'
success_projects = df[is_successful]
pledged_by_cat = success_projects.groupby('main_category')['usd_pledged'].sum()
success_cat = (
    pledged_by_cat.to_frame()
    .reset_index()
    .sort_values('usd_pledged', ascending = False)
    .head(30)
    .set_index('main_category')
)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
success_cat

In [None]:
# Per-category totals of the numeric columns of the successful campaigns.
# `numeric_only=True` keeps the date and text columns out of the sum; recent
# pandas no longer drops them silently and fails on datetime columns.
sux_cat = success_projects.groupby('main_category').sum(numeric_only=True)

In [None]:
sux_cat

###### Now we know among the category of successfull campaigns which category has most donations

In [None]:
success_projects.columns

# Let's see details about the successful campaigns of US

In [None]:
# Success rate of campaigns launched from the US.
usa = df[df['country'] == 'US']
result=usa['status'].value_counts()
fig = plt.figure(dpi=120)
# Labels come from the counts' own index so each slice is always named after
# the status it represents, regardless of which status is more frequent.
label = list(result.index.str.capitalize())
sizes = result.values

plt.pie(sizes,labels = label,autopct='%1.1f%%');
plt.title('Success rate of Kickstarter campaings US')

print('In USA more than half of the projects have succeded in raising the funds for their projects')

In [None]:
us_success_ks = success_projects[success_projects['country'] == 'US']

In [None]:
us_success_ks.head()

In [None]:
goal = us_success_ks['goal_usd'].mean()
goal

In [None]:
achieved = us_success_ks['usd_pledged'].mean()
achieved

In [None]:
# Ratio of the mean pledged amount to the mean goal of successful US campaigns,
# computed from the values above so it stays correct if the data changes.
achieved / goal

# Thus, for the successful projects based in the US, the average amount pledged was about "2.25" times the average goal USD

In [None]:
us_success_ks['launched year'].value_counts()

### Most of the successful projects in the US were launched in 2018 and 2015

In [None]:
plt.figure(figsize=(9,7),dpi=80)

sns.countplot(data=us_success_ks,x='launched year')

plt.title('Launch year of Successful Kickstarter campaign in US');

In [None]:
us_success_ks['start_Q'].value_counts()

### The quarter in which the campaigns were launched does not seem to have much impact on the success of the campaigns

In [None]:
us_success_ks['start_month'].value_counts()

### Also the start month does not seem to have much impact on the success of the campaigns

In [None]:
# Most successful category of KS campaigns
plt.figure(figsize=(9,7),dpi=100)
us_s_cat = us_success_ks['main_category'].value_counts()
sns.barplot(x=us_s_cat.values,y=us_s_cat.index)

plt.xlabel('Count')
plt.ylabel('Campaign categories')
plt.title('The most successfull campaigns categories in US');

# Thus in US out of the KS campaigns of music category seems to most successful

In [None]:
us_s_cat.sort_values(ascending=False)

In [None]:
# Total goal and pledged USD of successful US campaigns per main category.
# The two money columns are selected before summing so that the date and text
# columns never take part in the aggregation.
money_us_pledged = us_success_ks.groupby('main_category')[['goal_usd','usd_pledged']].sum()
pd.set_option('display.float_format', lambda x: '%.3f' % x)

money_us_pledged

In [None]:
plt.figure(figsize=(9,7),dpi=90)   

sns.barplot(data=money_us_pledged,y=money_us_pledged.index,x='goal_usd')

plt.title('Goal USD of the successful Kickstarter campaigns in US');


In [None]:
plt.figure(figsize=(9,7),dpi=90)

sns.barplot(data=money_us_pledged,y=money_us_pledged.index,x='usd_pledged')

plt.title('USD pledged by successful Kickstarter campaigns in US')

## Though music happens to be the main category with the most successful campaigns, technology has raked in more donations. So funding-wise, technology is the most successful category in the US, while music is the most successful category by number of successful campaigns.

## But the success of any campaign is defined by whether it reached its funding goal, so in that sense music remains the top successful category of KS campaigns in the US.

## A possible reason for technology being funded more is that the US, as one of the most developed countries in the world, has long been a home of technology-driven products

In [None]:
# Goal and pledged USD totals of successful US campaigns per main category and
# launch quarter. Only the two money columns are aggregated, so the date and
# text columns are never summed.
pd.set_option('display.float_format', lambda x: '%.3f' % x)
us_success_ks.groupby(['main_category','start_Q'])[['goal_usd','usd_pledged']].sum()

In [None]:
# Mean `duration` and `campaign span days` of successful US campaigns per main
# category. The columns are selected first because averaging the whole frame
# fails on its text columns in recent pandas.
us_success_ks.groupby('main_category')[['duration','campaign span days']].mean()

In [None]:
# Mean of every numeric column of successful US campaigns, grouped by campaign
# length in days. `numeric_only=True` keeps text and date columns out of the mean.
us_success_ks.groupby('campaign span days').mean(numeric_only=True)

In [None]:
# Top 5 cities by number of successful US campaigns.
us_success_ks['city'].value_counts().sort_values(ascending=False).head(5)

### Los Angeles and New York have the most successful KS campaigns in US

## Now we know that Music was the most successful KS campaign category in US by no of successful campaigns, let's see the top sub category of Music

In [None]:
us_succ_ks_main_cat =  us_success_ks[us_success_ks['main_category'] == 'music']

In [None]:
us_succ_ks_sub_cat_val = us_succ_ks_main_cat['sub_category'].value_counts()
us_succ_ks_sub_cat_val

### Indie Rock is the most successful sub category of KS campaigns in Music in US

In [None]:
plt.figure(figsize=(9,7),dpi=90)

sns.barplot(x=us_succ_ks_sub_cat_val.values,y=us_succ_ks_sub_cat_val.index)

plt.title('Sub category of most successful main category - Music campaigns in US')
plt.xlabel('Count')
plt.ylabel('Sub category');

In [None]:
us_succ_ks_sub_cat =  us_succ_ks_main_cat[us_succ_ks_main_cat['sub_category'] =='Indie Rock']

In [None]:
us_succ_ks_sub_cat.head()

In [None]:
# Goal and pledged USD totals of successful US music campaigns per sub category.
# Only the two money columns are summed, so the date and text columns are never
# part of the aggregation.
top_subcat_money_us_pledged = us_succ_ks_main_cat.groupby('sub_category')[['goal_usd','usd_pledged']].sum()
pd.set_option('display.float_format', lambda x: '%.3f' % x)

top_subcat_money_us_pledged

In [None]:
# Goal and pledged USD totals for the Indie Rock subset of successful US music
# campaigns. Only the two money columns are summed, so the date and text columns
# are never part of the aggregation.
top_subcat_money_us_pledged = us_succ_ks_sub_cat.groupby('sub_category')[['goal_usd','usd_pledged']].sum()
pd.set_option('display.float_format', lambda x: '%.3f' % x)

top_subcat_money_us_pledged

# Lets look at the unsucessful campaigns in US

In [None]:
failed_ks = df[df['status'] == 'failed']

In [None]:
failed_ks_us = failed_ks[failed_ks['country'] == 'US']

In [None]:
failed_ks_us

In [None]:
failed_goal = failed_ks_us['goal_usd'].mean()
failed_goal

In [None]:
failed_pledged = failed_ks_us['usd_pledged'].mean()
failed_pledged

### It is clearly visible that the failed campaigns in the US were not able to reach their goal USD

In [None]:
failed_ks_us['launched year'].value_counts()

In [None]:
plt.figure(figsize=(9,7),dpi=100)
sns.countplot(data=failed_ks_us,x='launched year')

plt.title('Year of failed kickstarters in US');

# The year 2015 saw most no of failed kickstarters in US.

In [None]:
fail_cat_us = failed_ks_us['main_category'].value_counts()
fail_cat_us

In [None]:
# Lets see the category of campaigns failed in us

plt.figure(figsize=(9,7),dpi=90)

sns.barplot(x=fail_cat_us.values,y=fail_cat_us.index)
plt.title('Count - Failed KS category US ')

### The most failed kickstart category in US are film&video & food

In [None]:
# Goal and pledged USD totals of failed US campaigns per main category.
# Only the two money columns are summed, so the date and text columns are never
# part of the aggregation.
money_us_pledged_fail = failed_ks_us.groupby('main_category')[['goal_usd','usd_pledged']].sum()
pd.set_option('display.float_format', lambda x: '%.5f' % x)
money_us_pledged_fail

In [None]:
# Total goal USD of failed US campaigns, one bar per main category.
plt.figure(figsize=(9,7),dpi=90)

sns.barplot(data=money_us_pledged_fail,y=money_us_pledged_fail.index,x='goal_usd')
plt.title('Goal USD of the failed KS campaigns by main category in US')
# The x axis carries summed goal amounts in USD, not campaign counts.
plt.xlabel('Goal USD')
plt.ylabel('Main category');

In [None]:
# Total USD pledged to failed US campaigns, one bar per main category.
plt.figure(figsize=(9,7))

sns.barplot(data=money_us_pledged_fail,y=money_us_pledged_fail.index,x='usd_pledged')

plt.title('USD pledged by Failed KS campaigns by main category in US')
# The x axis carries summed pledged amounts in USD; the y axis lists main categories.
plt.xlabel('USD pledged')
plt.ylabel('Main category');

In [None]:
# Mean `duration` and `campaign span days` of failed US campaigns per main
# category. The columns are selected first because averaging the whole frame
# fails on its text columns in recent pandas.
failed_ks_us.groupby('main_category')[['duration','campaign span days']].mean()

In [None]:
# Goal and pledged USD totals of failed US campaigns per main category and
# launch quarter. Only the two money columns are aggregated.
pd.set_option('display.float_format', lambda x: '%.3f' % x)
failed_ks_us.groupby(['main_category','start_Q'])[['goal_usd','usd_pledged']].sum()


In [None]:
#These campaingns were also up online and working for most the span

In [None]:
failed_ks_us['start_month'].value_counts() #lets see in which months are the most failed projects launched

In [None]:
#the distribution looks like somewhat even

In [None]:
failed_ks_us['start_Q'].value_counts() #lets see in which quarter most failed projects were found

In [None]:
# Top 5 cities by number of failed US campaigns.
failed_ks_us['city'].value_counts().sort_values(ascending=False).head(5)

# Lets analyze the data of rest of the world - countries other than us

In [None]:
rest_of_world = df[df['country'] != 'US']

In [None]:
rest_of_world['country']

In [None]:
rest_of_world.head()

In [None]:
# Success rate of campaigns in countries other than the US.

result=rest_of_world['status'].value_counts()
fig = plt.figure(dpi=120)
# Labels come from the counts' own index so each slice is always named after
# the status it represents, regardless of which status is more frequent.
label = list(result.index.str.capitalize())
sizes = result.values

plt.pie(sizes,labels = label,autopct='%1.1f%%');

print('In countries other than USA More than half of the projects have succeded in raising the funds for their projects')

In [None]:
plt.figure(figsize=(9,7),dpi=100)

sns.countplot(data = rest_of_world, y='main_category',hue='status')

plt.title('Successful and failed campaigns by category for non US countries')

## From the above plot we can see that for the rest of the countries the most successfull ks campaign category was film&video

## The most failed ks campaigns category - technology

In [None]:
plt.figure(figsize=(9,7),dpi=100)

sns.countplot(data = rest_of_world, x='start_Q',hue='status')

plt.title('Successful and failed campaigns by Start Quarter of the year for non US countries')

plt.legend(bbox_to_anchor=(1.25,1))

#### As we can see from the above plot, the success of a campaign does not depend on which quarter it was launched in

In [None]:
plt.figure(figsize=(9,7),dpi=100)

sns.countplot(data = rest_of_world, x='launched year',hue='status')

plt.title('Successful and failed campaigns by launched year for non US countries')

plt.legend(bbox_to_anchor=(1.25,1));

## In Non US countries 2018 has been the most successful year for KS campaigns and the year 2015 has seen the highest number of failed KS campaigns

## Lets See the data about successfull campaigns in countries other than US

In [None]:
rest_success= rest_of_world[rest_of_world['status'] == 'successful']

In [None]:
rest_success

In [None]:
rest_su_cat = rest_success['main_category'].value_counts()
rest_su_cat

In [None]:
# Number of successful non-US campaigns per main category.
# dpi=90 keeps the 9x7 inch figure at a normal screen size, in line with the
# other figures in this notebook.
plt.figure(figsize=(9,7),dpi=90)

sns.barplot(x=rest_su_cat.values,y=rest_su_cat.index);

### As we saw earlier film&video is the most successful KS campaign in countries other than US

In [None]:
rest_su_year = rest_success['launched year'].value_counts()
rest_su_year

In [None]:
plt.figure(figsize=(9,7),dpi=100)

sns.barplot(y=rest_su_year.values,x=rest_su_year.index);

### Turns out in rest of the countries other than US the most successful projects were launched in the year 2018

In [None]:
# Goal and pledged USD totals of successful non-US campaigns per main category.
# Only the two money columns are summed, so the date and text columns are never
# part of the aggregation.
nonus_money_us_pledged = rest_success.groupby('main_category')[['goal_usd','usd_pledged']].sum()
pd.set_option('display.float_format', lambda x: '%.3f' % x)
nonus_money_us_pledged

In [None]:
plt.figure(figsize=(9,7),dpi=90)

sns.barplot(data = nonus_money_us_pledged,y=nonus_money_us_pledged.index,x=nonus_money_us_pledged.goal_usd)

plt.title('Goal USD of successful campaigns in Non-US countries by main category');

In [None]:
plt.figure(figsize=(9,7),dpi=90)

sns.barplot(data = nonus_money_us_pledged,y=nonus_money_us_pledged.index,x=nonus_money_us_pledged.usd_pledged)

plt.title('USD plegded by successful campaigns in Non-US countries by main category');

### Note that although the technology category raked in more funding than the top successful category, film & video, the success parameter here does not depend on the money pledged but on how many campaigns were successful in meeting their goal funding; in that sense the film & video category remains the most successful category

# Now let us see the top sub category of the most sucessful main category of campaign in countries other than US

In [None]:
non_us_top_maincat = rest_success[rest_success['main_category'] == 'film & video']

In [None]:
non_us_top_maincat

In [None]:
top_subcat_nonus = non_us_top_maincat['sub_category'].value_counts()
top_subcat_nonus

# Shorts (probably the short film category) is the top sub category of the most successful main category of Kickstarter campaigns in the non-US countries

In [None]:
# Goal and pledged USD totals of successful non-US film & video campaigns per
# sub category. Only the two money columns are summed.
top_subcat_nonus_curr = non_us_top_maincat.groupby('sub_category')[['goal_usd','usd_pledged']].sum()
pd.set_option('display.float_format', lambda x: '%.3f' % x)
top_subcat_nonus_curr

### We can see that although shorts has the highest number of successful campaigns, documentary KS campaigns have had more money pledged. But as we already saw above, success depends on achieving the goal funding.

In [None]:
plt.figure(figsize=(9,7),dpi=90)

sns.barplot(data = top_subcat_nonus_curr,y=top_subcat_nonus_curr.index,x=top_subcat_nonus_curr.usd_pledged)

plt.title('USD plegded of most successful sub category of the most sucessful main category of campaigns in countries other than US');

# Now let us analyze the failed Kickstarter campaigns of non US countries

In [None]:
rest_failed = rest_of_world[rest_of_world['status'] == 'failed']

In [None]:
rest_failed_year = rest_failed['launched year'].value_counts()
rest_failed_year

In [None]:
# It can be seen that out of all the campaigns launched in Non US countries 

In [None]:
plt.figure(figsize=(9,7),dpi=100)

sns.barplot(x=rest_failed_year.index,y=rest_failed_year.values)
plt.title('Launched year of failed Kickstarter campaigns in Non US countries')
plt.xlabel('Year')
plt.ylabel('Count')

### 2015 is the year most KS campaigns failed in Non US countries.

In [None]:
rest_fail_cat = rest_failed['main_category'].value_counts()
rest_fail_cat

In [None]:
rest_failed['goal_usd'].mean()

In [None]:
rest_failed['usd_pledged'].mean()

### Technology seems to be the top most failed category of KS campaigns in non US countries

In [None]:
plt.figure(figsize=(9,7),dpi=100)

sns.barplot(y=rest_fail_cat.index,x=rest_fail_cat.values)
plt.xlabel('Count')
plt.ylabel('Main category')
plt.title('Most failed category of KS campaigns categories in non US countries');

In [None]:
# Goal and pledged USD totals of failed non-US campaigns per main category.
# Only the two money columns are summed, so the date and text columns are never
# part of the aggregation.
top_rest_fail_cat = rest_failed.groupby('main_category')[['goal_usd','usd_pledged']].sum()
pd.set_option('display.float_format', lambda x: '%.3f' % x)
top_rest_fail_cat

### Now let us see the top sub category of the top main category of failed campaigns in countries other than US


In [None]:
top_sub_fail_cat = rest_failed[rest_failed['main_category'] == 'technology']

In [None]:
top_failed_subcat_nonus = top_sub_fail_cat['sub_category'].value_counts()
top_failed_subcat_nonus

### Web - is the sub category with most failed campaigns in the most failed main category campaigns in non US countries

In [None]:
plt.figure(figsize=(9,7),dpi=100)

sns.barplot(y=top_failed_subcat_nonus.index,x=top_failed_subcat_nonus.values)
plt.ylabel('sub category')
plt.xlabel('Count')
plt.title('Failed Kickstarter camapigns by sub category of top most failed KS campaign by main categories in non US countries');

In [None]:
# Goal and pledged USD totals of failed non-US technology campaigns per sub
# category. Only the two money columns are summed.
top_sub_fail_cat_curr = top_sub_fail_cat.groupby('sub_category')[['goal_usd','usd_pledged']].sum()
top_sub_fail_cat_curr