# Classification Challenge

The dataset here comes from a kaggle example on Kickstarter projects.  Your goal is the following:

- Load and explore the data
- Determine strategy for missing variables
- Build classifier to predict `state` column.  
- Compare and visualize the `ROC` curve for three different classifiers:
 - `LogisticRegression`
 - `KNeighborsClassifier`
 - `DecisionTreeClassifier`
 
- What did the `DecisionTreeClassifier` decide were the most important features?  Visualize the top five.
- Visualize a `DecisionTreeClassifier` with depth 3, and describe the results.

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('data/ks-projects.csv.zip', encoding='Windows-1252', compression='zip')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09 11:36:00,1000,2015-08-11 12:12:28,0,failed,0,GB,0,,,,
1,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26 00:20:50,45000,2013-01-12 00:20:50,220,failed,3,US,220,,,,
2,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16 04:24:11,5000,2012-03-17 03:24:11,1,failed,1,US,1,,,,
3,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29 01:00:00,19500,2015-07-04 08:35:03,1283,canceled,14,US,1283,,,,
4,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01 13:38:27,50000,2016-02-26 13:38:27,52375,successful,224,US,52375,,,,


In [5]:
df.isnull().sum()

ID                     0
name                   4
category               5
main_category          0
currency               0
deadline               0
goal                   0
launched               0
pledged                0
state                  0
backers                0
country                0
usd pledged         3790
Unnamed: 13       323125
Unnamed: 14       323738
Unnamed: 15       323746
Unnamed: 16       323749
dtype: int64

In [6]:
df.shape

(323750, 17)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 323750 entries, 0 to 323749
Data columns (total 17 columns):
ID                323750 non-null int64
name              323746 non-null object
category          323745 non-null object
main_category     323750 non-null object
currency          323750 non-null object
deadline          323750 non-null object
goal              323750 non-null object
launched          323750 non-null object
pledged           323750 non-null object
state             323750 non-null object
backers           323750 non-null object
country           323750 non-null object
usd pledged       319960 non-null object
Unnamed: 13       625 non-null object
Unnamed: 14       12 non-null object
Unnamed: 15       4 non-null object
Unnamed: 16       1 non-null float64
dtypes: float64(1), int64(1), object(15)
memory usage: 42.0+ MB


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

In [9]:
states = pd.get_dummies(df['state '])

In [10]:
states

Unnamed: 0,0,1,10,100,1010,103,1035,1056,10564,106,...,971,9748,9857.8,9951,canceled,failed,live,successful,suspended,undefined
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [11]:
df[df['state '] == 'failed'].shape

(168221, 17)

In [15]:
df[(df['state '] == 'failed') | (df['state '] == 'canceled') | (df['state '] == 'successful')].shape

(313656, 17)

In [16]:
fail = []
for entry in df['state ']:
    if entry == 'successful':
        fail.append(1)
    else:
        fail.append(0)

In [17]:
fail[:10]

[0, 0, 0, 0, 1, 1, 0, 0, 0, 0]

In [19]:
df['success'] = fail

In [20]:
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,sucess,success
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09 11:36:00,1000,2015-08-11 12:12:28,0,failed,0,GB,0,,,,,0,0
1,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26 00:20:50,45000,2013-01-12 00:20:50,220,failed,3,US,220,,,,,0,0
2,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16 04:24:11,5000,2012-03-17 03:24:11,1,failed,1,US,1,,,,,0,0
3,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29 01:00:00,19500,2015-07-04 08:35:03,1283,canceled,14,US,1283,,,,,0,0
4,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01 13:38:27,50000,2016-02-26 13:38:27,52375,successful,224,US,52375,,,,,1,1


In [21]:
df.drop(['sucess'], axis = 1)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,success
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09 11:36:00,1000,2015-08-11 12:12:28,0,failed,0,GB,0,,,,,0
1,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26 00:20:50,45000,2013-01-12 00:20:50,220,failed,3,US,220,,,,,0
2,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16 04:24:11,5000,2012-03-17 03:24:11,1,failed,1,US,1,,,,,0
3,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29 01:00:00,19500,2015-07-04 08:35:03,1283,canceled,14,US,1283,,,,,0
4,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01 13:38:27,50000,2016-02-26 13:38:27,52375,successful,224,US,52375,,,,,1
5,1000023410,Support Solar Roasted Coffee & Green Energy! ...,Food,Food,USD,2014-12-21 18:30:44,1000,2014-12-01 18:30:44,1205,successful,16,US,1205,,,,,1
6,1000030581,Chaser Strips. Our Strips make Shots their B*tch!,Drinks,Food,USD,2016-03-17 19:05:12,25000,2016-02-01 20:05:12,453,failed,40,US,453,,,,,0
7,1000034518,SPIN - Premium Retractable In-Ear Headphones w...,Product Design,Design,USD,2014-05-29 18:14:43,125000,2014-04-24 18:14:43,8233,canceled,58,US,8233,,,,,0
8,100004195,STUDIO IN THE SKY - A Documentary Feature Film...,Documentary,Film & Video,USD,2014-08-10 21:55:48,65000,2014-07-11 21:55:48,6240.57,canceled,43,US,6240.57,,,,,0
9,100004721,Of Jesus and Madmen,Nonfiction,Publishing,CAD,2013-10-09 18:19:37,2500,2013-09-09 18:19:37,0,failed,0,CA,0,,,,,0


In [33]:
df['usd pledged '].value_counts()

0                 50462
1                  5041
25                 3122
10                 3111
50                 2673
5                  2315
100                2249
20                 2010
30                 1462
2                  1399
15                 1119
35                 1010
40                 1007
60                 1006
75                  943
150                 911
200                 838
125                 795
11                  692
55                  674
70                  671
110                 656
26                  652
6                   628
45                  613
80                  610
120                 566
51                  557
500                 553
250                 553
                  ...  
63830                 1
6644.5304776          1
784.02757425          1
39413.2200979         1
17.50207476           1
9277.55               1
4.97547093            1
22856.7670428         1
16.72334025           1
30343.24393879        1
34424.14        

In [23]:
X = df['usd pledged ']
y = df.success

In [26]:
X_train, y_train, X_test, y_test = train_test_split(X, y)

In [27]:
clf = LogisticRegression()

In [28]:
clf.fit(X_train, y_train)

ValueError: could not convert string to float: 'US'

----------------------

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
cats = pd.get_dummies(df['main_category '])

In [12]:
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,4), min_df=5)
tfs = tfidf.fit_transform(df['name '].dropna())

In [13]:
tfs_array = tfs.toarray()
df_2= pd.DataFrame(tfs_array, columns=tfidf.get_feature_names())

In [14]:
df_2.head()

Unnamed: 0,00,000,000 000,000 feet,000 mile,000 miles,000 people,000 words,000 years,001,...,zu,zucati,zucati dice,zulu,zum,zur,álbum,ça,être,über
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
states.shape

(323750, 410)

In [16]:
cats.shape

(323750, 120)

In [17]:
df_2.shape

(323746, 37089)

In [None]:
df_combo = pd.concat([cats, df_2], axis = 1)

In [None]:
X = df_combo
y = 