In [1]:
# Packages to be used.

import numpy as np # linear algebra
import pandas as pd # for data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model


In [2]:
# Data import from the csv file.

ksdata = pd.read_csv('ksdata_updated.csv', engine ='python') # engine ='python' - used to correct encoding errors.

In [3]:
ksdata.head()

Unnamed: 0,ID,name,category,main_category,country,currency,launched,deadline,goal (home currency),goal (converted to USD),backers,pledged (home currency),pledged (converted to USD),state,usd pledged
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GB,GBP,8/11/15,10/9/15,1000.0,1533.95,0,0.0,0.0,failed,0.0
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,US,USD,9/2/17,11/1/17,30000.0,30000.0,15,2421.0,2421.0,failed,100.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,US,USD,1/12/13,2/26/13,45000.0,45000.0,3,220.0,220.0,failed,220.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,US,USD,3/17/12,4/16/12,5000.0,5000.0,1,1.0,1.0,failed,1.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,US,USD,7/4/15,8/29/15,19500.0,19500.0,14,1283.0,1283.0,canceled,1283.0


In [4]:
# Number of columns and rows
ksdata.shape

(378661, 15)

In [5]:
ksdata.columns

Index(['ID', 'name', 'category', 'main_category', 'country', 'currency',
       'launched', 'deadline', 'goal (home currency)',
       'goal (converted to USD)', 'backers', 'pledged (home currency)',
       'pledged (converted to USD)', 'state', 'usd pledged'],
      dtype='object')

In [6]:
ksdata.describe() # Stats summary

Unnamed: 0,ID,goal (home currency),goal (converted to USD),backers,pledged (home currency),pledged (converted to USD),usd pledged
count,378661.0,378661.0,378661.0,378661.0,378661.0,378661.0,374864.0
mean,1074731000.0,49080.79,45454.4,105.617476,9682.979,9058.924,7036.729
std,619086200.0,1183391.0,1152950.0,907.185035,95636.01,90973.34,78639.75
min,5971.0,0.01,0.01,0.0,0.0,0.0,0.0
25%,538263500.0,2000.0,2000.0,2.0,30.0,31.0,16.98
50%,1075276000.0,5200.0,5500.0,12.0,620.0,624.33,394.72
75%,1610149000.0,16000.0,15500.0,56.0,4076.0,4050.0,3034.09
max,2147476000.0,100000000.0,166361400.0,219382.0,20338990.0,20338990.0,20338990.0


In [7]:
# Drop the unnecessary columns - id and name. These columns have no significance to the model.
# Drop the 
ksdata.drop(['ID', 'name'],axis=1,inplace=True)

In [8]:
ksdata.head()

Unnamed: 0,category,main_category,country,currency,launched,deadline,goal (home currency),goal (converted to USD),backers,pledged (home currency),pledged (converted to USD),state,usd pledged
0,Poetry,Publishing,GB,GBP,8/11/15,10/9/15,1000.0,1533.95,0,0.0,0.0,failed,0.0
1,Narrative Film,Film & Video,US,USD,9/2/17,11/1/17,30000.0,30000.0,15,2421.0,2421.0,failed,100.0
2,Narrative Film,Film & Video,US,USD,1/12/13,2/26/13,45000.0,45000.0,3,220.0,220.0,failed,220.0
3,Music,Music,US,USD,3/17/12,4/16/12,5000.0,5000.0,1,1.0,1.0,failed,1.0
4,Film & Video,Film & Video,US,USD,7/4/15,8/29/15,19500.0,19500.0,14,1283.0,1283.0,canceled,1283.0


In [9]:
ksdata.info() # Clean dataset with no NAs or missing values. However, it has objects, which won't work for the model.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378661 entries, 0 to 378660
Data columns (total 13 columns):
category                      378661 non-null object
main_category                 378661 non-null object
country                       378661 non-null object
currency                      378661 non-null object
launched                      378661 non-null object
deadline                      378661 non-null object
goal (home currency)          378661 non-null float64
goal (converted to USD)       378661 non-null float64
backers                       378661 non-null int64
pledged (home currency)       378661 non-null float64
pledged (converted to USD)    378661 non-null float64
state                         378661 non-null object
usd pledged                   374864 non-null float64
dtypes: float64(5), int64(1), object(7)
memory usage: 37.6+ MB


In [10]:
# List all columns with "objects", and a count of all names in them.
cols = ['category','main_category', 'currency','state', 'country']
for name in cols:
    print(name,':')
    print(ksdata[name].value_counts(),'\n')

category :
Product Design     22314
Documentary        16139
Music              15727
Tabletop Games     14180
Shorts             12357
                   ...  
Residencies           69
Letterpress           49
Chiptune              35
Literary Spaces       27
Taxidermy             13
Name: category, Length: 159, dtype: int64 

main_category :
Film & Video    63585
Music           51918
Publishing      39874
Games           35231
Technology      32569
Design          30070
Art             28153
Food            24602
Fashion         22816
Theater         10913
Comics          10819
Photography     10779
Crafts           8809
Journalism       4755
Dance            3768
Name: main_category, dtype: int64 

currency :
USD    295365
GBP     34132
EUR     17405
CAD     14962
AUD      7950
SEK      1788
MXN      1752
NZD      1475
DKK      1129
CHF       768
NOK       722
HKD       618
SGD       555
JPY        40
Name: currency, dtype: int64 

state :
failed        197719
successful    133956


In [11]:
ksdata.drop(['category', 'currency'],axis=1,inplace=True)

In [12]:
ksdata.columns

Index(['main_category', 'country', 'launched', 'deadline',
       'goal (home currency)', 'goal (converted to USD)', 'backers',
       'pledged (home currency)', 'pledged (converted to USD)', 'state',
       'usd pledged'],
      dtype='object')

In [13]:
# Use pandas’ get_dummies() method to return a new DataFrame containing a new column for each dummy variable.
# Use the concat() method to add these dummy columns back to the original DataFrame.
# Drop the original columns entirely using the drop method.
nominal_columns = ["main_category", "country"]
dummy_df = pd.get_dummies(ksdata[nominal_columns])
ksdata = pd.concat([ksdata, dummy_df], axis=1)
ksdata = ksdata.drop(nominal_columns, axis=1)

In [14]:
# Examine data - columns have been added (dummy variables)
ksdata.head()

Unnamed: 0,launched,deadline,goal (home currency),goal (converted to USD),backers,pledged (home currency),pledged (converted to USD),state,usd pledged,main_category_Art,...,country_JP,country_LU,country_MX,"country_N,0""",country_NL,country_NO,country_NZ,country_SE,country_SG,country_US
0,8/11/15,10/9/15,1000.0,1533.95,0,0.0,0.0,failed,0.0,0,...,0,0,0,0,0,0,0,0,0,0
1,9/2/17,11/1/17,30000.0,30000.0,15,2421.0,2421.0,failed,100.0,0,...,0,0,0,0,0,0,0,0,0,1
2,1/12/13,2/26/13,45000.0,45000.0,3,220.0,220.0,failed,220.0,0,...,0,0,0,0,0,0,0,0,0,1
3,3/17/12,4/16/12,5000.0,5000.0,1,1.0,1.0,failed,1.0,0,...,0,0,0,0,0,0,0,0,0,1
4,7/4/15,8/29/15,19500.0,19500.0,14,1283.0,1283.0,canceled,1283.0,0,...,0,0,0,0,0,0,0,0,0,1


In [15]:
# inspect our final output from this section to make sure all the features are of the same length, contain no null value, and are numerical. 
ksdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378661 entries, 0 to 378660
Data columns (total 47 columns):
launched                      378661 non-null object
deadline                      378661 non-null object
goal (home currency)          378661 non-null float64
goal (converted to USD)       378661 non-null float64
backers                       378661 non-null int64
pledged (home currency)       378661 non-null float64
pledged (converted to USD)    378661 non-null float64
state                         378661 non-null object
usd pledged                   374864 non-null float64
main_category_Art             378661 non-null uint8
main_category_Comics          378661 non-null uint8
main_category_Crafts          378661 non-null uint8
main_category_Dance           378661 non-null uint8
main_category_Design          378661 non-null uint8
main_category_Fashion         378661 non-null uint8
main_category_Film & Video    378661 non-null uint8
main_category_Food            378661 non-nul

In [16]:
ksdata.state = [1 if each == 'successful'  else 0 for each in ksdata.state.values]

In [17]:
#ksdata['deadline'] =  pd.to_datetime(ksdata['deadline'])

In [18]:
#ksdata['launched'] =  pd.to_datetime(ksdata['launched'])

In [19]:
ksdata.isnull().sum()

launched                         0
deadline                         0
goal (home currency)             0
goal (converted to USD)          0
backers                          0
pledged (home currency)          0
pledged (converted to USD)       0
state                            0
usd pledged                   3797
main_category_Art                0
main_category_Comics             0
main_category_Crafts             0
main_category_Dance              0
main_category_Design             0
main_category_Fashion            0
main_category_Film & Video       0
main_category_Food               0
main_category_Games              0
main_category_Journalism         0
main_category_Music              0
main_category_Photography        0
main_category_Publishing         0
main_category_Technology         0
main_category_Theater            0
country_AT                       0
country_AU                       0
country_BE                       0
country_CA                       0
country_CH          

In [20]:
y = ksdata.state.values
#our y axis is defined "state" values.

In [34]:
# x_data is all of features except "state".
# 'pledged' - overfits the model.
# 'usd pledged' - duplicate with missing values.
# 'usd_pledged_real' - overfits the model.

x = ksdata.drop(['state','deadline','launched'],axis=1)

In [35]:
print (x)

        goal (home currency)  goal (converted to USD)  backers  \
0                     1000.0                  1533.95        0   
1                    30000.0                 30000.00       15   
2                    45000.0                 45000.00        3   
3                     5000.0                  5000.00        1   
4                    19500.0                 19500.00       14   
...                      ...                      ...      ...   
378656               50000.0                 50000.00        1   
378657                1500.0                  1500.00        5   
378658               15000.0                 15000.00        1   
378659               15000.0                 15000.00        6   
378660                2000.0                  2000.00       17   

        pledged (home currency)  pledged (converted to USD)  usd pledged  \
0                           0.0                         0.0          0.0   
1                        2421.0                      24

In [22]:
# Examine data - columns have been added (dummy variables)
ksdata.head()

Unnamed: 0,launched,deadline,goal (home currency),goal (converted to USD),backers,pledged (home currency),pledged (converted to USD),state,usd pledged,main_category_Art,...,country_JP,country_LU,country_MX,"country_N,0""",country_NL,country_NO,country_NZ,country_SE,country_SG,country_US
0,8/11/15,10/9/15,1000.0,1533.95,0,0.0,0.0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
1,9/2/17,11/1/17,30000.0,30000.0,15,2421.0,2421.0,0,100.0,0,...,0,0,0,0,0,0,0,0,0,1
2,1/12/13,2/26/13,45000.0,45000.0,3,220.0,220.0,0,220.0,0,...,0,0,0,0,0,0,0,0,0,1
3,3/17/12,4/16/12,5000.0,5000.0,1,1.0,1.0,0,1.0,0,...,0,0,0,0,0,0,0,0,0,1
4,7/4/15,8/29/15,19500.0,19500.0,14,1283.0,1283.0,0,1283.0,0,...,0,0,0,0,0,0,0,0,0,1


In [23]:
#Train-Test Datas Split

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
#test_size=0.2 means %20 test datas, %80 train datas

NameError: name 'x' is not defined

In [None]:
x_train.isnull().sum()

In [None]:
from sklearn.linear_model import LogisticRegression
#logmodel = LogisticRegression()
logmodel= linear_model.LogisticRegression(random_state=0,max_iter=500)
# max_iter is optional parameter. You can write 10 or 3000 if you want.

In [None]:
logmodel.fit(x_train, y_train)

In [None]:
print("Train accuracy {}".format(logmodel.fit(x_train,y_train).score(x_train,y_train)))

print("Test accuracy {}".format(logmodel.fit(x_train,y_train).score(x_test,y_test)))


In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
predicted = logmodel.predict(x_test)
print(predicted, y_test)

In [None]:
from sklearn import metrics
print (metrics.classification_report(y_test, predicted))