In [98]:
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("NLP").getOrCreate()
cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size()
print("You are working with", cores, "core(s)")
spark

You are working with 1 core(s)


In [99]:
from pyspark.ml.feature import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.classification import *
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# For pipeline development
from pyspark.ml import Pipeline

In [100]:
path = "/home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/"

df=spark.read.csv(path+'kickstarter.csv', inferSchema=True, header=True)

In [101]:
df.limit(4).toPandas()

23/01/25 10:14:22 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


Unnamed: 0,_c0,blurb,state
0,1,"Using their own character, users go on educati...",failed
1,2,"MicroFly is a quadcopter packed with WiFi, 6 s...",successful
2,3,"A small indie press, run as a collective for a...",failed
3,4,Zylor is a new baby cosplayer! Back this kicks...,failed


In [102]:
df.show(4, False)

23/01/25 10:14:22 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv
+---+-----------------------------------------------------------------------------------------------------------------------------------+----------+
|_c0|blurb                                                                                                                              |state     |
+---+-----------------------------------------------------------------------------------------------------------------------------------+----------+
|1  |Using their own character, users go on educational quests around a virtual world leveling up subject-oriented skills (ie Physics). |failed    |
|2  |MicroFly is a quadcopter packed with WiFi, 6 sensors, and 3 p

In [103]:
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- blurb: string (nullable = true)
 |-- state: string (nullable = true)



In [104]:
df.count()

223627

In [105]:
def null_value_calc(df):
    null_columns_counts = []
    numRows = df.count()
    for k in df.columns:
        nullRows = df.where(col(k).isNull()).count()
        if(nullRows>0):
            temp = k,nullRows,(nullRows/numRows)*100
            null_columns_counts.append(temp)
    return(null_columns_counts)

null_columns_calc_list = null_value_calc(df)
spark.createDataFrame(null_columns_calc_list, ['Column_Name', 'Null_Values_Count','Null_Value_Percent']).show()


23/01/25 10:14:22 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 
 Schema: _c0
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv
+-----------+-----------------+------------------+
|Column_Name|Null_Values_Count|Null_Value_Percent|
+-----------+-----------------+------------------+
|      blurb|             1488|0.6653937136392296|
|      state|            13157| 5.883457722010312|
+-----------+-----------------+------------------+



In [106]:
df.na.drop().count()

23/01/25 10:14:24 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


210470

In [107]:
df = df.dropna()

In [108]:
df.count()

23/01/25 10:14:24 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


210470

In [109]:
df.groupBy("state").count().orderBy(col("count").desc()).show()

23/01/25 10:14:24 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv
+--------------------+------+
|               state| count|
+--------------------+------+
|          successful|103582|
|              failed|102000|
| and get some col...|     8|
|          ","failed"|     6|
|     their childhood|     6|
|                love|     6|
| about a lonely f...|     5|
|             romance|     4|
|              poetry|     4|
|            mastered|     4|
|                  CD|     3|
| She Wrote"" but ...|     3|
|               music|     3|
|                NY."|     3|
|              2015."|     3|
|            equality|     3|
|               2014"|     3|
|              2014."|     3|
|             Texas."|     3|
| 

In [110]:
df = df.filter("state IN('successful','failed')")

In [111]:
df.groupBy("state").count().orderBy(col("count").desc()).show()

23/01/25 10:14:25 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv
+----------+------+
|     state| count|
+----------+------+
|successful|103582|
|    failed|102000|
+----------+------+



In [112]:
df.select("blurb").show(10, False)

23/01/25 10:14:25 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv
+-----------------------------------------------------------------------------------------------------------------------------------+
|blurb                                                                                                                              |
+-----------------------------------------------------------------------------------------------------------------------------------+
|Using their own character, users go on educational quests around a virtual world leveling up subject-oriented skills (ie Physics). |
|MicroFly is a quadcopter packed with WiFi, 6 sensors, and 3 processors for ultimate stability -- and fits in the palm of your

In [113]:
df = df.withColumn("blurb", translate(col("blurb"), "/"," ")) \
    .withColumn("blurb", translate(col("blurb"), "("," ")) \
    .withColumn("blurb", translate(col("blurb"), ")"," "))

In [114]:
df.select("blurb").show(10, False)

23/01/25 10:14:25 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv
+-----------------------------------------------------------------------------------------------------------------------------------+
|blurb                                                                                                                              |
+-----------------------------------------------------------------------------------------------------------------------------------+
|Using their own character, users go on educational quests around a virtual world leveling up subject-oriented skills  ie Physics . |
|MicroFly is a quadcopter packed with WiFi, 6 sensors, and 3 processors for ultimate stability -- and fits in the palm of your

In [115]:
df = df.withColumn("blurb", regexp_replace(col("blurb"), "[^A-Za-z ]+", ""))

In [116]:
df.select("blurb").show(10, False)

23/01/25 10:14:25 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv
+-------------------------------------------------------------------------------------------------------------------------------+
|blurb                                                                                                                          |
+-------------------------------------------------------------------------------------------------------------------------------+
|Using their own character users go on educational quests around a virtual world leveling up subjectoriented skills  ie Physics |
|MicroFly is a quadcopter packed with WiFi  sensors and  processors for ultimate stability  and fits in the palm of your hand   |
|A small ind

In [117]:
df = df.withColumn("blurb", regexp_replace(col("blurb"), " +", " "))

In [118]:
df.select("blurb").show(10, False)

23/01/25 10:14:26 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv
+------------------------------------------------------------------------------------------------------------------------------+
|blurb                                                                                                                         |
+------------------------------------------------------------------------------------------------------------------------------+
|Using their own character users go on educational quests around a virtual world leveling up subjectoriented skills ie Physics |
|MicroFly is a quadcopter packed with WiFi sensors and processors for ultimate stability and fits in the palm of your hand     |
|A small indie pr

In [119]:
df = df.withColumn("blurb", lower(col("blurb")))

In [120]:
df.select("blurb").show(10, False)

23/01/25 10:14:26 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv
+------------------------------------------------------------------------------------------------------------------------------+
|blurb                                                                                                                         |
+------------------------------------------------------------------------------------------------------------------------------+
|using their own character users go on educational quests around a virtual world leveling up subjectoriented skills ie physics |
|microfly is a quadcopter packed with wifi sensors and processors for ultimate stability and fits in the palm of your hand     |
|a small indie pr

In [121]:
regex_tokenizer = RegexTokenizer(inputCol = "blurb", outputCol = "words", pattern = "\\W")
raw_words = regex_tokenizer.transform(df)
raw_words.show(2, False)

23/01/25 10:14:26 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv
+---+------------------------------------------------------------------------------------------------------------------------------+----------+-------------------------------------------------------------------------------------------------------------------------------------------------+
|_c0|blurb                                                                                                                         |state     |words                                                                                                                                            |
+---+-----------------------------------------------------------------------------

In [122]:
raw_words.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- blurb: string (nullable = true)
 |-- state: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [123]:
remover = StopWordsRemover(inputCol="words", outputCol = "filtered")

In [124]:
stopwords = remover.getStopWords()
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your']

In [125]:
words_df = remover.transform(raw_words)

In [126]:
words_df.limit(4).toPandas()

23/01/25 10:14:27 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


Unnamed: 0,_c0,blurb,state,words,filtered
0,1,using their own character users go on educatio...,failed,"[using, their, own, character, users, go, on, ...","[using, character, users, go, educational, que..."
1,2,microfly is a quadcopter packed with wifi sens...,successful,"[microfly, is, a, quadcopter, packed, with, wi...","[microfly, quadcopter, packed, wifi, sensors, ..."
2,3,a small indie press run as a collective for au...,failed,"[a, small, indie, press, run, as, a, collectiv...","[small, indie, press, run, collective, authors..."
3,4,zylor is a new baby cosplayer back this kickst...,failed,"[zylor, is, a, new, baby, cosplayer, back, thi...","[zylor, new, baby, cosplayer, back, kickstarte..."


In [127]:
indexer = StringIndexer(inputCol='state', outputCol='label')
feature_data = indexer.fit(words_df).transform(words_df)

23/01/25 10:14:27 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


In [128]:
feature_data.limit(4).toPandas()

23/01/25 10:14:27 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


Unnamed: 0,_c0,blurb,state,words,filtered,label
0,1,using their own character users go on educatio...,failed,"[using, their, own, character, users, go, on, ...","[using, character, users, go, educational, que...",1.0
1,2,microfly is a quadcopter packed with wifi sens...,successful,"[microfly, is, a, quadcopter, packed, with, wi...","[microfly, quadcopter, packed, wifi, sensors, ...",0.0
2,3,a small indie press run as a collective for au...,failed,"[a, small, indie, press, run, as, a, collectiv...","[small, indie, press, run, collective, authors...",1.0
3,4,zylor is a new baby cosplayer back this kickst...,failed,"[zylor, is, a, new, baby, cosplayer, back, thi...","[zylor, new, baby, cosplayer, back, kickstarte...",1.0


In [129]:
############################ BEFORE ###############################
regex_tokenizer = RegexTokenizer(inputCol = "blurb", outputCol = "words", pattern = "\\W")
raw_words = regex_tokenizer.transform(df)

remover = StopWordsRemover(inputCol="words", outputCol = "filtered")
words_df = remover.transform(raw_words)

indexer = StringIndexer(inputCol='state', outputCol='label')
feature_data = indexer.fit(words_df).transform(words_df)

feature_data.show(1, False)

23/01/25 10:14:27 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv
23/01/25 10:14:28 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv
+---+------------------------------------------------------------------------------------------------------------------------------+------+-------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------

In [130]:
############################ AFTER ###############################
regex_tokenizer = RegexTokenizer(inputCol = "blurb", outputCol = "words", pattern = "\\W")
# raw_words = regex_tokenizer.transform(df)

remover = StopWordsRemover(inputCol=regex_tokenizer.getOutputCol(), outputCol = "filtered")
# words_df = remover.transform(raw_words)

indexer = StringIndexer(inputCol='state', outputCol='label')
#feature_data = indexer.fit(words_df).transform(words_df)

pipeline = Pipeline(stages=[regex_tokenizer, remover, indexer])
data_prep_pl = pipeline.fit(df)

feature_data = data_prep_pl.transform(df)

feature_data.show(1, False)

23/01/25 10:14:28 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv
23/01/25 10:14:28 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv
+---+------------------------------------------------------------------------------------------------------------------------------+------+-------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------

In [131]:
feature_data.limit(5).toPandas()

23/01/25 10:14:28 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


Unnamed: 0,_c0,blurb,state,words,filtered,label
0,1,using their own character users go on educatio...,failed,"[using, their, own, character, users, go, on, ...","[using, character, users, go, educational, que...",1.0
1,2,microfly is a quadcopter packed with wifi sens...,successful,"[microfly, is, a, quadcopter, packed, with, wi...","[microfly, quadcopter, packed, wifi, sensors, ...",0.0
2,3,a small indie press run as a collective for au...,failed,"[a, small, indie, press, run, as, a, collectiv...","[small, indie, press, run, collective, authors...",1.0
3,4,zylor is a new baby cosplayer back this kickst...,failed,"[zylor, is, a, new, baby, cosplayer, back, thi...","[zylor, new, baby, cosplayer, back, kickstarte...",1.0
4,5,hatoful boyfriend meet skeletons a comedy dati...,failed,"[hatoful, boyfriend, meet, skeletons, a, comed...","[hatoful, boyfriend, meet, skeletons, comedy, ...",1.0


In [132]:
hashingTF = HashingTF(inputCol='filtered', outputCol='rawfeatures', numFeatures=20)
HTFfeaturizedData = hashingTF.transform(feature_data)

In [133]:
HTFfeaturizedData.show(1, False)

23/01/25 10:14:28 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv
+---+------------------------------------------------------------------------------------------------------------------------------+------+-------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------+-----+-------------------------------------------------------------------------+
|_c0|blurb                                                                                                                         |state |words                             

In [134]:
idf = IDF(inputCol='rawfeatures', outputCol='features')
idfModel = idf.fit(HTFfeaturizedData)
TFIDFfeaturizedData = idfModel.transform(HTFfeaturizedData)
TFIDFfeaturizedData.name = 'TFIDFfeaturizedData'

23/01/25 10:14:29 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

In [135]:
TFIDFfeaturizedData.limit(4).toPandas()

23/01/25 10:14:32 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


Unnamed: 0,_c0,blurb,state,words,filtered,label,rawfeatures,features
0,1,using their own character users go on educatio...,failed,"[using, their, own, character, users, go, on, ...","[using, character, users, go, educational, que...",1.0,"(3.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 2.0, 0.0, ...","(2.252148827177929, 0.0, 0.8915391572399594, 0..."
1,2,microfly is a quadcopter packed with wifi sens...,successful,"[microfly, is, a, quadcopter, packed, with, wi...","[microfly, quadcopter, packed, wifi, sensors, ...",0.0,"(1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 2.0, 1.0, ...","(0.7507162757259763, 0.9418501584120335, 0.0, ..."
2,3,a small indie press run as a collective for au...,failed,"[a, small, indie, press, run, as, a, collectiv...","[small, indie, press, run, collective, authors...",1.0,"(3.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, ...","(2.252148827177929, 0.0, 0.8915391572399594, 0..."
3,4,zylor is a new baby cosplayer back this kickst...,failed,"[zylor, is, a, new, baby, cosplayer, back, thi...","[zylor, new, baby, cosplayer, back, kickstarte...",1.0,"(2.0, 0.0, 0.0, 1.0, 1.0, 3.0, 0.0, 2.0, 2.0, ...","(1.5014325514519526, 0.0, 0.0, 0.6648665945302..."


In [136]:
HTFfeaturizedData = HTFfeaturizedData.withColumnRenamed("rawfeatures","features")
HTFfeaturizedData.name = 'HTFfeaturizedData'

In [137]:
Word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol='filtered', outputCol='features')
model = Word2Vec.fit(feature_data)

W2VfeaturizedData = model.transform(feature_data)
W2VfeaturizedData.limit(4).toPandas()

23/01/25 10:14:32 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:14:34 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:15:08 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


Unnamed: 0,_c0,blurb,state,words,filtered,label,features
0,1,using their own character users go on educatio...,failed,"[using, their, own, character, users, go, on, ...","[using, character, users, go, educational, que...",1.0,"[0.14038110065406986, -0.24811659927114044, 0...."
1,2,microfly is a quadcopter packed with wifi sens...,successful,"[microfly, is, a, quadcopter, packed, with, wi...","[microfly, quadcopter, packed, wifi, sensors, ...",0.0,"[0.27672377347268845, -0.3168490545146845, -0...."
2,3,a small indie press run as a collective for au...,failed,"[a, small, indie, press, run, as, a, collectiv...","[small, indie, press, run, collective, authors...",1.0,"[0.08995955561598142, 0.28520162504476804, 0.1..."
3,4,zylor is a new baby cosplayer back this kickst...,failed,"[zylor, is, a, new, baby, cosplayer, back, thi...","[zylor, new, baby, cosplayer, back, kickstarte...",1.0,"[0.24012657121888228, 0.25589268415101935, 0.1..."


In [138]:
scaler = MinMaxScaler(inputCol='features', outputCol='scaledFeatures')
scalerModel = scaler.fit(W2VfeaturizedData)
scaled_data = scalerModel.transform(W2VfeaturizedData)
scaled_data.limit(4).toPandas()

23/01/25 10:15:08 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv




23/01/25 10:15:10 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

Unnamed: 0,_c0,blurb,state,words,filtered,label,features,scaledFeatures
0,1,using their own character users go on educatio...,failed,"[using, their, own, character, users, go, on, ...","[using, character, users, go, educational, que...",1.0,"[0.14038110065406986, -0.24811659927114044, 0....","[0.5792267598773768, 0.49325837641687337, 0.59..."
1,2,microfly is a quadcopter packed with wifi sens...,successful,"[microfly, is, a, quadcopter, packed, with, wi...","[microfly, quadcopter, packed, wifi, sensors, ...",0.0,"[0.27672377347268845, -0.3168490545146845, -0....","[0.6256666184030814, 0.46945486621258775, 0.53..."
2,3,a small indie press run as a collective for au...,failed,"[a, small, indie, press, run, as, a, collectiv...","[small, indie, press, run, collective, authors...",1.0,"[0.08995955561598142, 0.28520162504476804, 0.1...","[0.5620526105827438, 0.6779578135263551, 0.657..."
3,4,zylor is a new baby cosplayer back this kickst...,failed,"[zylor, is, a, new, baby, cosplayer, back, thi...","[zylor, new, baby, cosplayer, back, kickstarte...",1.0,"[0.24012657121888228, 0.25589268415101935, 0.1...","[0.6132011968298638, 0.6678075043140753, 0.639..."


In [139]:
W2VfeaturizedData = scaled_data.select('state','blurb','label','scaledFeatures')
W2VfeaturizedData = W2VfeaturizedData.withColumnRenamed('scaledFeatures','features')
W2VfeaturizedData.name = 'W2VfeaturizedData'

In [140]:
W2VfeaturizedData.limit(3).toPandas()

23/01/25 10:15:10 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


Unnamed: 0,state,blurb,label,features
0,failed,using their own character users go on educatio...,1.0,"[0.5792267598773768, 0.49325837641687337, 0.59..."
1,successful,microfly is a quadcopter packed with wifi sens...,0.0,"[0.6256666184030814, 0.46945486621258775, 0.53..."
2,failed,a small indie press run as a collective for au...,1.0,"[0.5620526105827438, 0.6779578135263551, 0.657..."


In [141]:
def ClassTrainEval(classifier,features,classes,train,test):

    def FindMtype(classifier):
        # Intstantiate Model
        M = classifier
        # Learn what it is
        Mtype = type(M).__name__
        
        return Mtype
    
    Mtype = FindMtype(classifier)
    

    def IntanceFitModel(Mtype,classifier,classes,features,train):
        
        if Mtype == "OneVsRest":
            # instantiate the base classifier.
            lr = LogisticRegression()
            # instantiate the One Vs Rest Classifier.
            OVRclassifier = OneVsRest(classifier=lr)
#             fitModel = OVRclassifier.fit(train)
            # Add parameters of your choice here:
            paramGrid = ParamGridBuilder() \
                .addGrid(lr.regParam, [0.1, 0.01]) \
                .build()
            #Cross Validator requires the following parameters:
            crossval = CrossValidator(estimator=OVRclassifier,
                                      estimatorParamMaps=paramGrid,
                                      evaluator=MulticlassClassificationEvaluator(),
                                      numFolds=2) # 3 is best practice
            # Run cross-validation, and choose the best set of parameters.
            fitModel = crossval.fit(train)
            return fitModel
        if Mtype == "MultilayerPerceptronClassifier":
            # specify layers for the neural network:
            # input layer of size features, two intermediate of features+1 and same size as features
            # and output of size number of classes
            # Note: crossvalidator cannot be used here
            features_count = len(features[0][0])
            layers = [features_count, features_count+1, features_count, classes]
            MPC_classifier = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
            fitModel = MPC_classifier.fit(train)
            return fitModel
        if Mtype in("LinearSVC","GBTClassifier") and classes != 2: # These classifiers currently only accept binary classification
            print(Mtype," could not be used because PySpark currently only accepts binary classification data for this algorithm")
            return
        if Mtype in("LogisticRegression","NaiveBayes","RandomForestClassifier","GBTClassifier","LinearSVC","DecisionTreeClassifier"):
  
            # Add parameters of your choice here:
            if Mtype in("LogisticRegression"):
                paramGrid = (ParamGridBuilder() \
#                              .addGrid(classifier.regParam, [0.1, 0.01]) \
                             .addGrid(classifier.maxIter, [10, 15,20])
                             .build())
                
            # Add parameters of your choice here:
            if Mtype in("NaiveBayes"):
                paramGrid = (ParamGridBuilder() \
                             .addGrid(classifier.smoothing, [0.0, 0.2, 0.4, 0.6]) \
                             .build())
                
            # Add parameters of your choice here:
            if Mtype in("RandomForestClassifier"):
                paramGrid = (ParamGridBuilder() \
                               .addGrid(classifier.maxDepth, [2, 5, 10])
#                                .addGrid(classifier.maxBins, [5, 10, 20])
#                                .addGrid(classifier.numTrees, [5, 20, 50])
                             .build())
                
            # Add parameters of your choice here:
            if Mtype in("GBTClassifier"):
                paramGrid = (ParamGridBuilder() \
#                              .addGrid(classifier.maxDepth, [2, 5, 10, 20, 30]) \
#                              .addGrid(classifier.maxBins, [10, 20, 40, 80, 100]) \
                             .addGrid(classifier.maxIter, [10, 15,50,100])
                             .build())
                
            # Add parameters of your choice here:
            if Mtype in("LinearSVC"):
                paramGrid = (ParamGridBuilder() \
                             .addGrid(classifier.maxIter, [10, 15]) \
                             .addGrid(classifier.regParam, [0.1, 0.01]) \
                             .build())
            
            # Add parameters of your choice here:
            if Mtype in("DecisionTreeClassifier"):
                paramGrid = (ParamGridBuilder() \
#                              .addGrid(classifier.maxDepth, [2, 5, 10, 20, 30]) \
                             .addGrid(classifier.maxBins, [10, 20, 40, 80, 100]) \
                             .build())
            
            #Cross Validator requires all of the following parameters:
            crossval = CrossValidator(estimator=classifier,
                                      estimatorParamMaps=paramGrid,
                                      evaluator=MulticlassClassificationEvaluator(),
                                      numFolds=2) # 3 + is best practice
            # Fit Model: Run cross-validation, and choose the best set of parameters.
            fitModel = crossval.fit(train)
            return fitModel
    
    fitModel = IntanceFitModel(Mtype,classifier,classes,features,train)
    
    # Print feature selection metrics
    if fitModel is not None:
        
        if Mtype in("OneVsRest"):
            # Get Best Model
            BestModel = fitModel.bestModel
            print(" ")
            print('\033[1m' + Mtype + '\033[0m')
            # Extract list of binary models
            models = BestModel.models
            for model in models:
                print('\033[1m' + 'Intercept: '+ '\033[0m',model.intercept,'\033[1m' + '\nCoefficients:'+ '\033[0m',model.coefficients)

        if Mtype == "MultilayerPerceptronClassifier":
            print("")
            print('\033[1m' + Mtype," Weights"+ '\033[0m')
            print('\033[1m' + "Model Weights: "+ '\033[0m',fitModel.weights.size)
            print("")

        if Mtype in("DecisionTreeClassifier", "GBTClassifier","RandomForestClassifier"):
            # FEATURE IMPORTANCES
            # Estimate of the importance of each feature.
            # Each feature’s importance is the average of its importance across all trees 
            # in the ensemble The importance vector is normalized to sum to 1. 
            # Get Best Model
            BestModel = fitModel.bestModel
            print(" ")
            print('\033[1m' + Mtype," Feature Importances"+ '\033[0m')
            print("(Scores add up to 1)")
            print("Lowest score is the least important")
            print(" ")
            print(BestModel.featureImportances)
            
            if Mtype in("DecisionTreeClassifier"):
                global DT_featureimportances
                DT_featureimportances = BestModel.featureImportances.toArray()
                global DT_BestModel
                DT_BestModel = BestModel
            if Mtype in("GBTClassifier"):
                global GBT_featureimportances
                GBT_featureimportances = BestModel.featureImportances.toArray()
                global GBT_BestModel
                GBT_BestModel = BestModel
            if Mtype in("RandomForestClassifier"):
                global RF_featureimportances
                RF_featureimportances = BestModel.featureImportances.toArray()
                global RF_BestModel
                RF_BestModel = BestModel

        if Mtype in("LogisticRegression"):
            # Get Best Model
            BestModel = fitModel.bestModel
            print(" ")
            print('\033[1m' + Mtype," Coefficient Matrix"+ '\033[0m')
            print("You should compares these relative to eachother")
            print("Coefficients: \n" + str(BestModel.coefficientMatrix))
            print("Intercept: " + str(BestModel.interceptVector))
            global LR_coefficients
            LR_coefficients = BestModel.coefficientMatrix.toArray()
            global LR_BestModel
            LR_BestModel = BestModel

        if Mtype in("LinearSVC"):
            # Get Best Model
            BestModel = fitModel.bestModel
            print(" ")
            print('\033[1m' + Mtype," Coefficients"+ '\033[0m')
            print("You should compares these relative to eachother")
            print("Coefficients: \n" + str(BestModel.coefficients))
            global LSVC_coefficients
            LSVC_coefficients = BestModel.coefficients.toArray()
            global LSVC_BestModel
            LSVC_BestModel = BestModel
        
   
    # Set the column names to match the external results dataframe that we will join with later:
    columns = ['Classifier', 'Result']
    
    if Mtype in("LinearSVC","GBTClassifier") and classes != 2:
        Mtype = [Mtype] # make this a list
        score = ["N/A"]
        result = spark.createDataFrame(zip(Mtype,score), schema=columns)
    else:
        predictions = fitModel.transform(test)
        MC_evaluator = MulticlassClassificationEvaluator(metricName="accuracy") # redictionCol="prediction",
        accuracy = (MC_evaluator.evaluate(predictions))*100
        Mtype = [Mtype] # make this a string
        score = [str(accuracy)] #make this a string and convert to a list
        result = spark.createDataFrame(zip(Mtype,score), schema=columns)
        result = result.withColumn('Result',result.Result.substr(0, 5))
        
    return result
    #Also returns the fit model important scores or p values

In [142]:
# from pyspark.ml.classification import *
# from pyspark.ml.evaluation import *
# from pyspark.sql import functions
# from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Comment out Naive Bayes if your data still contains negative values
classifiers = [
                LogisticRegression()
                ,OneVsRest()
               ,LinearSVC()
               ,NaiveBayes()
               ,RandomForestClassifier()
               ,GBTClassifier()
               ,DecisionTreeClassifier()
               ,MultilayerPerceptronClassifier()
              ] 

featureDF_list = [HTFfeaturizedData,TFIDFfeaturizedData,W2VfeaturizedData]

In [143]:
for featureDF in featureDF_list:
    print(featureDF.name)
    train, test = featureDF.randomSplit([0.7,0.3], seed=11)

    features = featureDF.select(['features']).collect()
    class_count = featureDF.select(countDistinct("label")).collect()
    classes = class_count[0][0]

    columns = ['Classifier','Result']
    vals = [("Place Holder", "N/A")]
    results = spark.createDataFrame(vals, columns)

    for classifier in classifiers:
        new_result = ClassTrainEval(classifier, features, classes, train, test)
        results = results.union(new_result)
    results = results.where("Classifier!='Place Holder'")
    print(results.show(truncate=False))

HTFfeaturizedData
23/01/25 10:15:10 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:15:20 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv
23/01/25 10:15:20 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:15:23 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:15:28 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:15:31 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:15:35 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:15:38 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

 
[1mLogisticRegression  Coefficient Matrix[0m
You should compares these relative to eachother
Coefficients: 
DenseMatrix([[-0.02572065,  0.00980701, -0.088554  ,  0.00511315, -0.01320951,
              -0.11092766, -0.06282471, -0.02965198, -0.02699535, -0.03989592,
               0.0620209 , -0.02636028,  0.008281  ,  0.011904  , -0.02740356,
              -0.03212623,  0.01096433, -0.01975474, -0.08664516, -0.02448302]])
Intercept: [0.26534368368354916]
23/01/25 10:15:41 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:15:43 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:15:47 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:15:55 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:15:59 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:16:04 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:16:07 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:16:09 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:16:12 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:16:15 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

 
[1mOneVsRest[0m
[1mIntercept: [0m -0.24966708207946395 [1m
Coefficients:[0m [0.024372060540788193,-0.00994841415379913,0.08473275115426164,-0.005427777776882383,0.012376634801657429,0.1063781184477319,0.06005283878638654,0.028247085746463865,0.025650418899828553,0.0378644737787286,-0.05997341676615515,0.024862205166703018,-0.00848626005530521,-0.011889884431524765,0.02588601566411619,0.030410385016060353,-0.010888508709059953,0.018631174222811617,0.08300561917499781,0.02309112408260959]
[1mIntercept: [0m 0.24966708207946361 [1m
Coefficients:[0m [-0.02437206054078813,0.009948414153799146,-0.08473275115426164,0.005427777776882416,-0.012376634801657401,-0.10637811844773191,-0.060052838786386516,-0.028247085746463778,-0.02565041889982852,-0.0378644737787285,0.0599734167661552,-0.02486220516670301,0.00848626005530525,0.011889884431524783,-0.025886015664116163,-0.03041038501606038,0.010888508709059993,-0.018631174222811568,-0.08300561917499777,-0.023091124082609552]
23/01/25 10:1

                                                                                

23/01/25 10:16:22 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:16:26 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:16:32 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:16:36 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:16:42 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:16:45 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

 
[1mLinearSVC  Coefficients[0m
You should compares these relative to eachother
Coefficients: 
[-0.0844997878755985,0.031759943522102374,-0.3049258512353981,0.008460503514691,-0.057563266478105564,-0.3425060183956073,-0.21622175468071517,-0.10375193060056877,-0.10953043629235756,-0.15253763967428516,0.18653044415594597,-0.10315178887654208,0.023845538956622416,0.04126341495774034,-0.10281309383420112,-0.1265405015923651,0.03474181079366454,-0.08327828896703647,-0.30038963501897914,-0.1011199529848215]
23/01/25 10:16:48 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:16:51 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:16:53 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:16:57 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:16:59 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:17:03 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:17:06 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:17:08 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:17:12 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:17:17 WARN DAGScheduler: Broadcasting large task binary with size 1037.0 KiB
23/01/25 10:17:17 WARN DAGScheduler: Broadcasting large task binary with size 1617.4 KiB
23/01/25 10:17:18 WARN DAGScheduler: Broadcasting large task binary with size 1207.9 KiB
23/01/25 10:17:18 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:17:22 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:17:27 WARN DAGScheduler: Broadcasting large task binary with size 1013.8 KiB
23/01/25 10:17:28 WARN DAGScheduler: Broadcasting large task binary with size 1580.6 KiB
23/01/25 10:17:28 WARN DAGScheduler: Broadcasting large task binary with size 1186.8 KiB
23/01/25 10:17:28 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:17:30 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:17:32 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:17:34 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:17:39 WARN DAGScheduler: Broadcasting large task binary with size 1101.2 KiB


                                                                                

23/01/25 10:17:40 WARN DAGScheduler: Broadcasting large task binary with size 1784.1 KiB


                                                                                

 
[1mRandomForestClassifier  Feature Importances[0m
(Scores add up to 1)
Lowest score is the least important
 
(20,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19],[0.045816234845427864,0.04098253540308568,0.06687495617517647,0.04699095348664945,0.03969000479288093,0.10167573363505063,0.05051365472398783,0.04980753930155541,0.03891218894711769,0.04029211889435241,0.0698382097608313,0.043292329886329764,0.04502876834610426,0.04635343756553399,0.04331428754526033,0.043463097677385445,0.039649284821531815,0.043139437864290066,0.06192914072346034,0.042436085603988295])
23/01/25 10:17:41 WARN DAGScheduler: Broadcasting large task binary with size 1332.0 KiB
23/01/25 10:17:41 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.

                                                                                

23/01/25 10:17:43 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:17:49 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:18:39 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:18:45 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:19:34 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:19:36 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:19:38 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:19:40 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

 
[1mGBTClassifier  Feature Importances[0m
(Scores add up to 1)
Lowest score is the least important
 
(20,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19],[0.0394732237464082,0.05575602830415663,0.053544399595105756,0.045529455631759065,0.04002502470696043,0.05748024244844144,0.06435246237365781,0.062346426013698136,0.03979172346415989,0.04734425652369111,0.052835313176910544,0.03644563142783227,0.04943896044865084,0.044571335875499996,0.057979664882679034,0.05483458743998002,0.04845021591301769,0.05301451796779987,0.046729251326428044,0.0500572787331632])
23/01/25 10:19:58 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:20:01 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:20:05 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:20:09 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:20:13 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:20:17 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:20:18 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:20:21 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:20:23 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

 
[1mDecisionTreeClassifier  Feature Importances[0m
(Scores add up to 1)
Lowest score is the least important
 
(20,[0,2,4,5,6,7,8,10,12,15,18,19],[0.008101773017722172,0.22400260258264165,0.012857958649547608,0.3809726925253951,0.04158612397023884,0.030330022537158566,0.012662457049250824,0.10171669755393883,0.017610860848471178,0.013962837799693654,0.14174137698126102,0.014454596484680577])
23/01/25 10:20:26 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:20:28 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                


[1mMultilayerPerceptronClassifier  Weights[0m
[1mModel Weights: [0m 923

23/01/25 10:20:46 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

+------------------------------+------+
|Classifier                    |Result|
+------------------------------+------+
|LogisticRegression            |52.61 |
|OneVsRest                     |52.64 |
|LinearSVC                     |52.68 |
|NaiveBayes                    |52.31 |
|RandomForestClassifier        |52.63 |
|GBTClassifier                 |52.44 |
|DecisionTreeClassifier        |52.25 |
|MultilayerPerceptronClassifier|52.81 |
+------------------------------+------+

None
TFIDFfeaturizedData
23/01/25 10:20:49 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:20:58 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv
23/01/25 10:20:59 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:21:02 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:21:07 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:21:10 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:21:14 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:21:17 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

 
[1mLogisticRegression  Coefficient Matrix[0m
You should compares these relative to eachother
Coefficients: 
DenseMatrix([[-0.03426147,  0.0104125 , -0.0993271 ,  0.00769049, -0.01432663,
              -0.14472691, -0.07970345, -0.03973334, -0.03374675, -0.04482459,
               0.08229886, -0.0263043 ,  0.00949995,  0.01803923, -0.03383738,
              -0.03843415,  0.0111645 , -0.02645515, -0.09042841, -0.02685404]])
Intercept: [0.2653436836835483]
23/01/25 10:21:20 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:21:23 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:21:27 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:21:35 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:21:39 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:21:46 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:21:48 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:21:51 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:21:54 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:21:57 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

 
[1mOneVsRest[0m
[1mIntercept: [0m -0.249667082079463 [1m
Coefficients:[0m [0.03246507546039185,-0.010562629378936851,0.09504097544809874,-0.008163709564499282,0.013423317634517627,0.13879114553387883,0.07618687999509817,0.037850802141731844,0.03206545497447886,0.0425421774131023,-0.07958194429040141,0.024809413973215743,-0.009735426079536903,-0.01801784151214573,0.031963542508819776,0.03638139877485752,-0.011087299579469473,0.02495048488525434,0.08662995339317171,0.025327351921092114]
[1mIntercept: [0m 0.24966708207946287 [1m
Coefficients:[0m [-0.03246507546039177,0.010562629378936879,-0.09504097544809872,0.008163709564499317,-0.01342331763451759,-0.1387911455338788,-0.07618687999509821,-0.037850802141731796,-0.032065454974478824,-0.04254217741310234,0.07958194429040147,-0.024809413973215743,0.009735426079536908,0.018017841512145624,-0.03196354250881977,-0.036381398774857535,0.011087299579469459,-0.02495048488525429,-0.08662995339317174,-0.025327351921092076]
23/01/25 10:22

                                                                                

23/01/25 10:22:04 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:22:08 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:22:15 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:22:19 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:22:26 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:22:28 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

 
[1mLinearSVC  Coefficients[0m
You should compares these relative to eachother
Coefficients: 
[-0.11255888623685883,0.03372080286704003,-0.3420218268139701,0.012725114457987647,-0.06243134926406067,-0.4468663606673093,-0.27431277536747495,-0.13902651169869631,-0.13692342752699438,-0.17138184375996438,0.24751725373865135,-0.10293276140055654,0.027355570101081594,0.06253026892231268,-0.126951584124562,-0.1513864571977969,0.03537609001332689,-0.11152456980397185,-0.3135057643100441,-0.11091277437721996]
23/01/25 10:22:32 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:22:35 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:22:38 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:22:41 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:22:44 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:22:47 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:22:50 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:22:53 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:22:57 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:23:02 WARN DAGScheduler: Broadcasting large task binary with size 1058.7 KiB
23/01/25 10:23:02 WARN DAGScheduler: Broadcasting large task binary with size 1639.0 KiB
23/01/25 10:23:03 WARN DAGScheduler: Broadcasting large task binary with size 1229.1 KiB
23/01/25 10:23:03 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:23:08 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:23:12 WARN DAGScheduler: Broadcasting large task binary with size 1035.4 KiB
23/01/25 10:23:13 WARN DAGScheduler: Broadcasting large task binary with size 1602.2 KiB
23/01/25 10:23:14 WARN DAGScheduler: Broadcasting large task binary with size 1208.1 KiB
23/01/25 10:23:14 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:23:15 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:23:18 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:23:20 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:23:24 WARN DAGScheduler: Broadcasting large task binary with size 1114.4 KiB


                                                                                

23/01/25 10:23:25 WARN DAGScheduler: Broadcasting large task binary with size 1797.2 KiB


                                                                                

 
[1mRandomForestClassifier  Feature Importances[0m
(Scores add up to 1)
Lowest score is the least important
 
(20,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19],[0.045816234845427864,0.04098253540308568,0.06687495617517647,0.04699095348664945,0.03969000479288093,0.10167573363505063,0.05051365472398783,0.04980753930155541,0.03891218894711769,0.04029211889435241,0.0698382097608313,0.043292329886329764,0.04502876834610426,0.04635343756553399,0.04331428754526033,0.043463097677385445,0.039649284821531815,0.043139437864290066,0.06192914072346034,0.042436085603988295])
23/01/25 10:23:27 WARN DAGScheduler: Broadcasting large task binary with size 1344.7 KiB
23/01/25 10:23:27 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.

                                                                                

23/01/25 10:23:29 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:23:35 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:24:26 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:24:33 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:25:27 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:25:28 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:25:31 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:25:34 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

 
[1mGBTClassifier  Feature Importances[0m
(Scores add up to 1)
Lowest score is the least important
 
(20,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19],[0.0394732237464082,0.05575602830415663,0.053544399595105756,0.045529455631759065,0.04002502470696043,0.05748024244844144,0.06435246237365781,0.062346426013698136,0.03979172346415989,0.04734425652369111,0.052835313176910544,0.03644563142783227,0.04943896044865084,0.044571335875499996,0.057979664882679034,0.05483458743998002,0.04845021591301769,0.05301451796779987,0.046729251326428044,0.0500572787331632])
23/01/25 10:25:54 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:25:57 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:26:02 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:26:07 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:26:12 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:26:17 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:26:18 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:26:21 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:26:25 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

 
[1mDecisionTreeClassifier  Feature Importances[0m
(Scores add up to 1)
Lowest score is the least important
 
(20,[0,2,4,5,6,7,8,10,12,15,18,19],[0.008101773017722172,0.22400260258264165,0.012857958649547608,0.3809726925253951,0.04158612397023884,0.030330022537158566,0.012662457049250824,0.10171669755393883,0.017610860848471178,0.013962837799693654,0.14174137698126102,0.014454596484680577])
23/01/25 10:26:28 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:26:31 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                


[1mMultilayerPerceptronClassifier  Weights[0m
[1mModel Weights: [0m 923

23/01/25 10:26:51 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

+------------------------------+------+
|Classifier                    |Result|
+------------------------------+------+
|LogisticRegression            |52.61 |
|OneVsRest                     |52.64 |
|LinearSVC                     |52.68 |
|NaiveBayes                    |52.14 |
|RandomForestClassifier        |52.63 |
|GBTClassifier                 |52.44 |
|DecisionTreeClassifier        |52.25 |
|MultilayerPerceptronClassifier|52.78 |
+------------------------------+------+

None
W2VfeaturizedData
23/01/25 10:26:55 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:27:01 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv
23/01/25 10:27:02 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:27:05 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:27:09 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:27:13 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:27:17 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:27:19 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

 
[1mLogisticRegression  Coefficient Matrix[0m
You should compares these relative to eachother
Coefficients: 
DenseMatrix([[ 1.30810472, -1.80951787,  1.50797121]])

Intercept: [-0.6155394936241041]
23/01/25 10:27:23 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:27:25 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:27:30 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:27:38 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:27:42 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:27:48 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:27:50 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:27:53 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:27:56 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:27:59 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

 
[1mOneVsRest[0m
[1mIntercept: [0m 0.5646789302812425 [1m
Coefficients:[0m [-1.2887835859435484,1.6614773116404817,-1.3073018508975234]
[1mIntercept: [0m -0.5646789302812423 [1m
Coefficients:[0m [1.2887835859435322,-1.6614773116405228,1.3073018508975758]
23/01/25 10:28:03 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:28:07 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:28:10 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:28:18 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:28:21 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:28:28 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:28:30 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

 
[1mLinearSVC  Coefficients[0m
You should compares these relative to eachother
Coefficients: 
[4.194140194047077,-4.951400400584057,1.6072900905457346]
23/01/25 10:28:34 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:28:37 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:28:40 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:28:43 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:28:46 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:28:49 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:28:52 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:28:55 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:28:59 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:29:04 WARN DAGScheduler: Broadcasting large task binary with size 1287.8 KiB
23/01/25 10:29:05 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB


                                                                                

23/01/25 10:29:06 WARN DAGScheduler: Broadcasting large task binary with size 1422.4 KiB
23/01/25 10:29:06 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:29:10 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:29:15 WARN DAGScheduler: Broadcasting large task binary with size 1253.5 KiB
23/01/25 10:29:16 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB


                                                                                

23/01/25 10:29:17 WARN DAGScheduler: Broadcasting large task binary with size 1359.8 KiB
23/01/25 10:29:18 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:29:19 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:29:21 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:29:24 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

 
[1mRandomForestClassifier  Feature Importances[0m
(Scores add up to 1)
Lowest score is the least important
 
(3,[0,1,2],[0.38750004546505934,0.528986547761474,0.08351340677346668])
23/01/25 10:29:27 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:29:30 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:29:36 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:30:32 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:30:38 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:31:34 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:31:35 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:31:38 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:31:40 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

 
[1mGBTClassifier  Feature Importances[0m
(Scores add up to 1)
Lowest score is the least important
 
(3,[0,1,2],[0.3909004897046106,0.4124138706995312,0.19668563959585816])
23/01/25 10:31:46 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:31:48 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:31:52 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:31:57 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:32:01 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:32:05 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:32:07 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:32:09 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:32:12 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

 
[1mDecisionTreeClassifier  Feature Importances[0m
(Scores add up to 1)
Lowest score is the least important
 
(3,[0,1,2],[0.3912169716714286,0.5538695637905796,0.05491346453799188])
23/01/25 10:32:14 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:32:17 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                


[1mMultilayerPerceptronClassifier  Weights[0m
[1mModel Weights: [0m 39

23/01/25 10:32:28 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

+------------------------------+------+
|Classifier                    |Result|
+------------------------------+------+
|LogisticRegression            |55.33 |
|OneVsRest                     |55.37 |
|LinearSVC                     |54.48 |
|NaiveBayes                    |50.42 |
|RandomForestClassifier        |57.19 |
|GBTClassifier                 |57.12 |
|DecisionTreeClassifier        |56.95 |
|MultilayerPerceptronClassifier|57.08 |
+------------------------------+------+

None


In [144]:
classifier = RandomForestClassifier()
featureDF = W2VfeaturizedData

train, test = featureDF.randomSplit([0.7, 0.3], seed=11)
features = featureDF.select(['features']).collect()

class_count = featureDF.select(countDistinct('label')).collect()
classes = class_count[0] [0]

ClassTrainEval(classifier, features, classes, train, test)

23/01/25 10:44:09 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:44:17 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv
23/01/25 10:44:17 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:44:21 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:44:26 WARN DAGScheduler: Broadcasting large task binary with size 1287.8 KiB
23/01/25 10:44:27 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB
23/01/25 10:44:28 WARN DAGScheduler: Broadcasting large task binary with size 1422.4 KiB
23/01/25 10:44:28 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:44:32 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:44:36 WARN DAGScheduler: Broadcasting large task binary with size 1253.5 KiB
23/01/25 10:44:37 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB


                                                                                

23/01/25 10:44:38 WARN DAGScheduler: Broadcasting large task binary with size 1359.8 KiB
23/01/25 10:44:38 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:44:39 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:44:41 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

23/01/25 10:44:43 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

 
[1mRandomForestClassifier  Feature Importances[0m
(Scores add up to 1)
Lowest score is the least important
 
(3,[0,1,2],[0.38750004546505934,0.528986547761474,0.08351340677346668])
23/01/25 10:44:46 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

DataFrame[Classifier: string, Result: string]

In [147]:
predictions = RF_BestModel.transform(test)
print("Predicted Failures:")
predictions.select("state","blurb").filter("prediction=0").orderBy(predictions["prediction"].desc()).show(3,False)
print(" ")
print("Predicted Success:")
predictions.select("state","blurb").filter("prediction=1").orderBy(predictions["prediction"].desc()).show(3,False)

Predicted Failures:
23/01/25 10:51:18 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , blurb, state
 Schema: _c0, blurb, state
Expected: _c0 but found: 
CSV file: file:///home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/kickstarter.csv


                                                                                

+------+--------------------------------------------------------------------------------------------------------------------------------+
|state |blurb                                                                                                                           |
+------+--------------------------------------------------------------------------------------------------------------------------------+
|failed|a san francisco based blog where artists from all over can come together to find inspiration as well as support each other      |
|failed|aspiring canadian music producer and singersong writer who lives and loves music and art creating a debut alternative hip hop ep|
|failed|everyday for the next days i will be creating a new digital portrait ive done so far the feedbacks been absolutely amazing      |
+------+--------------------------------------------------------------------------------------------------------------------------------+
only showing top 3 rows

 
Predict



+------+-------------------------------------------------------------------------------------------------------------------------------+
|state |blurb                                                                                                                          |
+------+-------------------------------------------------------------------------------------------------------------------------------+
|failed| one pen stroke at a time                                                                                                      |
|failed|a pilot program of five students focused on the interactions between ecology and humanity in the ceramic village of tamba japan|
|failed|a public art studio embracing various fine arts and encouraging cultural crafts experienced students challenged art craft shows|
+------+-------------------------------------------------------------------------------------------------------------------------------+
only showing top 3 rows



                                                                                

In [148]:
spark.stop()