#training-start

In [1]:
##----------------training--------------------------------

# Load the Pandas libraries with alias 'pd' 
import pandas as pd 
# DATASET
DATASET_COLUMNS = ["Sentiment","Sentiment Text"]
DATASET_ENCODING = "ISO-8859-1"

# Read data from file 'filename.csv' 
# (in the same directory that your python process is based)
# Control delimiters, rows, column names with read_csv (see later) 
data = pd.read_csv("/Users/pranshushrivastava/Downloads/sentiment_dataset2.csv", encoding =DATASET_ENCODING , names=DATASET_COLUMNS) 
# Preview the first 5 lines of the loaded data 
data.head()

Unnamed: 0,Sentiment,Sentiment Text
0,0,is so sad for my apl frie...
1,0,i missed the new moon trail...
2,1,omg its already 7:30 :o
3,0,.. omgaga. im sooo im gunna cry. i'...
4,0,i think mi bf is cheating on me!!! ...


In [2]:
df = sqlContext.createDataFrame(data)
type(df)
df.show(truncate=False)

+---------+----------------------------------------------------------------------------------------------------------------------------------------+
|Sentiment|Sentiment Text                                                                                                                          |
+---------+----------------------------------------------------------------------------------------------------------------------------------------+
|0        |                     is so sad for my apl friend.............                                                                           |
|0        |                   i missed the new moon trailer...                                                                                     |
|1        |              omg its already 7:30 :o                                                                                                   |
|0        |          .. omgaga. im sooo  im gunna cry. i've been at this dentist since 11.. i was suposed 

In [3]:
(train_set, test_set) = df.randomSplit([0.98, 0.02], seed = 2500)

In [4]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
tokenizer = Tokenizer(inputCol="Sentiment Text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="tf")
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5)
label_stringIdx = StringIndexer(inputCol = "Sentiment", outputCol = "label")
lr = LogisticRegression(maxIter=10, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, label_stringIdx, lr])

In [5]:
model = pipeline.fit(train_set)

In [6]:
predictions = model.transform(test_set)

In [7]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

0.7965284903508751

In [8]:
evaluator.getMetricName()

'areaUnderROC'

#training-end
#Load twitter data and perform cleaning

In [9]:
sc.stop()
import pyspark
from pyspark import SparkContext
import re
sc = pyspark.SparkContext(appName="sentiAnalysis")
from pyspark.sql.types import StringType
from pyspark import SQLContext
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf
sqlContext = SQLContext(sc)

#DataSet_rdd = sc.textFile("/Users/pranshushrivastava/DevData/testData2.csv").map(lambda row: row.split("&amp;"))
DataSet_rdd = sc.textFile("/Users/pranshushrivastava/DevData/prod_filtered_data_2.csv").map(lambda row: row.split(","))

DataSet_rdd.collect()

DataSet_df = DataSet_rdd.toDF(['Brand','State','Comment'])

DataSet_df.show()
print(type(DataSet_df))

#Register dataframe as table
DataSet_df.createOrReplaceTempView("twitterTable")

#Create list of DFs containing similar brands
distinctBrands = sqlContext.sql("select distinct Brand from twitterTable")
distinctBrandsList = [str(row.Brand) for row in distinctBrands.collect()]
newDFList=[]
for brands in distinctBrandsList:
	newDFList.append(DataSet_df.where(DataSet_df.Brand == brands))
print(newDFList)

#Create list of tweets
tweets = sqlContext.sql("select Comment from twitterTable")
tweetsList = [str(row.Comment) for row in tweets.collect()]
print(tweetsList)

distributedList = sc.parallelize(tweetsList)
distributedList.map(lambda x: x.lower()).collect()
F1 = udf(lambda x: x.lower(), StringType())
DataSet_df=DataSet_df.withColumn("Comment", F1(DataSet_df["Comment"])).select('Brand','State','Comment')
DataSet_df.show()
F2 = udf(lambda x: re.sub('[^A-Za-z]+', ' ',x), StringType())
DataSet_df=DataSet_df.withColumn("Comment", F2(DataSet_df["Comment"])).select('Brand','State','Comment')
DataSet_df.show()


#Remove stop-words
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, FloatType, StringType

F1 = udf(lambda x: x.split(" "), ArrayType(StringType()))
DataSet_df=DataSet_df.withColumn("Comment", F1(DataSet_df["Comment"])).select('Brand','State','Comment')
DataSet_df.show()

remover = StopWordsRemover(inputCol="Comment", outputCol="Filtered")
temp_df = remover.transform(DataSet_df)

+-----+-----+--------------------+
|Brand|State|             Comment|
+-----+-----+--------------------+
| nike|   CA|bring back the fl...|
| nike|   CA|fuck i want some ...|
| nike|   CA|one of my first s...|
| nike|   CA|the beauty of the...|
| nike|   CA| g rollo    hones...|
| nike|   CA|debuted in the   ...|
| nike|   CA|the new  angels  ...|
| nike|   CA|i should do a dea...|
| nike|   CA|push day yesterda...|
| nike|   CA| jbridges     nfl...|
| nike|   CA|i need a new nike...|
| nike|   CA|can  nike spare s...|
| nike|   CA|i have stocks in ...|
| nike|   CA| bookboy      jan...|
| nike|   CA|    i am so much ...|
| nike|   CA|i think i ve had ...|
| nike|   CA|really  nike    e...|
| nike|   CA|nike tights  gt  ...|
| nike|   CA|you can t see it ...|
| nike|   CA|i m still mad as ...|
+-----+-----+--------------------+
only showing top 20 rows

<class 'pyspark.sql.dataframe.DataFrame'>
[DataFrame[Brand: string, State: string, Comment: string], DataFrame[Brand: string, State: s

+-----+-----+--------------------+
|Brand|State|             Comment|
+-----+-----+--------------------+
| nike|   CA|bring back the fl...|
| nike|   CA|fuck i want some ...|
| nike|   CA|one of my first s...|
| nike|   CA|the beauty of the...|
| nike|   CA| g rollo    hones...|
| nike|   CA|debuted in the   ...|
| nike|   CA|the new  angels  ...|
| nike|   CA|i should do a dea...|
| nike|   CA|push day yesterda...|
| nike|   CA| jbridges     nfl...|
| nike|   CA|i need a new nike...|
| nike|   CA|can  nike spare s...|
| nike|   CA|i have stocks in ...|
| nike|   CA| bookboy      jan...|
| nike|   CA|    i am so much ...|
| nike|   CA|i think i ve had ...|
| nike|   CA|really  nike    e...|
| nike|   CA|nike tights  gt  ...|
| nike|   CA|you can t see it ...|
| nike|   CA|i m still mad as ...|
+-----+-----+--------------------+
only showing top 20 rows

+-----+-----+--------------------+
|Brand|State|             Comment|
+-----+-----+--------------------+
| nike|   CA|bring back the f

In [10]:
from pyspark.sql.functions import udf, col

join_udf = udf(lambda x: ",".join(x), StringType())
temp_df = temp_df.withColumn("Sentiment Text", join_udf(col("filtered")))
temp_df = temp_df.drop("filtered")
temp_df.show()

+-----+-----+--------------------+--------------------+
|Brand|State|             Comment|      Sentiment Text|
+-----+-----+--------------------+--------------------+
| nike|   CA|[bring, back, the...|bring,back,flykni...|
| nike|   CA|[fuck, i, want, s...|fuck,want,nike,me...|
| nike|   CA|[one, of, my, fir...|one,first,songs,t...|
| nike|   CA|[the, beauty, of,...|beauty,day,airmax...|
| nike|   CA|[, g, rollo, hone...|,g,rollo,honestly...|
| nike|   CA|[debuted, in, the...|debuted,film,back...|
| nike|   CA|[the, new, angels...|    new,angels,nike,|
| nike|   CA|[i, should, do, a...|deal,nike,cuz,che...|
| nike|   CA|[push, day, yeste...|push,day,yesterda...|
| nike|   CA|[, jbridges, nflh...|,jbridges,nflhumo...|
| nike|   CA|[i, need, a, new,...|need,new,nike,yan...|
| nike|   CA|[can, nike, spare...|nike,spare,runnin...|
| nike|   CA|[i, have, stocks,...|stocks,apple,nike...|
| nike|   CA|[, bookboy, janec...|,bookboy,janecrai...|
| nike|   CA|[, i, am, so, muc...|,much,royalty,

In [11]:
prediction = model.transform(temp_df)
selected = prediction.select("Brand","State","Sentiment Text", "probability", "prediction")
for row in selected.collect():
    brand, state, senti, prob, prediction = row    
    print("(%s, %s, %s) --> prob=%s, prediction=%f" % (brand, state, senti, str(prob), prediction))
    

(nike, CA, bring,back,flyknit,racer,nike) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(nike, CA, fuck,want,nike,metcons,gym,fcukdwjjdksn,fuck) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(nike, CA, one,first,songs,tripleyothreat,m,star,star,ring,really,spoke,look,sun,skyline,horizon,hiphop,star,jewelry,shine,smile,rap,rapper,chains,nike,denim,https,co,u,thzsushu) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(nike, CA, beauty,day,airmax,sketchtoshelf,nike,https,co,qkkkoxzsn,) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(nike, CA, ,g,rollo,honestly,oregon,success,since,made,massively,risky,massively,successful,moves,bringing,chip,kelly,oc,making,head,coach,nike,pk,stuff,gross,helpful,chip,catalyst,real,rise,past,years) --> prob=[0.689821049745025,0.31017895025497494], prediction=0.000000
(nike, CA, debuted,film,back,future,ii,nike,air,mag,released,amp,limited,pairs,sold,high,tech,featur

(adidas, AZ, paul,pogba,going,milk,injury,transfer,window,opens,blame,lineup,leaves,much,desired,manutd,mufc,still,say,adidas,nike,thing,lol,) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(adidas, AZ, holy,shit,x,wing,adidas,ultra,boosts,tight,af,one,thing,added,xmas,list,) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(adidas, AZ, ndfootball,throwback,uniforms,look,better,adidas) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(adidas, AZ, ,party,totallytubular,adidas,cheery,lynn,historic,district,phoenix,https,co,g,wyazfkoc) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(adidas, AZ, dont,enough,nike,adidas,armour,complete,whole,outfit,im,sry) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(adidas, AZ, ,bnans,like,shirt,trip,adidas,) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(adidas, AZ, know,winter,tempe,w,p,carey,students,wear,adidas,sweat

(nike, NV, hello,nevadans,help,renew,itin,pro,bono,ask,us,unlvlaw,sooner,better,end,year,time,essence,taxpayers,renew,lose,important,benefits,nike,maketheroadnv,dreambignv,nvimmigrants,https,co,imci,lplfg) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(nike, NV, ,lvaces,nike,wnba,eighth,grade,girl,) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(nike, NV, nike,sis,https,co,avwzgzkdgr) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(nike, NV, going,get,lil,sister,nike,tech,christmas) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(nike, NV, listen,freaky,nike,ain,one) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(nike, NV, freaky,nike,character,song,listen,daily,bitch,) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(nike, NV, thankful,happy,chiefs,win,monday,night,football,game,realtalk,truth,nfl,nflfan,missouri,showmestate,kc,kcchiefs,chiefs,chi

(adidas, NY, ,briannnnf,adidas,prob,rushing,print,chop,stuff,) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(adidas, NY, jeremy,scott,adidas,https,co,lm,yhpayfz) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(adidas, NY, brand,new,adidas,yeezy,boost,v,cloud,size,topshelfkicks,topshelf,kicks,https,co,zihgxfd) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(adidas, NY, ,marxman,adidas,ogyayaa,yes,pulling,) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(adidas, NY, pre,owned,adidas,pw,human,race,nmd,tr,size,open,pm,topshelfkicks,topshelf,kicks,https,co,wwr,nhz) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(adidas, NY, ,princess,tbt,black,hair,girlfriend,also,time,used,walk,around,without,jackets,remember,adidas,threestripes,central,park,https,co,akxxbb,u) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(adidas, NY, family,time,party,promotions,grea

(adidas, GA, jubean,cousins,atlanta,support,adidas,cup,soccer,tournament,thank,jaysonbellamy,marietta,georgia,https,co,gvinuxhfxr) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(adidas, GA, southwest,dekalb,girls,vs,frederick,douglas,girls,basketball,highschoolhoops,girlsbasketball,ghsa,georgiabasketball,nike,adidas,underarmour,benjamin,elijah,mays,high,school,https,co,ydvucm,ye) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(adidas, GA, ,julianjubeanhall,got,meet,cousins,atlanta,first,time,adidas,cup,soccer,tournament,thank,jaysonbellamy,supporting,family,therealcoachroc,tharealme,marietta,georgia,https,co,molx,lny) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(adidas, GA, wheeler,vs,laney,augusta,girls,basketball,highschoolhoops,georgiabasketball,nike,adidas,underarmour,truetoatlanta,girlsbasketball,benjamin,elijah,mays,high,school,https,co,jd,x,j,de) --> prob=[0.7194027778310298,0.2805972221689702], prediction=

(nike, TN, ,qc,nike,nigga,cold,put,jacket,back,caught,fit,) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(nike, TN, literally,get,baby,nike,puma,etc,habit,) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(nike, TN, ,xoneshia,kmariah,freaky,nike,sada,baby) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(nike, TN, ,joanne,nike,theeibnetwork,million,dollar,idea) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(nike, TN, ,joanne,nike,theeibnetwork,make,sneaker,well,shirts) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(nike, TN, ,nike,air,max,cloud,sighting,nc,o,boy,nashvillered,lets,hang,museum,https,co,dprnv,sj,) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(nike, TN, ,licensed,toill,bluethegreat,nike,nikestore,told,wasn,letting,get,away,) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(nike, TN, day,saw,dude,work,take,shoes,

(nike, IL, okay,m,realizing,font,touching,previous,uniforms,still,college,influence,m,sure,mainly,nike,logo,overall,fan,) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(nike, IL, okay,m,realizing,font,touching,previous,uniforms,still,college,influence,m,sure,mainly,nike,logo,overall,fan,) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(adidas, IL, ,adidasrunning,adidas,want,bad,santa,) --> prob=[0.4968716026006554,0.5031283973993447], prediction=1.000000
(adidas, IL, ,naroiv,adidas,ultra,boost,) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(adidas, IL, us,bruised,banana,kit,back,stock,adidas,website,amp,xs,psa) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(adidas, IL, ,heartbreakrunco,got,set,first,weekend,long,run,boston,also,emilymarathons,may,matching,stocking,hats,winter,heartbreakersrun,bostonmarathon,adidas,https,co,jau,ooqj,b) --> prob=[0.6328108970128357,0.3671891029871644], predict

(nike, MI, cop,soleboy,fit,good,nike,tech,) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(nike, MI, ,staceythagreat,best,ones,winter,u,ever,try,wear,timbs,w,nike,tech,looks,weird,) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(nike, MI, pissed,year,calling,nike,seconds,left,play,clock) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(nike, MI, mention,cyber,monday,hope,nike,dope,j,sale) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(nike, MI, bought,mama,fire,ass,nike,fit,wish,size,) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(nike, MI, ,donnyknoll,barrysanders,nike,trade,size,) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(nike, MI, nike,sent,shirts,robtherich) --> prob=[0.6328108970128357,0.3671891029871644], prediction=0.000000
(nike, MI, m,retiring,nike,adidasoriginals,) --> prob=[0.5063058604227691,0.49369413957723085], prediction=0.00

In [12]:
#predictions = model.transform(temp_df)
#type(predictions)
#predictions.show()
#type(prediction)
#prediction.show()

#selected.select("prediction").distinct().show()
selected.select("probability").show(truncate=False)
type(selected)

+----------------------------------------+
|probability                             |
+----------------------------------------+
|[0.6328108970128357,0.3671891029871644] |
|[0.6328108970128357,0.3671891029871644] |
|[0.6328108970128357,0.3671891029871644] |
|[0.6328108970128357,0.3671891029871644] |
|[0.689821049745025,0.31017895025497494] |
|[0.6328108970128357,0.3671891029871644] |
|[0.6328108970128357,0.3671891029871644] |
|[0.6328108970128357,0.3671891029871644] |
|[0.6328108970128357,0.3671891029871644] |
|[0.6328108970128357,0.3671891029871644] |
|[0.6328108970128357,0.3671891029871644] |
|[0.6328108970128357,0.3671891029871644] |
|[0.6328108970128357,0.3671891029871644] |
|[0.6328108970128357,0.3671891029871644] |
|[0.6328108970128357,0.3671891029871644] |
|[0.6328108970128357,0.3671891029871644] |
|[0.8111021447495267,0.18889785525047328]|
|[0.6328108970128357,0.3671891029871644] |
|[0.6328108970128357,0.3671891029871644] |
|[0.8726333396009943,0.1273666603990057] |
+----------

pyspark.sql.dataframe.DataFrame

In [13]:
selected.createOrReplaceTempView("PredictionTable")
probability = sqlContext.sql("select probability from PredictionTable")

probabilityList = [(row.probability) for row in probability.collect()]
type (probabilityList[0][0])

numpy.float64

In [14]:

sumOfProbabilities = 0
count = 0

for list in probabilityList:
    sumOfProbabilities+=list[0]
    count+=1
print("perception of nike in california:", sumOfProbabilities/count)


perception of nike in california: 0.6341749455813036


In [15]:
#fill = selected.selectExpr("Brand","State","rawPrediction","probability","prediction as label")
#fill = selected.selectExpr("rawPrediction","prediction as label")

In [16]:
#evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
#evaluator.evaluate(fill)

In [17]:
#df1_pd = pd.DataFrame(selected, columns=selected.columns(enumerate("probability")))

In [18]:
from pyspark.sql.functions import udf, col
finalize = selected.selectExpr("Brand","State", "prediction")

    
#F1 = udf(lambda x: sentiment_calc(x), FloatType())
#finalize=finalize.withColumn("etc", F1(col("probability")))
finalize.select("prediction").distinct().show()


+----------+
|prediction|
+----------+
|       0.0|
|       1.0|
+----------+



In [25]:
#finalDf = predictions.select("probability")
#final = sqlContext.sql("select probability from predictions groupBy Brand and State")
from pyspark.sql.functions import avg
from pyspark.sql import functions as F
BrandPerception = selected.groupBy("Brand", "State").agg((F.mean('prediction')).alias("Prediction"))
#print(BrandPerception)

BrandPerception.show()
#finalize = selected.groupBy("Brand", "State").avg("probability")

+------+-----+--------------------+
| Brand|State|          Prediction|
+------+-----+--------------------+
|adidas|   SD|                 0.0|
|  nike|   PA|0.005405405405405406|
|adidas|   IL| 0.02040816326530612|
|adidas|   WI|                 0.0|
|  nike|   OR|0.030612244897959183|
|  nike|   IL|0.013245033112582781|
|  nike|   AZ|0.005681818181818182|
|adidas|   CT|                 0.0|
|  nike|   NJ| 0.02666666666666667|
|adidas|   VA|                 0.0|
|adidas|   GA|0.014285714285714285|
|  nike|   OH|                 0.0|
|adidas|   MA|                0.12|
|  nike|   NE|                 0.0|
|adidas|   MD|                 0.0|
|adidas|   NV| 0.03333333333333333|
|  nike|   CA|               0.014|
|adidas|   WA|                 0.0|
|adidas|   TN|0.043478260869565216|
|adidas|   WV|                 0.0|
+------+-----+--------------------+
only showing top 20 rows



In [26]:
import pyspark.sql.functions as func
BrandPerception = BrandPerception.withColumn("Prediction", func.round(BrandPerception["Prediction"], 2))
BrandPerception.show()

+------+-----+----------+
| Brand|State|Prediction|
+------+-----+----------+
|adidas|   SD|       0.0|
|  nike|   PA|      0.01|
|adidas|   IL|      0.02|
|adidas|   WI|       0.0|
|  nike|   OR|      0.03|
|  nike|   IL|      0.01|
|  nike|   AZ|      0.01|
|adidas|   CT|       0.0|
|  nike|   NJ|      0.03|
|adidas|   VA|       0.0|
|adidas|   GA|      0.01|
|  nike|   OH|       0.0|
|adidas|   MA|      0.12|
|  nike|   NE|       0.0|
|adidas|   MD|       0.0|
|adidas|   NV|      0.03|
|  nike|   CA|      0.01|
|adidas|   WA|       0.0|
|adidas|   TN|      0.04|
|adidas|   WV|       0.0|
+------+-----+----------+
only showing top 20 rows



In [27]:
row1 = BrandPerception.agg({"Prediction": "max"}).collect()[0]
#print (row1)

#print (row1["max(Prediction)"])
max = row1["max(Prediction)"]
print (max)
#type(max)

0.17


In [28]:
F1 = udf(lambda x: x/max, FloatType())
BrandPerception=BrandPerception.withColumn("Prediction", F1(BrandPerception["Prediction"]))
BrandPerception.show(truncate=False)

+------+-----+----------+
|Brand |State|Prediction|
+------+-----+----------+
|adidas|SD   |0.0       |
|nike  |PA   |0.05882353|
|adidas|IL   |0.11764706|
|adidas|WI   |0.0       |
|nike  |OR   |0.1764706 |
|nike  |IL   |0.05882353|
|nike  |AZ   |0.05882353|
|adidas|CT   |0.0       |
|nike  |NJ   |0.1764706 |
|adidas|VA   |0.0       |
|adidas|GA   |0.05882353|
|nike  |OH   |0.0       |
|adidas|MA   |0.7058824 |
|nike  |NE   |0.0       |
|adidas|MD   |0.0       |
|adidas|NV   |0.1764706 |
|nike  |CA   |0.05882353|
|adidas|WA   |0.0       |
|adidas|TN   |0.23529412|
|adidas|WV   |0.0       |
+------+-----+----------+
only showing top 20 rows



In [29]:
import pyspark.sql.functions as func
BrandPerception = BrandPerception.withColumn("Prediction", func.round(BrandPerception["Prediction"], 2))
BrandPerception.show()

+------+-----+----------+
| Brand|State|Prediction|
+------+-----+----------+
|adidas|   SD|       0.0|
|  nike|   PA|      0.06|
|adidas|   IL|      0.12|
|adidas|   WI|       0.0|
|  nike|   OR|      0.18|
|  nike|   IL|      0.06|
|  nike|   AZ|      0.06|
|adidas|   CT|       0.0|
|  nike|   NJ|      0.18|
|adidas|   VA|       0.0|
|adidas|   GA|      0.06|
|  nike|   OH|       0.0|
|adidas|   MA|      0.71|
|  nike|   NE|       0.0|
|adidas|   MD|       0.0|
|adidas|   NV|      0.18|
|  nike|   CA|      0.06|
|adidas|   WA|       0.0|
|adidas|   TN|      0.24|
|adidas|   WV|       0.0|
+------+-----+----------+
only showing top 20 rows



In [24]:
BrandPerception.toPandas().to_csv('BrandPerception-final5.csv')