In [2]:
rawData=sc.textFile("/home/stan/Downloads/train_noheader.tsv")

In [3]:
records=rawData.map(lambda line: line.split("\t"))

In [4]:
records.first()

[u'"http://www.bloomberg.com/news/2010-12-23/ibm-predicts-holographic-calls-air-breathing-batteries-by-2015.html"',
 u'"4042"',
 u'"{""title"":""IBM Sees Holographic Calls Air Breathing Batteries ibm sees holographic calls, air-breathing batteries"",""body"":""A sign stands outside the International Business Machines Corp IBM Almaden Research Center campus in San Jose California Photographer Tony Avelar Bloomberg Buildings stand at the International Business Machines Corp IBM Almaden Research Center campus in the Santa Teresa Hills of San Jose California Photographer Tony Avelar Bloomberg By 2015 your mobile phone will project a 3 D image of anyone who calls and your laptop will be powered by kinetic energy At least that s what International Business Machines Corp sees in its crystal ball The predictions are part of an annual tradition for the Armonk New York based company which surveys its 3 000 researchers to find five ideas expected to take root in the next five years IBM the world 

In [5]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors

In [6]:
def convert (x):
    if x=="?":
        return 0.0
    else:
        return float(x)

In [7]:
def createLabeledPoint(r):
    trimmed=map(lambda x: x.replace("\"",""),r)
    label=int(trimmed[len(r)-1])
    features=map(lambda x: convert(x),trimmed[4:len(r)-1])
    return LabeledPoint(label, Vectors.dense(features))

In [8]:
createLabeledPoint(records.first())


LabeledPoint(0.0, [0.789131,2.055555556,0.676470588,0.205882353,0.047058824,0.023529412,0.443783175,0.0,0.0,0.09077381,0.0,0.245831182,0.003883495,1.0,1.0,24.0,0.0,5424.0,170.0,8.0,0.152941176,0.079129575])

In [9]:
data=records.map(lambda r: createLabeledPoint(r))

In [10]:
data.cache()

PythonRDD[4] at RDD at PythonRDD.scala:43

In [11]:
data.count()

7395

In [12]:
def positiveConvert (x):
    if x=="?":
        return 0.0
    else:
        tmp=float(x)
        if tmp < 0.0:
            return 0.0
        else:
            return tmp
        

In [13]:
def createPositiveLabeledPoint(r):
    trimmed=map(lambda x: x.replace("\"",""),r)
    label=int(trimmed[len(r)-1])
    features=map(lambda x: positiveConvert(x),trimmed[4:len(r)-1])
    return LabeledPoint(label, Vectors.dense(features))

In [14]:
nbData=records.map(lambda x: createPositiveLabeledPoint(x))

In [15]:
nbData.first()

LabeledPoint(0.0, [0.789131,2.055555556,0.676470588,0.205882353,0.047058824,0.023529412,0.443783175,0.0,0.0,0.09077381,0.0,0.245831182,0.003883495,1.0,1.0,24.0,0.0,5424.0,170.0,8.0,0.152941176,0.079129575])

In [16]:
nbData.count()

7395

In [17]:
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.tree import DecisionTree


In [18]:
numIterations=10
maxTreeDepth =5

In [19]:
lrModel= LogisticRegressionWithSGD.train(data,numIterations)

In [20]:
svmModel = SVMWithSGD.train(data,numIterations)

In [21]:
nbModel = NaiveBayes.train(nbData)

In [22]:
dtModel=DecisionTree.trainClassifier(data,2,{})

# Use logisticRegression to classify

In [23]:
dataPoint = data.first()
prediction=lrModel.predict(dataPoint.features)

In [24]:
prediction

1

In [25]:
trueLabel = dataPoint.label

In [26]:
trueLabel

0.0

In [27]:
predictions = lrModel.predict(data.map(lambda x: x.features))

In [28]:
predictions.take(5)

[1, 1, 1, 1, 1]

## Accuracy and prediction error

In [29]:
from operator import add

In [30]:
lrTotalCorrect=data.map(lambda x: lrModel.predict(x.features)==x.label).reduce(add)

In [31]:
lrTotalCorrect

3806

In [32]:
lrAccuracy=lrTotalCorrect*1.0/data.count()

In [33]:
lrAccuracy

0.5146720757268425

In [34]:
3806.0/7395

0.5146720757268425

In [35]:
svmTotalCorrect=data.map(lambda x: svmModel.predict(x.features)==x.label).reduce(add)
nbTotalCorrect=nbData.map(lambda x: nbModel.predict(x.features)==x.label).reduce(add)

In [36]:
dtModel.predict(data.first().features)

0.0

In [37]:
dtTotalCorrect=data.map(lambda x: x.features)

In [38]:
dtModel

DecisionTreeModel classifier of depth 5 with 61 nodes

In [39]:
dtModel.predict(data.first().features)

0.0

In [40]:
dtTotalCorrect=sum(map(lambda x: dtModel.predict(x.features),data.collect()))

In [41]:
dtTotalCorrect

3549.0

In [42]:
svmAccuracy=svmTotalCorrect*1.0 / data.count()

In [43]:
svmAccuracy

0.5146720757268425

In [44]:
nbTotalCorrect

True

In [45]:
nbTotalCorrect=nbData.map(lambda x: nbModel.predict(x.features)==x.label).collect()

In [46]:
nbAccuracy=sum(nbTotalCorrect)*1.0/nbData.count()

In [47]:
nbAccuracy

0.58039215686274515

In [48]:
dtAccuracy=dtTotalCorrect/data.count()

In [49]:
dtAccuracy

0.47991886409736306

##Precision and Recall, True positive rate and false positive rate, PR curve and ROC curve

In [50]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics

In [51]:
models=[lrModel,svmModel]

In [52]:
models

[(weights=[-0.110216274454,-0.493200344739,-0.0712665620384,-0.0214744216778,0.00276706475384,0.00246385887598,-1.33300460292,0.0525232672351,0.0,-0.0320576776,-0.00653638798541,-0.0613702511674,-0.14975863133,-0.13648187383,-0.121161700009,-15.6451616669,-0.0177690355464,745.987958686,-7.73567729685,-1.38587998188,-0.0355600416613,-0.0352085128613], intercept=0.0),
 (weights=[-0.122188386978,-0.527510758159,-0.0742371782434,-0.0206667449306,0.00546395033577,0.00409811283781,-1.54824523474,0.0607028905087,0.0,-0.037008323802,-0.007374037142,-0.067970375864,-0.172289581054,-0.148716595522,-0.129369384966,-18.0315472516,-0.0202704220321,1025.48043141,-5.05188911633,-1.54111193167,-0.038689478606,-0.0397619278886], intercept=0.0)]

In [53]:
scoreAndLabels=data.map(lambda x: (lrModel.predict(x.features)*1.0,x.label*1.0))
metrics = BinaryClassificationMetrics(scoreAndLabels)
m1=(lrModel.__class__.__name__,metrics.areaUnderPR,metrics.areaUnderROC)

In [54]:
all_metrics=[]

In [55]:
all_metrics.append(m1)

In [56]:
all_metrics

[('LogisticRegressionModel', 0.7567586293858841, 0.5014181143280931)]

In [57]:
scoreAndLabels=data.map(lambda x: (svmModel.predict(x.features)*1.0,x.label*1.0))
metrics = BinaryClassificationMetrics(scoreAndLabels)
m2=(svmModel.__class__.__name__,metrics.areaUnderPR,metrics.areaUnderROC)
all_metrics.append(m2)

In [58]:
scoreAndLabels=data.map(lambda x: (float(nbModel.predict(x.features)*1.0),x.label*1.0))
metrics = BinaryClassificationMetrics(scoreAndLabels)
m3=(nbModel.__class__.__name__,metrics.areaUnderPR,metrics.areaUnderROC)
all_metrics.append(m3)

In [59]:
all_metrics

[('LogisticRegressionModel', 0.7567586293858841, 0.5014181143280931),
 ('SVMModel', 0.7567586293858841, 0.5014181143280931),
 ('NaiveBayesModel', 0.6810027809268857, 0.5836830187287125)]

In [60]:
dtModel.predict(data.first().features)

0.0

In [61]:
a=data.map(lambda x: dtModel.predict(x.features))

In [62]:
dtModel

DecisionTreeModel classifier of depth 5 with 61 nodes

In [63]:
from pyspark.mllib.linalg.distributed import RowMatrix

In [64]:
vectors= data.map(lambda x: x.features)

In [65]:
matrix = RowMatrix(vectors)

In [66]:
matrixSummary = matrix.numRows()

In [67]:
matrixSummary

7395L

In [68]:
from pyspark.mllib.feature import StandardScaler

In [69]:
scaler = StandardScaler(withMean = True, withStd = True).fit(vectors)

In [70]:
scaler

<pyspark.mllib.feature.StandardScalerModel at 0x7f0041fdb350>

In [71]:
scaledData=map(lambda x: LabeledPoint(x.label,scaler.transform(x.features)),data.collect())

In [72]:
scaledData=sc.parallelize(scaledData)

In [73]:
scaledData.first().features

DenseVector([1.1376, -0.0819, 1.0251, -0.0559, -0.4689, -0.3543, -0.3175, 0.3385, 0.0, 0.8288, -0.1473, 0.2296, -0.1416, 0.7902, 0.7172, -0.298, -0.2035, -0.033, -0.0488, 0.9401, -0.1087, -0.2788])

In [74]:
lrModelScaled = LogisticRegressionWithSGD.train(scaledData,numIterations)

In [75]:
lrTotalCorrectScaled = scaledData.map(lambda x: lrModelScaled.predict(x.features)==x.label).collect()

In [79]:
lrAccuracyScaled = sum(lrTotalCorrectScaled)*1.0/data.count()

In [80]:
lrAccuracyScaled

0.6209601081812035

In [84]:
lrPredictionVsTrue = scaledData.map(lambda x: (lrModelScaled.predict(x.features)*1.0,x.label*1.0))

In [85]:
lrMetricsScaled = BinaryClassificationMetrics(lrPredictionVsTrue)

In [86]:
lrPr = lrMetricsScaled.areaUnderPR

In [87]:
lrPr


0.7277006868096805

In [88]:
lrRoc=lrMetricsScaled.areaUnderROC

In [89]:
lrRoc

0.6201898373011353

#Adding another categorical feature

In [91]:
records.count()

7395

In [93]:
range(1,10)

[1, 2, 3, 4, 5, 6, 7, 8, 9]

In [96]:
records.first()[3]

u'"business"'

In [98]:
categories=records.map(lambda x: x[3]).distinct().collect()

In [99]:
len(categories)

14

In [113]:
categories=dict(zip(categories,range(0,len(categories))))

In [114]:
categories

{u'"?"': 2,
 u'"arts_entertainment"': 11,
 u'"business"': 3,
 u'"computer_internet"': 1,
 u'"culture_politics"': 6,
 u'"gaming"': 7,
 u'"health"': 4,
 u'"law_crime"': 5,
 u'"recreation"': 0,
 u'"religion"': 10,
 u'"science_technology"': 12,
 u'"sports"': 8,
 u'"unknown"': 9,
 u'"weather"': 13}

In [115]:
numCategories = len(categories)

In [116]:
numCategories

14

In [128]:
def createCatsLabeledPoint(r):
    trimmed=map(lambda x: x.replace("\"",""),r)
    label=int(trimmed[len(r)-1])
    categoryIdx = categories[r[3]]
    categoryFeatures = [0.0]*numCategories
    categoryFeatures[categoryIdx]=1.0
    otherFeatures=map(lambda x: convert(x),trimmed[4:len(r)-1])
    features=categoryFeatures+otherFeatures
    return LabeledPoint(label, Vectors.dense(features))

In [129]:
dataCategories=records.map(lambda x: createCatsLabeledPoint(x))

In [130]:
dataCategories.first()

LabeledPoint(0.0, [0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.789131,2.055555556,0.676470588,0.205882353,0.047058824,0.023529412,0.443783175,0.0,0.0,0.09077381,0.0,0.245831182,0.003883495,1.0,1.0,24.0,0.0,5424.0,170.0,8.0,0.152941176,0.079129575])

In [131]:
scalerCats = StandardScaler(withMean = True, withStd= True).fit(dataCategories.map(lambda x: x.features))

In [139]:
scaledDataCats=map(lambda x: LabeledPoint(x.label,scalerCats.transform(x.features)),dataCategories.collect())

In [140]:
scaledDataCats= sc.parallelize(scaledDataCats)

In [141]:
dataCategories.first().features

DenseVector([0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7891, 2.0556, 0.6765, 0.2059, 0.0471, 0.0235, 0.4438, 0.0, 0.0, 0.0908, 0.0, 0.2458, 0.0039, 1.0, 1.0, 24.0, 0.0, 5424.0, 170.0, 8.0, 0.1529, 0.0791])

In [142]:
scaledDataCats.first().features

DenseVector([-0.4464, -0.2042, -0.6808, 2.7207, -0.271, -0.0649, -0.2205, -0.1019, -0.2327, -0.0285, -0.0991, -0.3818, -0.2017, -0.0233, 1.1376, -0.0819, 1.0251, -0.0559, -0.4689, -0.3543, -0.3175, 0.3385, 0.0, 0.8288, -0.1473, 0.2296, -0.1416, 0.7902, 0.7172, -0.298, -0.2035, -0.033, -0.0488, 0.9401, -0.1087, -0.2788])

In [147]:
lrModelScaledCats = LogisticRegressionWithSGD.train(scaledDataCats,numIterations)

In [148]:
lrTotalCorrectScaledCats = scaledDataCats.map(lambda x: lrModelScaledCats.predict(x.features)==x.label).collect()

In [151]:
lrAccuracyScaledCats = sum(lrTotalCorrectScaledCats)*1.0/scaledDataCats.count()

In [152]:
lrAccuracyScaledCats

0.6657200811359026

In [153]:
def createCatsLabeledPointN(r):
    trimmed=map(lambda x: x.replace("\"",""),r)
    label=int(trimmed[len(r)-1])
    categoryIdx = categories[r[3]]
    categoryFeatures = [0.0]*numCategories
    categoryFeatures[categoryIdx]=1.0
    return LabeledPoint(label, Vectors.dense(categoryFeatures))

In [154]:
dataNB = records.map(lambda x: createCatsLabeledPointN(x))

In [155]:
nbModelCats = NaiveBayes.train(dataNB)

In [156]:
nbTotalCorrectCats = dataNB.map(lambda x: nbModelCats.predict(x.features)==x.label).collect()

In [158]:
nbAccuracyCats = sum(nbTotalCorrectCats)*1.0 /dataNB.count()

In [159]:
nbAccuracyCats

0.60960108181203521

#Tuning model parameters