In [1]:
!apt-get update -qq > /dev/null
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.2.3/spark-3.2.3-bin-hadoop2.7.tgz
!tar xf spark-3.2.3-bin-hadoop2.7.tgz
!pip install -q findspark

In [2]:
import os
import numpy as np
import pandas as pd
import random
import re

In [3]:
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.3-bin-hadoop2.7"

import findspark
findspark.init()

from pyspark import SparkContext
from pyspark.sql import SparkSession

sc = SparkContext(appName="YourTest", master="local[*]")
spark = SparkSession.builder.appName("YourTest").master("local[2]").config('spark.ui.port', random.randrange(4000,5000)).getOrCreate()

In [5]:
# get the raw csv data
file1Name = "Science_data.csv"
file2Name = "arts_data.csv"

df = spark.read.csv(file1Name,inferSchema=True)
df = df.toDF('id','Tweet','time').cache()
df1 = df.rdd.map(lambda x: x[1])

df = spark.read.csv(file2Name,inferSchema=True)
df = df.toDF('id','Tweet','time').cache()
df2 = df.rdd.map(lambda x: x[1])


In [6]:
# df.take(10)

In [7]:
def runLoop(data):
    model = {}
    i = 0
    for row in data:
      i = i+1
      row = row[1:-1].replace("'",'').replace("[",'').replace("]",'').split(', ')
      for j in range(len(row)):
        word = row[j]
        if word in model.keys():
          model[word].append(i)
        else:
          model[word] = [i]

    return model.items()
data1 = df1.coalesce(1)
model1 = data1.mapPartitions(runLoop)
wordMap1 = model1.collect()
data2 = df2.coalesce(1)
model2 = data2.mapPartitions(runLoop)
wordMap2 = model2.collect()

In [8]:
totalRow1 = data1.count()
totalRow2 = data2.count()
print(len(wordMap1))
print(totalRow1)
print(len(wordMap2))
print(totalRow2)

6623
2102
9870
3433


In [9]:
wordDict = {}
totalWords1 = len(wordMap1)

for i in range(totalWords1):
  word = wordMap1[i][0]
  wordDict[word] = i

print(len(wordDict))
totalWords2 = len(wordMap2)
k = totalWords1
for i in range(totalWords2):
  word = wordMap2[i][0]
  if word not in wordDict.keys():
    wordDict[word] = k
    k = k+1

print(len(wordDict))

6623
13729


In [10]:
totalWords = len(wordDict)
trainlength1 = int(totalRow1 * 0.8)
testlength1 = totalRow1 - trainlength1
trainlength2 = int(totalRow2 * 0.8)
testlength2 = totalRow2 - trainlength2
trainArray = np.zeros((trainlength1+trainlength2+1,totalWords))
testArray = np.zeros((testlength1+testlength2,totalWords))
print(trainlength1,trainlength2,testlength1,testlength2)
print(trainArray.shape)

1681 2746 421 687
(4428, 13729)


In [11]:
for i in range(totalWords1):
  for j in range(len(wordMap1[i][1])):
    word = wordDict[wordMap1[i][0]]
    line = wordMap1[i][1][j]
    # print(word,line)
    if(line<trainlength1):
      trainArray[line,word] = 1
    else:
      line = line - trainlength1 
      testArray[line,word] = 1


In [12]:
for i in range(trainlength2):
  for j in range(len(wordMap2[i][1])):
    word = wordDict[wordMap2[i][0]]
    line = wordMap2[i][1][j] + trainlength1
    # print(word,line)
    if(line<trainArray.shape[0]):
      trainArray[line,word] = 1
    else:
      line = line - trainlength1 - trainlength2
      testArray[line,word] = 1 

In [13]:
# initialize the feature probability

featureProb = np.zeros((totalWords,4))
trainData = pd.DataFrame(trainArray)
ones = np.ones(trainlength1+1)
zeros = np.zeros(trainlength2)
onesZeros = np.concatenate((ones,zeros), axis=None)
trainData['label'] = onesZeros
print(trainData.shape)

testData = pd.DataFrame(testArray)
ones = np.ones(testlength1)
zeros = np.zeros(testlength2)
onesZeros = np.concatenate((ones,zeros), axis=None)
testData['label'] = onesZeros
print(testData.shape)

(4428, 13730)
(1108, 13730)


In [14]:
#get feature/word probability
for j in range(totalWords):
    featurej = list(trainData.groupby(j)['label'].apply(list).values)
    
    if(len(featurej) == 2):
        total = len(featurej[1])
        ones = np.sum(featurej[1])     
        zeros = total - ones
        featureProb[j, 0] = (zeros + 1) / (totalRow2 + 2)
        featureProb[j, 1] = (ones + 1) / (totalRow1 + 3)
    else:
        featureProb[j, 0] = 1 / (totalRow2 + 2)
        featureProb[j, 1] = 1 / (totalRow1 + 3)

featureProb[:,2] = 1- featureProb[:,0]
featureProb[:,3] = 1- featureProb[:,1]
logFeatureProb = np.log2(featureProb)

In [15]:
# get training accuracy

inputdata = trainData.iloc[:, 0:totalWords].to_numpy()
inputdataPrime = 1 - trainData.iloc[:, 0:totalWords].to_numpy()
probabzeros = np.multiply(inputdata, np.asmatrix(logFeatureProb[:,0]))
probabones = np.multiply(inputdata, np.asmatrix(logFeatureProb[:,1]))
probabzerosPrime = np.multiply(inputdataPrime, np.asmatrix(logFeatureProb[:,2]))
probabonesPrime = np.multiply(inputdataPrime, np.asmatrix(logFeatureProb[:,3]))

totalWords1 = trainData.shape[0]
resultZero = np.zeros(totalWords1)
resultOne = np.zeros(totalWords1)

for j in range(totalWords1):
    resultZero[j] = np.sum(probabzeros[j]) + np.sum(probabzerosPrime[j])
    resultOne[j] = np.sum(probabones[j]) + np.sum(probabonesPrime[j])
    
outputLabel = (resultZero <= resultOne).astype(int)
trainAccuracy = np.sum(outputLabel == trainData["label"]) / trainData.shape[0] 
print(trainAccuracy)

0.8010388437217706


In [16]:
# get testing accuracy

inputdata = testData.iloc[:, 0:totalWords].to_numpy()
inputdataPrime = 1 - testData.iloc[:, 0:totalWords].to_numpy()
probabzeros = np.multiply(inputdata, np.asmatrix(logFeatureProb[:,0]))
probabones = np.multiply(inputdata, np.asmatrix(logFeatureProb[:,1]))
probabzerosPrime = np.multiply(inputdataPrime, np.asmatrix(logFeatureProb[:,2]))
probabonesPrime = np.multiply(inputdataPrime, np.asmatrix(logFeatureProb[:,3]))

totalWords2 = testData.shape[0]
resultZero = np.zeros(totalWords2)
resultOne = np.zeros(totalWords2)

for j in range(totalWords2):
    resultZero[j] = np.sum(probabzeros[j]) + np.sum(probabzerosPrime[j])
    resultOne[j] = np.sum(probabones[j]) + np.sum(probabonesPrime[j])
    
outputLabel = (resultZero <= resultOne).astype(int)
testAccuracy = np.sum(outputLabel == testData["label"]) / testData.shape[0] 
print(testAccuracy)

0.7265342960288809
