In [60]:
from gensim.models import KeyedVectors
import pandas as pd
import numpy as np
import tensorflow as tf
import csv
import operator
import preprocessor as pre
import regex

## Model Creation

In [2]:
# Load the Turkish fastText word vectors (stored in word2vec text format) with gensim.
# The resulting object is indexed by word (model[word]) to obtain that word's vector.
fasttextModel = KeyedVectors.load_word2vec_format('data/fasttext/wiki.tr.vec')

In [3]:
# Gather the labelled tweets of every training account into one shuffled data-frame.
# The full training set needs the accounts of all four categories; ten accounts were
# removed before the directory was pushed to GitHub, and two sample accounts were
# left in data/trainAccount.
retrievedAccounts = [
    "iuefsosyoloji", "sosyolojidivani", "sosyolojibolumu", "sosyo_kitap",
    "setadc", "tc_disisleri", "yeniturkiye",
    "iksv_istanbul", "issanat", "ntvksanat",
    "ihhinsaniyardim", "diyanetvakfi", "turkkizilayi",
]

# Every cleaned CSV is read without a header row; its columns are named label and tweets.
dataFrameList = [
    pd.read_csv("data/trainAccount/" + accountName + '_cleaned.csv',
                sep=',', header=None, names=["label", "tweets"])
    for accountName in retrievedAccounts
]

# sample(frac=1) returns all rows in random order; the index is then renumbered from 0.
stackedFrame = pd.concat(dataFrameList)
gatheredDataFrame = stackedFrame.sample(frac=1).reset_index(drop=True)

In [4]:
# Separate the tweets and the labels into two arrays (same row order in both)
tweets = gatheredDataFrame['tweets'].values
labels = gatheredDataFrame['label'].values

In [5]:
# One-hot encode the category names; the position of the 1 is the class index
# (0 = politics, 1 = art, 2 = society, 3 = charity)
labelMapper = {
    "politics": [1,0,0,0],
    "art": [0,1,0,0],
    "society": [0,0,1,0],
    "charity": [0,0,0,1],
}
# A label that is missing from labelMapper raises KeyError
labelsConverted = [labelMapper[labelName] for labelName in labels]

In [6]:
# Function to average all word-vectors (looked up in the provided model) within a sentence
def bagOfWords(sentences, model, vectorLength):
    """Represent each sentence as the mean of the vectors of its known words.

    Parameters
    ----------
    sentences : iterable
        Sentences to embed; each one is split on whitespace. Entries that are
        not strings (for example the NaN pandas produces for an empty CSV
        field) are treated as sentences without words.
    model : mapping
        Object indexed by word that returns a vector of length
        ``vectorLength`` and raises ``KeyError`` for unknown words.
    vectorLength : int
        Length of the word vectors.

    Returns
    -------
    list of numpy.ndarray
        One vector of length ``vectorLength`` per sentence, in input order.
        A sentence without any known word yields a zero vector.
    """
    returnMatrix = []
    for sentence in sentences:
        # A missing tweet has no split(); handle it like an empty sentence
        words = sentence.split() if isinstance(sentence, str) else []
        vectorSum = np.zeros(vectorLength)
        knownWordCount = 0
        for word in words:
            try:
                vectorSum += model[word]
            except KeyError:
                # Out-of-vocabulary word: leave it out of the average.
                # Any other error (e.g. a vector of the wrong length) is
                # a real problem and is allowed to propagate.
                continue
            knownWordCount += 1
        if knownWordCount:
            vectorSum /= knownWordCount
        returnMatrix.append(vectorSum)
    return returnMatrix

In [7]:
# Length of the sentence vectors; the vectors returned by fasttextModel are summed
# into arrays of this length, so it has to match their dimensionality
vectorLength = 300
# Convert tweets into bag-of-words representation (one averaged word vector per tweet)
bagOfWordsMatrix = bagOfWords(tweets, fasttextModel, vectorLength)

In [8]:
# Train/test split: the first 4/5 of the shuffled samples are used for training,
# the remaining samples for testing
sperator = int(len(bagOfWordsMatrix)*4/5)
trainFeature, testFeature = (np.asarray(bagOfWordsMatrix[:sperator]),
                             np.asarray(bagOfWordsMatrix[sperator:]))
trainLabel, testLabel = (np.asarray(labelsConverted[:sperator]),
                         np.asarray(labelsConverted[sperator:]))

In [9]:
# Number of samples in the training and in the test split
trainCount = sperator
testCount = len(testFeature)

In [10]:
# Optimisation settings of the neural network
learningRate = 0.001
trainingEpochs = 150
batchSize = 128
displayStep = 1      # the cost is printed every displayStep epochs

# Layer widths of the neural network
inputLayer = vectorLength
firstHidden = 64     # 2 ** 6
secondHidden = 128   # 2 ** 7
outputLayer = 4      # 2 ** 2, the length of the one-hot label vectors

In [11]:
# Tensorflow arrangements: graph inputs whose values are supplied through feed_dict.
# x takes rows of inputLayer features, y takes rows of outputLayer label values;
# the first dimension is None, so the number of rows may differ between runs.
x = tf.placeholder("float", [None, inputLayer])
y = tf.placeholder("float", [None, outputLayer])

# Function of a neural network model
def multilayerPerceptron(x, weights, biases):
    """Build the forward pass of a perceptron with two hidden layers.

    x is the input tensor. weights is a dictionary with the keys 'h1', 'h2'
    and 'out'; biases is a dictionary with the keys 'b1', 'b2' and 'out'.
    The returned tensor is the linear (not activated) output of the last layer.
    """
    # First hidden layer: x * h1 + b1, followed by RELU
    hiddenOne = tf.nn.relu(tf.add(tf.matmul(x, weights['h1']), biases['b1']))
    # Second hidden layer: hiddenOne * h2 + b2, followed by RELU
    hiddenTwo = tf.nn.relu(tf.add(tf.matmul(hiddenOne, weights['h2']), biases['b2']))
    # Output layer with linear activation
    return tf.matmul(hiddenTwo, weights['out']) + biases['out']

In [12]:
# Store weight and biases of each layer.
# Every variable starts from values drawn by tf.random_normal; the shapes chain
# inputLayer -> firstHidden -> secondHidden -> outputLayer.
weights = {
    'h1': tf.Variable(tf.random_normal([inputLayer, firstHidden])),
    'h2': tf.Variable(tf.random_normal([firstHidden, secondHidden])),
    'out': tf.Variable(tf.random_normal([secondHidden, outputLayer]))}
biases = {
    'b1': tf.Variable(tf.random_normal([firstHidden])),
    'b2': tf.Variable(tf.random_normal([secondHidden])),
    'out': tf.Variable(tf.random_normal([outputLayer]))}

# Construct the model; prediction holds the linear outputs (logits) of the last layer
prediction = multilayerPerceptron(x, weights, biases)

# Define loss and optimizer function:
# cost is the mean softmax cross-entropy between the fed labels y and the logits,
# and running optimizer performs one Adam update that minimizes it
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=prediction))
optimizer = tf.train.AdamOptimizer(learning_rate=learningRate).minimize(cost)

# Initializing the Variables
# NOTE(review): keep this line after the optimizer definition — presumably the optimizer
# adds variables of its own that must be covered by the initializer; confirm before moving it
initializer = tf.global_variables_initializer()

# Calculate accuracy: share of rows whose highest-scoring output matches the position
# of the largest value in the fed label row
correctPredictions = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correctPredictions, "float"))

In [13]:
# Open the session that is used for training and for all later predictions,
# and give every variable its initial value
session = tf.Session()
session.run(initializer)

# Training cycle
for epoch in range(trainingEpochs):
    averageCost = 0.

    # Loop over all batches. The range steps through the whole training set, so the
    # final batch may hold fewer than batchSize samples; rounding the batch count down
    # instead would leave the samples at the tail of the training set unused and would
    # run no training step at all when trainCount < batchSize.
    for batchStart in range(0, trainCount, batchSize):
        batchFeature = trainFeature[batchStart:batchStart + batchSize]
        batchLabel = trainLabel[batchStart:batchStart + batchSize]
        # Run optimization and cost operations
        _, c = session.run([optimizer, cost], feed_dict={x: batchFeature, y: batchLabel})

        # cost is the mean over one batch; weighting it by the batch size makes
        # averageCost the mean over all training samples even with a short last batch
        averageCost += c * len(batchFeature) / trainCount

    if epoch % displayStep == 0:
        print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(averageCost))

print("Optimization Finished!\n")

Epoch: 0001 cost= 48.537809552
Epoch: 0002 cost= 16.171645204
Epoch: 0003 cost= 11.212101183
Epoch: 0004 cost= 8.639849621
Epoch: 0005 cost= 6.996074763
Epoch: 0006 cost= 5.833510590
Epoch: 0007 cost= 4.943300272
Epoch: 0008 cost= 4.242936794
Epoch: 0009 cost= 3.666098441
Epoch: 0010 cost= 3.186154503
Epoch: 0011 cost= 2.782363186
Epoch: 0012 cost= 2.451660262
Epoch: 0013 cost= 2.174097709
Epoch: 0014 cost= 1.936227339
Epoch: 0015 cost= 1.733503498
Epoch: 0016 cost= 1.564142109
Epoch: 0017 cost= 1.419505253
Epoch: 0018 cost= 1.297071832
Epoch: 0019 cost= 1.193733717
Epoch: 0020 cost= 1.104215101
Epoch: 0021 cost= 1.025567705
Epoch: 0022 cost= 0.957372121
Epoch: 0023 cost= 0.898226658
Epoch: 0024 cost= 0.846043421
Epoch: 0025 cost= 0.800914506
Epoch: 0026 cost= 0.762501714
Epoch: 0027 cost= 0.727833680
Epoch: 0028 cost= 0.697354016
Epoch: 0029 cost= 0.669916745
Epoch: 0030 cost= 0.645100099
Epoch: 0031 cost= 0.622358245
Epoch: 0032 cost= 0.601191793
Epoch: 0033 cost= 0.582101635
Epoch: 

In [14]:
# Calculate accuracy on the training and on the test split
predictedClass = tf.argmax(prediction, 1)
expectedClass = tf.argmax(y, 1)
correctPredictions = tf.equal(predictedClass, expectedClass)
accuracyTensor = tf.reduce_mean(tf.cast(correctPredictions, "float"))
# Despite its name this tensor holds the index of the highest-scoring class of each
# row (an argmax), not probabilities; the prediction cells further down run it too
probabilityLabels = tf.argmax(prediction, 1)

trainFeed = {x:trainFeature, y:trainLabel}
testFeed = {x:testFeature, y:testLabel}
# Both tensors are evaluated, but only the accuracy (first result) is kept
trainAccuracy = session.run([accuracyTensor, probabilityLabels], feed_dict=trainFeed)[0]
testAccuracy = session.run([accuracyTensor, probabilityLabels], feed_dict=testFeed)[0]

In [15]:
# Report the accuracy measured on the training split and on the held-out test split
print("Train Accuracy : %f" % trainAccuracy)
print("Test Accuracy : %f" % testAccuracy)

Train Accuracy : 0.956633
Test Accuracy : 0.803368


## Prediction

In [20]:
def interestFinder(userName):
    """Return the share of a user's tweets predicted for each interest category.

    Parameters
    ----------
    userName : str
        Account name; the tweets are read from
        ``data/clean/<userName>_cleaned.csv`` (no header row, columns named
        ``label`` and ``tweets``).

    Returns
    -------
    dict
        Maps "politics", "art", "society" and "charity" to the fraction of the
        user's tweets that the trained network assigns to that category.
        Every fraction is 0.0 when the file contains no tweets.

    Raises
    ------
    Whatever ``pd.read_csv`` raises when the user's file cannot be read.
    """
    # Position in this tuple = class index produced by the network
    # (same order as the one-hot vectors in labelMapper)
    categories = ("politics", "art", "society", "charity")

    # Load user tweets and assign a label to each of them
    currentDataframe = pd.read_csv("data/clean/" + userName + '_cleaned.csv', sep=',', header=None, names=["label","tweets"])
    currentTweets = currentDataframe['tweets'].values

    total = float(len(currentDataframe))
    if total == 0:
        # Nothing to classify; also avoids dividing by zero below
        return {category: 0.0 for category in categories}

    currentBagOfWordsMatrix = bagOfWords(currentTweets, fasttextModel, vectorLength)
    # Run the argmax tensor that already exists in the graph. Building a new network
    # and argmax op here would add nodes to the graph on every single call.
    predictedLabels = session.run(probabilityLabels, feed_dict={x:currentBagOfWordsMatrix})

    # Calculate frequency of labels
    labelCount = {category: 0 for category in categories}
    for predictedLabel in predictedLabels:
        labelCount[categories[predictedLabel]] += 1

    # items() instead of iteritems(): the latter does not exist on Python 3 dictionaries
    labelFrequency = {key: value / total for key, value in labelCount.items()}
    return labelFrequency

In [32]:
# Open input file to retrieve users from and a file to write topic distribution of each user
inputFile = open('data/graph/node.csv', 'rt')
outputFile = open('data/graph/nodeWithScore.csv', 'wt')
reader = csv.reader(inputFile)
writer = csv.writer(outputFile, delimiter=',', lineterminator='\n')
writer.writerow(['', 'id', 'label', 'politics', 'art', 'society', 'charity', 'interest'])

In [None]:
# Score every row of the input file and write exactly one output row per input row.
# line[2] is read as the user name ("n/a" marks a row without a user); line[0] and
# line[1] are copied to the output unchanged.
# NOTE(review): if node.csv starts with a header row, that row is handled like a user row — confirm.

# Position in this tuple = class index produced by the network (order of labelMapper)
categoryNames = ("politics", "art", "society", "charity")

for line in reader:
    userName = line[2]
    # Defaults that are written when the user is "n/a" or cannot be processed
    interestDictionary = {"politics":0, "art":0, "society":0, "charity":0}
    mainInterest = "n/a"

    if userName != "n/a":
        try:
            # Load user tweets and assign a label to each of them
            currentDataframe = pd.read_csv("data/clean/" + userName + '_cleaned.csv', sep=',', header=None, names=["label","tweets"])
            currentTweets = currentDataframe['tweets'].values
            currentBagOfWordsMatrix = bagOfWords(currentTweets, fasttextModel, vectorLength)
            predictedLabels = session.run(probabilityLabels, feed_dict={x:currentBagOfWordsMatrix})

            # Calculate frequency of labels. The loop variable must not be called
            # `prediction`: at notebook level that name is the network's output tensor,
            # and overwriting it breaks any cell that is re-run afterwards.
            labelCount = {name: 0 for name in categoryNames}
            for predictedLabel in predictedLabels:
                labelCount[categoryNames[predictedLabel]] += 1

            total = float(len(currentDataframe))
            interestDictionary = {key: value / total for key, value in labelCount.items()}
            # Category with the highest share; on a tie the first one in categoryNames wins
            mainInterest = max(interestDictionary.items(), key=operator.itemgetter(1))[0]
            print("Success : %s" % userName)

        except Exception as error:
            # Best effort: a user whose file is missing or unreadable is reported
            # and still written to the output with the default values
            print("Error : %s for %s" % (error, userName))

    writer.writerow([line[0], line[1], line[2], interestDictionary['politics'],
                     interestDictionary['art'], interestDictionary['society'],
                     interestDictionary['charity'], mainInterest])

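### The actual output is too long, so only a sample is included (the sample comes from an earlier version of the cell, so its format differs from the current print statements)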
User : philobuny
User : sabankardas
User : ahmet_ors
User : htcugurist
User : Marmaraaa
User : Donoughtella
User : usamedegirmenci
Error : File b'data/clean/usamedegirmenci_cleaned.csv' does not exist
User : SehirClubs
User : ozhanneslihan
Error : File b'data/clean/ozhanneslihan_cleaned.csv' does not exist
User : bvuslatcelik

In [19]:
# Close both files; closing outputFile also flushes the rows written by the csv writer
inputFile.close()
outputFile.close()

### Writing texts to train LSTM

In [50]:
# Re-open the user list for a second pass. newline='' is what the csv module asks
# for file objects, so that quoted fields containing line breaks are read correctly.
inputFile = open('data/graph/node.csv', 'rt', newline='')
reader = csv.reader(inputFile)

In [None]:
# Collect the ids (line[1]) of the users per main interest. Users predicted as
# "society" are collected in dailyList.
politicsList = list()
artList = list()
dailyList = list()
charityList = list()

# Main interest -> list that receives the ids of the users with that interest
idListByInterest = {"politics": politicsList, "art": artList,
                    "society": dailyList, "charity": charityList}

# Position in this tuple = class index produced by the network (order of labelMapper)
categoryNames = ("politics", "art", "society", "charity")

for line in reader:
    userName = line[2]

    if userName != "n/a":
        try:
            # Load user tweets and assign a label to each of them
            currentDataframe = pd.read_csv("data/clean/" + userName + '_cleaned.csv', sep=',', header=None, names=["label","tweets"])
            currentTweets = currentDataframe['tweets'].values
            currentBagOfWordsMatrix = bagOfWords(currentTweets, fasttextModel, vectorLength)
            predictedLabels = session.run(probabilityLabels, feed_dict={x:currentBagOfWordsMatrix})

            # Calculate frequency of labels. The loop variable must not be called
            # `prediction`: at notebook level that name is the network's output tensor.
            labelCount = {name: 0 for name in categoryNames}
            for predictedLabel in predictedLabels:
                labelCount[categoryNames[predictedLabel]] += 1

            total = float(len(currentDataframe))
            interestDictionary = {key: value / total for key, value in labelCount.items()}
            # Category with the highest share; on a tie the first one in categoryNames wins
            mainInterest = max(interestDictionary.items(), key=operator.itemgetter(1))[0]

            # int() raises ValueError for a non-numeric id; the except below reports it
            idListByInterest[mainInterest].append(int(line[1]))
            print("Success : %s" % userName)

        except Exception as error:
            # Best effort: report the user and continue with the next one
            print("Error : %s for %s" % (error, userName))

# Turn the id lists into sets (duplicates removed, fast membership tests).
# The misspelled names politicssList / arttList raised a NameError here.
politicsList = set(politicsList)
artList = set(artList)
dailyList = set(dailyList)
charityList = set(charityList)

### Since the actual output is too long, I just included a sample
Success : htcugurist
Success : Marmaraaa
Success : Donoughtella
Error : File b'data/clean/usamedegirmenci_cleaned.csv' does not exist for usamedegirmenci
Success : SehirClubs
Success : bvuslatcelik
Success : zepaltinbas
Success : mhmmtmz
Success : ssuheyl
Error : File b'data/clean/zeynepkoyuncu__cleaned.csv' does not exist for zeynepkoyuncu_

In [53]:
# The user list has been read completely; release the file handle
inputFile.close()

In [54]:
# Load saved userTweetDictionary. The file holds a single Python object, returned by .item().
# allow_pickle=True is required by NumPy >= 1.16.3 to load such object arrays.
# SECURITY: unpickling can execute arbitrary code — only load a file you created yourself.
# NOTE(review): presumably a dict mapping user id -> list of tweet records; confirm against
# the code that saved the file.
userTweetDictionary = np.load('data/dictionary/sehirTweets.npy', allow_pickle=True).item()

In [61]:
def _dropRetweetMarker(text):
    """Strip surrounding whitespace and remove one leading "RT" token from text."""
    text = text.strip()
    # Only the whole word "RT" at the very beginning is removed. str.strip("RT ")
    # would treat its argument as a set of characters and eat every leading or
    # trailing 'R', 'T' and ' ', damaging ordinary words that start or end with them.
    if text == "RT" or text.startswith("RT "):
        text = text[2:].lstrip()
    return text


def writeTweets(categoryList, dataName):
    """Write the cleaned Turkish tweets of the given users to a training text file.

    Parameters
    ----------
    categoryList : container
        Ids of the users whose tweets are exported; only membership tests
        (``userID in categoryList``) are performed on it.
    dataName : str
        Name of the sub-directory of ``data/text`` whose ``input.txt`` is
        (over)written. The directory has to exist already.

    The tweets are taken from the notebook-level ``userTweetDictionary``. Each
    exported tweet is cleaned with ``pre.clean`` (URL, emoji, smiley and
    mention options), reduced to Latin-script characters word by word, freed
    from a leading "RT" marker, lower-cased and written on its own line,
    terminated by a full stop.
    """
    # Clean Tweets
    pre.set_options(pre.OPT.URL, pre.OPT.EMOJI, pre.OPT.SMILEY, pre.OPT.MENTION)

    # Lines are collected in a list and joined once, which avoids the repeated
    # copying that growing one big string with += causes
    outputLines = []
    for userID in userTweetDictionary:
        if userID not in categoryList:
            continue
        for tweet in userTweetDictionary[userID]:
            # NOTE(review): tweet[0] looks like the tweet text and tweet[5] like its
            # language code — confirm against the code that built userTweetDictionary
            if tweet[5] != 'tr':
                continue
            cleanedTweet = pre.clean(tweet[0])
            # Raw string: '\p' is not a valid escape sequence in a normal string literal
            latinWords = [regex.sub(r'[^\p{Latin}]', '', word) for word in cleanedTweet.split()]
            outputLines.append(_dropRetweetMarker(" ".join(latinWords)).lower() + ".\n")

    # UTF-8 keeps the Turkish letters writable regardless of the platform's default
    # encoding; the with-block closes the file even when writing fails
    with open("data/text/" + dataName + "/input.txt", "wt", encoding="utf-8") as textFile:
        textFile.write("".join(outputLines))

In [62]:
# Export one training text per interest category
for categoryUsers, categoryName in ((politicsList, "politics"),
                                    (artList, "art"),
                                    (dailyList, "daily"),
                                    (charityList, "charity")):
    writeTweets(categoryUsers, categoryName)