In [1]:
import json
import requests
import pandas as pd
import numpy as np
import re
from transformers import AutoTokenizer, AutoModel, pipeline, AutoModelForSequenceClassification
import numpy as np
import fasttext.util
from transformers import logging
import pickle
import math
from sklearn.preprocessing import normalize

## Functions

### Read the data of every candidate and create a new DataFrame with an equal number of negative and non-negative posts

In [2]:
def create_data(candidates=None, data_dir='Data', n_non_negative=540):
    """Load the per-candidate CSV files and build a class-balanced DataFrame.

    Every file is read from ``{data_dir}/{city}/{file_name}``. Only rows whose
    ``Elections`` column is True are kept. All rows with ``negative == True``
    are retained, and the rows with ``negative == False`` are shuffled and
    capped at ``n_non_negative`` so the two classes have a similar size.

    Args:
        candidates (list of (str, str), optional): ``(city, csv_file_name)``
            pairs. Defaults to the built-in list of candidates below.
        data_dir (str, optional): Root folder holding one sub-folder per city.
        n_non_negative (int, optional): Maximum number of non-negative rows
            to keep after shuffling.

    Returns:
        pandas.DataFrame: Shuffled negative rows followed by the shuffled,
        capped non-negative rows, with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``candidates`` is empty (nothing to concatenate).
    """
    if candidates is None:
        # (city folder, CSV file name) for every candidate in the study.
        candidates = [
            ('Ashdod','Eli Lachmani.csv'),
            ('Ashdod','Helen Gelber.csv'),
            ('Ashdod','Shimon Keznelson.csv'),
            ('Ashdod','Yechiel Lasry.csv'),
            ('Ashkelon','Itai Sahar.csv'),
            ('Ashkelon','Itamar Shimoni.csv'),
            ('Ashkelon','Tomer Glam.csv'),
            ('Beer Sheva','efraim_painbloom.csv'),
            ('Dimona','Nisim Perez.csv'),
            ('Haifa','Israel Savion.csv'),
            ('Herzliya','Eyal Fabian.csv'),
            ('Herzliya','Maya Katz.csv'),
            ('Hod Hasharon','Amir_Kochavi.csv'),
            ('Hod Hasharon','Ifat_Kariv.csv'),
            ('Jerusalem','Ofer_Berkovitch.csv'),
            ('Jerusalem','Yossi_Daitsh.csv'),
            ('Jerusalem','zeev elkin.csv'),
            ('Netanya','Efraim_Bulmash.csv'),
            ('Netanya','Herzel Keren.csv'),
            ('Netanya','Ofer Orenshtein.csv'),
            ('Netanya','Yoni Chetboun.csv'),
            ('Petah Tikva','genadi_borshavski.csv'),
            ('Petah Tikva','itzik_braverman.csv'),
            ('Petah Tikva','Rami_Greenberg.csv'),
            ('Petah Tikva','shonshine_itai.csv'),
            ('Rishon LeZion','Raz_Kinstlich.csv'),
            ('Tel Aviv','Ron Huldai.csv')
        ]

    # Read every file first and concatenate once: DataFrame.append was removed
    # in pandas 2.0, and appending inside the loop copies the frame each time.
    frames = []
    for city, candidate in candidates:
        path = f'{data_dir}/{city}/{candidate}'
        frames.append(pd.read_csv(path, encoding='utf8'))
        print(f'{city}-{candidate}-- DONE')
    data = pd.concat(frames)

    # Keep election-related posts only. The explicit `== True` / `== False`
    # comparisons also drop rows where the flag is missing.
    election_data = data[data['Elections'] == True]

    # Shuffle each class; the negative class is sampled first, as before, so
    # the order of random draws is unchanged.
    negative = election_data[election_data['negative'] == True]
    negative = negative.sample(frac=1).reset_index(drop=True)
    non_negative = election_data[election_data['negative'] == False]
    non_negative = non_negative.sample(frac=1).reset_index(drop=True)
    non_negative = non_negative.iloc[:n_non_negative]

    return pd.concat([negative, non_negative], ignore_index=True)

### Initializing Fasttext model

In [1]:
def init_FastText():
    """Download (if necessary) and load the Hebrew fastText model.

    Returns:
        The model object returned by ``fasttext.load_model('cc.he.300.bin')``.
    """
    # Request the Hebrew ('he') model; if_exists='ignore' is handed to the
    # downloader as-is.
    fasttext.util.download_model('he', if_exists='ignore')

    hebrew_model = fasttext.load_model('cc.he.300.bin')
    print("----FastText DONE----")
    return hebrew_model

### Create vectors list for tokens list

In [4]:
def sentenc_onlp_bert(text):
    """Clean a piece of text and extract its noun/verb tokens via the yap parser.

    Punctuation, Latin letters and digits are stripped, the remaining text is
    sent to the yap server running on localhost:8000, and the 'md_lattice'
    field of the JSON response is parsed.

    Args:
        text (string): Text/Post (or a fragment of one).

    Returns:
        list of string: The fourth field of every lattice row whose sixth
        field is 'NN' or 'VB', with '.' and ':' removed.
        NOTE(review): in yap's lattice format these appear to be the lemma
        and the POS tag - confirm against the yap documentation.
    """
    # Punctuation characters removed before parsing.
    punctuation = ['.','"','!','?','[',']','-','+','\\',"/",">","<",';',':','–','(',')','=']
    for p in punctuation:
        text = text.replace(p,'')

    # Remove Latin letters and digits.
    text = re.sub(r'[a-zA-Z0-9]+', '', text)

    # Build the request body with json.dumps so that newlines, tabs or other
    # control characters left in the text are escaped and the body is always
    # valid JSON. The text still ends with two space characters, as before.
    localhost_yap = "http://localhost:8000/yap/heb/joint"
    data = json.dumps({'text': text + '  '}, ensure_ascii=False).encode('utf-8')
    headers = {'content-type': 'application/json'}
    response = requests.get(url=localhost_yap, data=data, headers=headers)
    json_response = response.json()

    # One tab-separated lattice row per line.
    tokens = []
    for row in json_response['md_lattice'].split('\n'):
        fields = row.split('\t')
        # Skip blank separator lines and rows too short to hold the six
        # fields that are read below.
        if fields[0] == '' or len(fields) < 6:
            continue
        if fields[5] == 'NN' or fields[5] == 'VB':
            tokens.append(fields[3].replace('.','').replace(':',''))
    return tokens

In [3]:
def to_vector(tokens):
    """Look up a word vector for each token.

    Uses the notebook-level ``ft`` model, which must already be defined when
    this function is called.

    Args:
        tokens (list of string): Tokens of one post.

    Returns:
        list: One ``ft.get_word_vector(token)`` result per token, in order.
    """
    vectors = [ft.get_word_vector(token) for token in tokens]
    print('----Sentence Vector Done----')
    return vectors

In [6]:
def create_token_list(text):
    """Tokenise a whole post by running yap on one sentence at a time.

    The post is split on '.'; any sentence longer than 250 characters is
    split again on ',' because yap might freeze on very long input.

    Args:
        text (string): Text/Post (converted with ``str`` first).

    Returns:
        list of string: Tokens of all fragments, in order of appearance.
    """
    collected = []
    for sentence in str(text).split('.'):
        # Over-long sentences are broken into comma-separated fragments.
        fragments = sentence.split(',') if len(sentence) > 250 else [sentence]
        for fragment in fragments:
            collected.extend(sentenc_onlp_bert(fragment))
    print('----Create Tokens Done----')
    return collected

### Bert assessment

In [7]:
# Shared heBERT pipeline, built on first use. Constructing it loads the model,
# so it must not be rebuilt for every post.
_SENTIMENT_PIPELINE = None


def _get_sentiment_pipeline():
    """Return the shared sentiment pipeline, creating it on the first call."""
    global _SENTIMENT_PIPELINE
    if _SENTIMENT_PIPELINE is None:
        _SENTIMENT_PIPELINE = pipeline(
            "sentiment-analysis",
            model="heBERT_sentiment_analysis",
            tokenizer="heBERT_sentiment_analysis",
            return_all_scores = True
        )
    return _SENTIMENT_PIPELINE


def bert_sentiment_assessment(text):
    """Clean a post and score its sentiment with the heBERT pipeline.

    Punctuation (except '.'), Latin letters and digits are removed. Text of
    more than 512 characters is split on '.' and the scores of the non-blank
    segments are averaged; shorter text is scored in one call.

    Args:
        text (string): Text/Post (converted with ``str`` first).

    Returns:
        list of float: The three scores in the order the pipeline returns
        them, which the original code labels [neutral, positive, negative].
    """
    sentiment_analysis = _get_sentiment_pipeline()

    # Same coercion as create_token_list, so non-string cells do not crash.
    text = str(text)

    # Punctuation to strip; '.' is kept because it is the split delimiter.
    punctuation = ['"','!','?','[',']','-','+','\\',"/",">","<",';',':','–','(',')','=']
    for p in punctuation:
        text = text.replace(p,'')

    # Remove Latin letters and digits.
    text = re.sub(r'[a-zA-Z0-9]+', '', text)

    # Long input is scored sentence by sentence. Note that this threshold
    # counts characters, not model tokens.
    if len(text) > 512:
        # Blank segments (e.g. after a trailing '.') carry no content and
        # would otherwise pull the average toward the score of empty input.
        segments = [segment for segment in text.split('.') if segment.strip()]
        if segments:
            totals = [0.0, 0.0, 0.0]
            for segment in segments:
                scores = sentiment_analysis(segment)[0]
                for position in range(3):
                    totals[position] += scores[position]['score']
            print("----Bert Assessment Done----")
            return [total / len(segments) for total in totals]

    sentiment = sentiment_analysis(text)
    sentiment_arr = [item['score'] for item in sentiment[0]]
    print("----Bert Assessment Done----")
    return sentiment_arr

## Preprocessing

### Creating Dataset

In [8]:
# Read every candidate CSV and build the class-balanced pandas DataFrame
# that all later cells extend with new columns.
election_data = create_data()

Ashdod-Eli Lachmani.csv-- DONE
Ashdod-Helen Gelber.csv-- DONE
Ashdod-Shimon Keznelson.csv-- DONE
Ashdod-Yechiel Lasry.csv-- DONE
Ashkelon-Itai Sahar.csv-- DONE
Ashkelon-Itamar Shimoni.csv-- DONE
Ashkelon-Tomer Glam.csv-- DONE
Beer Sheva-efraim_painbloom.csv-- DONE
Dimona-Nisim Perez.csv-- DONE
Haifa-Israel Savion.csv-- DONE
Herzliya-Eyal Fabian.csv-- DONE
Herzliya-Maya Katz.csv-- DONE
Hod Hasharon-Amir_Kochavi.csv-- DONE
Hod Hasharon-Ifat_Kariv.csv-- DONE
Jerusalem-Ofer_Berkovitch.csv-- DONE
Jerusalem-Yossi_Daitsh.csv-- DONE
Jerusalem-zeev elkin.csv-- DONE
Netanya-Efraim_Bulmash.csv-- DONE
Netanya-Herzel Keren.csv-- DONE
Netanya-Ofer Orenshtein.csv-- DONE
Netanya-Yoni Chetboun.csv-- DONE
Petah Tikva-genadi_borshavski.csv-- DONE
Petah Tikva-itzik_braverman.csv-- DONE
Petah Tikva-Rami_Greenberg.csv-- DONE
Petah Tikva-shonshine_itai.csv-- DONE
Rishon LeZion-Raz_Kinstlich.csv-- DONE
Tel Aviv-Ron Huldai.csv-- DONE


In [16]:
# Display the DataFrame. The saved output below already shows the Tokens, bert
# and Vectors columns, which are only created by later cells, so this cell was
# last executed out of order.
election_data

Unnamed: 0,name,time,post,year,location,party,labor,geographical_area,negative,target,Elections,Tokens,bert,Vectors
0,Rami_Greenberg,14/05/2018 23:13,בזמן שאני במילואים משרת את המדינה אני מקבל עדכ...,2018.0,Petah Tikva,הליכוד,Right,Center,True,,True,"[זמן, במילואים, מדינה, עדכון, שטח, עבריין, נשל...","[0.16135516462782107, 0.17096072217797578, 0.6...","[[6.93087868923982e-310, 6.93087868923982e-310..."
1,Itai_Sahar,17/07/2018 21:31,ראיון ברשת ב' לגבי הזנחת מיגון השכונות הוותיקו...,2018.0,Ashkelon,"תקווה חדשה,מופת",Right,South,True,,True,"[ראיון, ב', שכונה, התחממות, שבת, הפתיע, מוקד, ...","[0.12134942523386728, 0.21913829266070803, 0.6...","[[6.93087868924694e-310, 6.93087868924694e-310..."
2,Itai_Sahar,25/09/2018 13:51,החיבור בין הנהגה עירונית להנהגה רוחנית באשקלון...,2018.0,Ashkelon,"תקווה חדשה,מופת",Right,South,True,,True,"[חיבור, הנהגה, הנהגה, גבול, תמיכה, תמיכה, דעת,...","[0.060134747054776415, 0.21581373673541834, 0....","[[6.9308786892604e-310, 6.9308786892604e-310, ..."
3,Efraim Bulmash,17/09/2018 19:44,"אם אני הייתי ראש עיריית קריית גת, כבוד הרב היה...",2018.0,Netanya,העבודה,Left,Center,True,,True,"[רב, רב, בלין,, התנצל, התפלל, כיפור, סליחה, עד...","[5.125870666233823e-05, 6.046074122423306e-05,...","[[6.9308786892185e-310, 6.9308786892185e-310, ..."
4,Efraim Bulmash,26/06/2018 08:19,קאט דה בולשיט שאנז אליזה ברחוב הרצל! קודם תנ...,2018.0,Netanya,העבודה,Left,Center,True,,True,"[רחוב, תנגישו, עיר, מערך, ייצר, חנייה, כלל, חנ...","[5.993927698000334e-05, 4.033872755826451e-05,...","[[6.93087868922006e-310, 6.93087868922006e-310..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1075,Shimon Keznelson,28/06/2018 14:53,"מוטי אלבז היקר תודה רבה על האירוח הלבבי, תודה ...",2018.0,Ashdod,אשדוד ביחד,,South,False,,True,"[תודה, אירוח, הודה, תושב, עיר, הגיע, הביע, תמי...","[0.0001437089085811749, 0.9997618794441223, 9....","[[-0.00192866544239223, 0.17145498096942902, 0..."
1076,Itzik Braverman,21/06/2018 09:46,הודעתי אתמול על הקמת יחידה לפיקוח על הגנים הפר...,2018.0,Petah Tikva,"אכפ""ת,העבודה",Left,Center,False,,True,"[הודיע, יחידה, פיקוח, גן, ערב, התקיים, מפגש, פ...","[0.10708105028121888, 0.8066147193312645, 0.08...","[[-0.00192866544239223, 0.17145498096942902, 0..."
1077,Raz Kinstlich,13/08/2018 08:45,"מעין פלח שדה, 50, בעלת תואר שני במנהל עסקים. ...",2018.0,Rishon LeZion,תקווה חדשה,Right,Center,False,,True,"[תואר, מנהל, עסק, רכש, מפעל, הייטק, דירקטור, מ...","[0.31750033317366616, 0.676763616874814, 0.005...","[[-0.00192866544239223, 0.17145498096942902, 0..."
1078,Ifat Kariv,28/10/2018 19:05,לא מבזבזים את הקול! לתיבות הדואר שלכם הגיעה ...,2018.0,Hod Hasharon,,,Center,False,,True,"[קול, דואר, הגיע, חובר, מידע, בחירה, מידע, תוכ...","[0.161508748215662, 0.3438450593712332, 0.4946...","[[-0.00192866544239223, 0.17145498096942902, 0..."


### For every post, create a token list

In [12]:
# Run the yap-based tokeniser on every post, in row order.
election_data['Tokens'] = list(map(create_token_list, election_data['post']))

----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
----Create Tokens Done----
-

### For every post, estimate the BERT sentiment

In [14]:
# Score the sentiment of every post, in row order.
election_data['bert'] = list(map(bert_sentiment_assessment, election_data['post']))

----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Assessment Done----
----Bert Asses

### For every post, create a list of vectors from its token list

In [4]:
# Load the fastText model into the notebook-level name `ft`, which to_vector()
# reads as a global; this cell must run before any call to to_vector().
ft = init_FastText()



----FastText DONE----


In [12]:
# Turn every post's token list into a list of word vectors, in row order.
election_data['Vectors'] = list(map(to_vector, election_data['Tokens']))

----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence Vector Done----
----Sentence V

### For every post, average its word vectors and normalize the result to get a single vector that represents the post

In [15]:
# Average each post's word vectors and scale the mean to unit length
# (normalize() is applied to the mean as a single column).
# NOTE(review): for a post with an empty vector list np.mean returns a scalar
# and the [:, np.newaxis] indexing would fail - confirm no such posts exist.
post_vectors = []
for word_vectors in election_data['Vectors']:
    mean_column = np.mean(word_vectors, axis=0)[:, np.newaxis]
    post_vectors.append(normalize(mean_column, axis=0).ravel())
election_data['vector'] = post_vectors

### For every post, concatenate the post vector with the BERT sentiment vector

In [16]:
# Append each post's BERT sentiment scores to its averaged word vector and
# scale the combined vector to unit length.
# The rows must come from `election_data`: the name `data` used previously only
# exists as a local variable inside create_data(), so on a fresh kernel the
# cell raised NameError.
election_data['vector_bert'] = [
    normalize(np.concatenate([row['vector'], row['bert']], axis=None)[:, np.newaxis], axis=0).ravel()
    for _, row in election_data.iterrows()
]

### For every post, create a TF-IDF vector

In [118]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Each post becomes one document: its tokens joined by single spaces.
token_documents = [' '.join(item) for item in election_data['Tokens']]
# TF-IDF vectorizer with IDF weighting enabled; any further settings go here.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
# Learn the vocabulary from all documents and transform them in one step.
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(token_documents)

In [121]:
# Take the TF-IDF row of the first document.
first_vector_tfidfvectorizer = tfidf_vectorizer_vectors[0]
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever the installed version provides.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()
# One row per vocabulary term, holding its TF-IDF weight in the first document.
df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(),
                  index=feature_names,
                  columns=["tfidf"])
df.sort_values(by=["tfidf"],ascending=False)

Unnamed: 0,tfidf
פחדן,0.357741
מאס,0.357741
עבריין,0.262197
שלט,0.235733
פתח,0.223166
...,...
זנה,0.000000
זמר,0.000000
זלזל,0.000000
זלזול,0.000000


In [124]:
# `vectorizer` was never defined anywhere in this notebook, so this cell raised
# NameError on a fresh kernel. Define it explicitly before fitting on the raw
# posts.
# NOTE(review): assumed to be a TF-IDF vectorizer, matching this section's
# title and the 'tfidf' column filled from X below - confirm.
vectorizer = TfidfVectorizer(use_idf=True)
X = vectorizer.fit_transform(election_data['post'])

In [134]:
# The sparse matrix reports the same (rows, columns) tuple directly, so there
# is no need to build a dense copy just to read its shape.
X.shape

(1073, 22896)

In [135]:
# Store one dense TF-IDF row (a 1-D array) per post.
election_data['tfidf'] = list(X.toarray())

## Final Dataframe and save

In [None]:
# Display the final DataFrame with all the feature columns added above.
election_data

In [None]:
# Persist the final DataFrame to 'Data.pkl' with pickle.
with open('Data.pkl', 'wb') as pickle_file:
    pickle.dump(election_data, pickle_file)