In [1]:
!pip install spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!python -m spacy download en_core_web_lg

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.1/en_core_web_lg-3.4.1-py3-none-any.whl (587.7 MB)
[K     |████████████████████████████████| 587.7 MB 16 kB/s 
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.4.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [3]:
import csv
from google.colab import drive
import pandas as pd
import re
import spacy
from nltk.stem import WordNetLemmatizer
import nltk
import numpy as np

In [4]:
nlp = spacy.load("en_core_web_lg", exclude=["tagger", "parser", "senter", "attribute_ruler", "lemmatizer", "ner"])
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
datapath = '/content/drive/MyDrive/NLP_Workspace/Data/'

with open(datapath + 'emoji_sentiment.csv','r') as file:     ### file path to be changed
  csvreader = csv.reader(file)
  emoji = np.array(list(csvreader))

with open(datapath + 'emoticons.csv','r') as file:
  csvreader = csv.reader(file)
  emoticon = np.array(list(csvreader))

In [6]:
print(emoji[0])
print(emoticon[0])

['Emoji' 'Unicode codepoint' 'Occurrences' 'Position' 'Negative' 'Neutral'
 'Positive' 'Unicode name' 'Unicode block']
['Emoticon' 'Similar_Emoji' 'Description']


In [7]:
with open(datapath + 'balanced_data.csv','r') as file:
  csvreader = csv.reader(file)
  data = list(csvreader)

In [8]:
emoji = emoji[1:]
emoticon = emoticon[1:]

In [9]:
data[:10]

[['type', 'text', 'word_num', 'label'],
 ['ENFJ',
  "I went through a break up some months ago. We were together for 4 years and I had planned my life around that relationship. I wasn't the one breaking the relationship as you might imagine and all our...",
  '38',
  '0'],
 ['ENFJ', 'ENFJ Puns so many puns.', '5', '0'],
 ['ENFJ',
  "Well I personally don't go that much for attractiveness in general but I can see you have the will to change that and that's good already. May I ask if you want to be with them in a merely sexual...",
  '40',
  '0'],
 ['ENFJ',
  "Sorry, not an INFP but I'm really into post-rock so I had to post :<  Going to leave this here and sneak out D:",
  '24',
  '0'],
 ['ENFJ', 'Welcome!!!', '1', '0'],
 ['ENFJ', 'wrong thread D:', '3', '0'],
 ['ENFJ',
  "That doesn't sound very ENFJ I think x3 I'd never act cold towards a romantic interest. Those just get my warm side I think. When I like someone that way all I want to do is to just straight to them...",
  '41',
  '0'

In [10]:
text = np.array(data)[1:, 1]

In [11]:
nltk.download('wordnet')
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [12]:
def lemmatized(str):
    return [lemmatizer.lemmatize(token) for token in str.split()]


results = {}


# from name of found emoji, query emoji database to find its underlying positivity
def get_pos(name):
    mx = 0
    mx_pos = 0
    cur = 0
    mem = ''
    override = [0,0,0]
    if name in results:
      print("%s found in memory" % (name))
      return results.get(name)
    
    cos_name = nlp(" ".join(lemmatized(name.lower())))
    for i in emoji:
        query = lemmatized(i[7].lower())
        querystr = " ".join(query)
        cur = nlp(querystr).similarity(cos_name)
        if name in query and cur > override[1]:
            override = [querystr, cur, int(i[6]) / (int(i[4]) + int(i[5]) + int(i[6]))]
            continue
        if mx < cur:
            mem = querystr
            mx = cur
            mx_pos = int(i[6]) / (int(i[4]) + int(i[5]) + int(i[6]))
    
    if override[1] > 0:
        print("most similar word to %s is %s with sim of %f, pos of %f." % (name, mem, mx, mx_pos))
        print("overriden to literal match at %s is %s with sim of %f, pos of %f." % (name, override[0], override[1], override[2]))
        results[name] = override[2]
        return mx_pos
    else:
        print("most similar word to %s is %s with sim of %f, pos of %f." % (name, mem, mx, mx_pos))
        results[name] = mx_pos
        return mx_pos


def emoji_featurize(data):
    count = 0
    start = 0
    pos = 0
    for m in re.finditer(':[a-zA-z\d_]+:', data):
        count += 1
        start += m.start()
        pos += get_pos(data[m.start()+1:m.end()-1])
    
    datalist = data.split()
    for i in emoticon:
        e = i[0]
        if len(e) < 1:
            continue
        for j in range(len(datalist)):
            if e == data[j]:
                count += 1
                start += data.index(data[j])
                pos += get_pos(i[2])


    if count == 0:
      return [0,0,0]

    return [count, start/count/len(data), pos/count]


def emoji_replace(data):
    data = re.sub(':[a-zA-z\d_]+:', '<emoji>', data)
    return data


In [13]:
emoji_feature = pd.DataFrame([], columns=['300', '301', '302'])
emoji_feature.head(0)

Unnamed: 0,300,301,302


In [14]:
for i in range(len(text)):
  emoji_feature.loc[i] = emoji_featurize(text[i])
  print(emoji_feature.loc[i])


300    0
301    0
302    0
Name: 0, dtype: object
300    0
301    0
302    0
Name: 1, dtype: object
300    0
301    0
302    0
Name: 2, dtype: object
300    0
301    0
302    0
Name: 3, dtype: object
300    0
301    0
302    0
Name: 4, dtype: object
300    0
301    0
302    0
Name: 5, dtype: object
300    0
301    0
302    0
Name: 6, dtype: object
300    0
301    0
302    0
Name: 7, dtype: object
300    0
301    0
302    0
Name: 8, dtype: object
300    0
301    0
302    0
Name: 9, dtype: object
300    0
301    0
302    0
Name: 10, dtype: object
300    0
301    0
302    0
Name: 11, dtype: object
300    0
301    0
302    0
Name: 12, dtype: object
300    0
301    0
302    0
Name: 13, dtype: object
300    0
301    0
302    0
Name: 14, dtype: object
300    0
301    0
302    0
Name: 15, dtype: object
300    0
301    0
302    0
Name: 16, dtype: object
300    0
301    0
302    0
Name: 17, dtype: object
300    0
301    0
302    0
Name: 18, dtype: object
300    0
301    0
302    0
Name: 19, dtyp

  cur = nlp(querystr).similarity(cos_name)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
300    0
301    0
302    0
Name: 199016, dtype: object
300    0
301    0
302    0
Name: 199017, dtype: object
300    0
301    0
302    0
Name: 199018, dtype: object
300    0
301    0
302    0
Name: 199019, dtype: object
300    0
301    0
302    0
Name: 199020, dtype: object
300    0
301    0
302    0
Name: 199021, dtype: object
300    0
301    0
302    0
Name: 199022, dtype: object
300    0
301    0
302    0
Name: 199023, dtype: object
300    0
301    0
302    0
Name: 199024, dtype: object
300    0
301    0
302    0
Name: 199025, dtype: object
300    0
301    0
302    0
Name: 199026, dtype: object
300    0
301    0
302    0
Name: 199027, dtype: object
300    0
301    0
302    0
Name: 199028, dtype: object
300    0
301    0
302    0
Name: 199029, dtype: object
300    0
301    0
302    0
Name: 199030, dtype: object
300    0
301    0
302    0
Name: 199031, dtype: object
300    0
301    0
302    0
Name: 199032, dtype: object


In [16]:
len(data)

200255

In [18]:
data[0]

['type', 'text', 'word_num', 'label']

In [21]:
df_data = pd.DataFrame(data[1:], columns=data[0])

In [22]:
df_data

Unnamed: 0,type,text,word_num,label
0,ENFJ,I went through a break up some months ago. We ...,38,0
1,ENFJ,ENFJ Puns so many puns.,5,0
2,ENFJ,Well I personally don't go that much for attra...,40,0
3,ENFJ,"Sorry, not an INFP but I'm really into post-ro...",24,0
4,ENFJ,Welcome!!!,1,0
...,...,...,...,...
200249,ISTP,I definitely drive manual cars more aggressive...,35,15
200250,ISTP,My name is doudeman (dudeman). I found out abo...,36,15
200251,ISTP,Well is there anyone out there that would work...,40,15
200252,ISTP,Stop masturbating sure is one unspoken thing t...,36,15


In [23]:
emoji_feature

Unnamed: 0,300,301,302
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0
...,...,...,...
200249,0,0,0
200250,0,0,0
200251,0,0,0
200252,0,0,0


In [25]:
processed = pd.concat([df_data, emoji_feature], axis=1)
processed

Unnamed: 0,type,text,word_num,label,300,301,302
0,ENFJ,I went through a break up some months ago. We ...,38,0,0,0,0
1,ENFJ,ENFJ Puns so many puns.,5,0,0,0,0
2,ENFJ,Well I personally don't go that much for attra...,40,0,0,0,0
3,ENFJ,"Sorry, not an INFP but I'm really into post-ro...",24,0,0,0,0
4,ENFJ,Welcome!!!,1,0,0,0,0
...,...,...,...,...,...,...,...
200249,ISTP,I definitely drive manual cars more aggressive...,35,15,0,0,0
200250,ISTP,My name is doudeman (dudeman). I found out abo...,36,15,0,0,0
200251,ISTP,Well is there anyone out there that would work...,40,15,0,0,0
200252,ISTP,Stop masturbating sure is one unspoken thing t...,36,15,0,0,0


In [26]:
processed.to_csv(datapath+"emoji_detected.csv", index = False)

In [15]:
emoji_feature.to_csv(datapath+'SVM_Binary_Labeled/16class_SVM.csv', index=False)

In [None]:
results.to_csv(datapath+'SVM_data/emojidict.csv', index=False)

AttributeError: ignored