In [1]:
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

In [2]:
# (1) preprocessing - reading the dataset
# Function for reading the dataset file
def read_data(file):
    data = []
    with open(file, 'r') as f:
        for line in f:
            line = line.strip()
            label = ' '.join(line[1:line.find("]")].strip().split())
            text = line[line.find("]")+1:].strip()
            data.append([label, text])
    return data

# File name
file = "c:\\Users\Sai raj\Desktop\My Project\dataprep.txt"
data = read_data(file)

print(f"Number of instances: {len(data)}")

Number of instances: 8706


In [3]:
# (2) preprocessing - removing the stopwords
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

nltk.download('stopwords')
stop_words = stopwords.words('english')

# Function for removing the stop word from the text
def remove_stop_words(data):
    sentences = ""
    data = data.split('\n')
    #print(data)
    for text in data:
        # Replacing each special character and numbers with a space
        text_alphanum = re.sub('[^a-z]', ' ', text)
        word_tokens = word_tokenize(text_alphanum)
        
        # Removing stop words
        sentence = ' '.join([w for w in word_tokens if (w not in stop_words)])
        sentences += sentence + "\n"
        #print(sentence)
        
    return sentence




[nltk_data] Downloading package stopwords to C:\Users\Sai
[nltk_data]     raj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# (3) preprocessing - text normalization using lemmatization
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# word lemmatization (Normalization)
def noun_lemmatizer(sentences):
    # Init the Wordnet Lemmatizer
    lemmatizer = WordNetLemmatizer()
    sentences = sentences.split('\n')
    #print(sentences)
    lem_text = ''
    for line in sentences:
        #print(line)
        word_tokens = word_tokenize(line)
        sentence = ' '.join([lemmatizer.lemmatize(w, 'v') for w in word_tokens])
        lem_text += sentence + '\n'
    return lem_text

[nltk_data] Downloading package wordnet to C:\Users\Sai
[nltk_data]     raj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# Function for generating ngrams of words 
def ngram(token, n):
    output = []
    for i in range(n-1, len(token)):
        ngram = ' '.join(token[i-n+1:i+1])
        output.append(ngram)
        #print(output)
    return output

# Function for creating feature
def create_feature(text, nrange=(1,1)):
    text_features = []
    text = text.lower()
    ###################
    #print(text)
    text = remove_stop_words(text)
    #print(text)
    text = noun_lemmatizer(text)
    #print(text)
    ###################
    text_alphanum = text
    #print(text_alphanum)
    for n in range(nrange[0], nrange[1]+1):
        text_features += ngram(text_alphanum.split(), n)
    text_punc = re.sub('[a-z0-9]', ' ', text)
    #print(text_punc)
    text_features += ngram(text_punc.split(), 1)
    #print(Counter(text_features))
    return Counter(text_features)

In [6]:
def convert_label(item, name):
    #print(item)
    items = list(map(float, item.split()))
    label = ""
    for idx in range(len(items)):
        if items[idx] == 1:
            label += name[idx] + " "
    return label.strip()

emotions = ["joy", "fear", "anger", "sadness", "disgust", "shame", "guilt"]

X_all = []
Y_all = []
for label, text in data:
    Y_all.append(convert_label(label, emotions))
    X_all.append(create_feature(text, nrange=(1, 4)))


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_all, Y_all, test_size = 0.2, random_state = 123)

def train_test(clf, X_train, X_test, y_train, y_test):
    clf.fit(X_train, y_train)
    train_acc = accuracy_score(y_train, clf.predict(X_train))
    test_acc = accuracy_score(y_test, clf.predict(X_test))
    return train_acc, test_acc

from sklearn.feature_extraction import DictVectorizer
vectorizer = DictVectorizer(sparse = True)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)
#print(y_train)

In [14]:
from sklearn.svm import SVC
linear_svm = SVC(kernel='linear')
rbf_svm = SVC(kernel ='rbf', random_state = 0)
poly_svm = SVC(kernel='poly', degree=8)
sigmoid_svm = SVC(kernel ='sigmoid')

clifs = [linear_svm, rbf_svm, poly_svm, sigmoid_svm]

# train and test them 
print("| {:25} | {} | {} |".format("Classifier", "Training Accuracy", "Test Accuracy"))
print("| {} | {} | {} |".format("-"*25, "-"*17, "-"*13))
for clf in clifs: 
    clf_name = clf
    train_acc, test_acc = train_test(clf, X_train, X_test, y_train, y_test)
    print("| {:25} SVM | {:17.7f} | {:13.7f} |".format(clf_name, train_acc, test_acc))

| Classifier                | Training Accuracy | Test Accuracy |
| ------------------------- | ----------------- | ------------- |


TypeError: unsupported format string passed to SVC.__format__

In [10]:
l = ["joy", 'fear', "anger", "sadness", "disgust", "shame", "guilt"]
l.sort()
label_freq = {}
for label, _ in data: 
    label_freq[label] = label_freq.get(label, 0) + 1

# print the labels and their counts in sorted order 
for l in sorted(label_freq, key=label_freq.get, reverse=True):
    print("{:10}({})  {}".format(convert_label(l, emotions), l, label_freq[l]))

joy       (1. 0. 0. 0. 0. 0. 0.)  1084
anger     (0. 0. 1. 0. 0. 0. 0.)  1080
sadness   (0. 0. 0. 1. 0. 0. 0.)  1079
fear      (0. 1. 0. 0. 0. 0. 0.)  1078
disgust   (0. 0. 0. 0. 1. 0. 0.)  1057
guilt     (0. 0. 0. 0. 0. 0. 1.)  1057
shame     (0. 0. 0. 0. 0. 1. 0.)  1045


In [11]:
emoji_dict = {"joy":"😂", "fear":"😱", "anger":"😠", "sadness":"😢", "disgust":"😒", "shame":"😳", "guilt":"😳"}
t1 = "This love you so much"
t2 = "I can't speak english becuase I'm poor in english"
t3 = "you are awsome"
t4 = "I love you anymore..!"

texts = [t1, t2, t3, t4]
for text in texts: 
    features = create_feature(text, nrange=(1, 4))
    features = vectorizer.transform(features)
    prediction = clf.predict(features)[0]
    print( text,emoji_dict[prediction])

love much

Counter({'love': 1, 'much': 1, 'love much': 1})
This love you so much 😂
speak english becuase poor english

Counter({'english': 2, 'speak': 1, 'becuase': 1, 'poor': 1, 'speak english': 1, 'english becuase': 1, 'becuase poor': 1, 'poor english': 1, 'speak english becuase': 1, 'english becuase poor': 1, 'becuase poor english': 1, 'speak english becuase poor': 1, 'english becuase poor english': 1})
I can't speak english becuase I'm poor in english 😢
awsome

Counter({'awsome': 1})
you are awsome 😳
love anymore

Counter({'love': 1, 'anymore': 1, 'love anymore': 1})
I love you anymore..! 😂
