In [46]:
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC


In [47]:
# (1) preprocessing - reading the dataset
# Function for reading the dataset file
def read_data(file):
    data = []
    with open(file, 'r') as f:
        for line in f:
            line = line.strip()
            label = ' '.join(line[1:line.find("]")].strip().split())
            text = line[line.find("]")+1:].strip()
            data.append([label, text])
    return data

# File name
file = "c:\\Users\Sai raj\Desktop\My Project\dataset.txt"
data = read_data(file)
#print(data[:10])


In [48]:
# (2) preprocessing - removing the stopwords
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

nltk.download('stopwords')
stop_words = stopwords.words('english')

# Function for removing the stop word from the text
def remove_stop_words(data):
    sentences = ""
    data = data.split('\n')
    #print(data)
    for text in data:
        # Replacing each special character and numbers with a space
        text_alphanum = re.sub('[^a-z]', ' ', text)
        word_tokens = word_tokenize(text_alphanum)
        
        # Removing stop words
        sentence = ' '.join([w for w in word_tokens if (w not in stop_words)])
        sentences += sentence + "\n"
        #print(sentence)
        
    return sentence




[nltk_data] Downloading package stopwords to C:\Users\Sai
[nltk_data]     raj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [49]:
# (3) preprocessing - text normalization using lemmatization
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# word lemmatization (Normalization)
def noun_lemmatizer(sentences):
    # Init the Wordnet Lemmatizer
    lemmatizer = WordNetLemmatizer()
    sentences = sentences.split('\n')
    #print(sentences)
    lem_text = ''
    for line in sentences:
        #print(line)
        word_tokens = word_tokenize(line)
        sentence = ' '.join([lemmatizer.lemmatize(w, 'n') for w in word_tokens])
        lem_text += sentence + '\n'
    return lem_text

[nltk_data] Downloading package wordnet to C:\Users\Sai
[nltk_data]     raj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [50]:
# Function for generating ngrams of words 
def ngram(token, n):
    output = []
    for i in range(n-1, len(token)):
        ngram = ' '.join(token[i-n+1:i+1])
        output.append(ngram)
        #print(output)
    return output

# Function for creating feature
def create_feature(text, nrange=(1,1)):
    text_features = []
    text = text.lower()
    ###################
    #print(text)
    text = remove_stop_words(text)
    #print(text)
    text = noun_lemmatizer(text)
    #print(text)
    ###################
    text_alphanum = text
    #print(text_alphanum)
    for n in range(nrange[0], nrange[1]+1):
        text_features += ngram(text_alphanum.split(), n)
    text_punc = re.sub('[a-z0-9]', ' ', text)
    #print(text_punc)
    text_features += ngram(text_punc.split(), 1)
    #print(Counter(text_features))
    return Counter(text_features)

In [51]:
def convert_label(item, name):
    #print(item)
    items = list(map(float, item.split()))
    label = ""
    for idx in range(len(items)):
        if items[idx] == 1:
            label += name[idx] + " "
    return label.strip()

emotions = ["joy", "fear", "anger", "sadness", "disgust", "shame", "guilt"]

X_all = []
Y_all = []
for label, text in data:
    Y_all.append(convert_label(label, emotions))
    X_all.append(create_feature(text, nrange=(1, 4)))


In [52]:
from sklearn import preprocessing
def labels_to_numbers(lables):
    le = preprocessing.LabelEncoder()
    le.fit(s)
    li = le.transform(s)
    return list(li)


from sklearn.metrics import precision_score
#y_true = [4, 2, 0, 5, 1, 6, 3, 4, 2, 0, 5, 1, 6, 3, 4, 2, 0, 5, 1, 6, 3]
#y_pred = [4, 2, 0, 5, 1, 6, 3, 4, 2, 0, 5, 1, 6, 7, 4, 2, 0, 5, 1, 6, 3]
#print(precision_score(y_true, y_pred, average='micro'))

def find_precision(y_true, y_pred):
    y_true = labels_to_numbers(y_true)
    y_pred = labels_to_numbers(y_pred)
    result = precision_score(y_true, y_pred, average='micro')
    return result


#y_true = ["joy", "fear", "anger", "sadness", "disgust", "shame", "guilt", "sadness", "disgust", "shame", "guilt"]
#y_pred = ["joy", "fear", "anger", "sadness", "disgust", "shame", "guilt", "sadness", "disgust", "shame", "guilt"]

#print(find_precision(y_true, y_pred))

In [53]:
#(1) Spliting the dataset training-80% | testing-20%
X_train, X_test, y_train, y_test = train_test_split(X_all, Y_all, test_size = 0.2, random_state = 123)

#print(X_test)
#print(y_test[:100])

def train_test(clf, X_train, X_test, y_train, y_test):
    clf.fit(X_train, y_train)
    train_acc = accuracy_score(y_train, clf.predict(X_train))
    test_acc = accuracy_score(y_test, clf.predict(X_test))
    precision = find_precision(y_test, clf.predict(X_test))
    return train_acc, test_acc, precision



from sklearn.feature_extraction import DictVectorizer
vectorizer = DictVectorizer(sparse = True)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)
#print(X_test.toarray())

In [54]:
from sklearn.svm import SVC
linear_svm = SVC(kernel='linear')
rbf_svm = SVC(kernel ='rbf', random_state = 0)
poly_svm = SVC(kernel='poly', degree=8)
sigmoid_svm = SVC(kernel ='sigmoid')

clifs = [linear_svm, rbf_svm, poly_svm, sigmoid_svm]
kernals = ["Linear SVM", "Radial basis function", "Polynomial function", "Sigmoid function"]

# train and test them 
print("| {:25} | {} | {} | {} |".format("SVM Kernals", "Training Accuracy", "Test Accuracy", "Precision"))
print("| {} | {} | {} | {} |".format("-"*25, "-"*17, "-"*13, "-"*13))
i=0
for clf in clifs: 
    clf_name = clf.__class__.__name__
    train_acc, test_acc, precision = train_test(clf, X_train, X_test, y_train, y_test)
    print("| {:25} | {:17.7f} | {:13.7f} | {:13.7f} |".format(kernals[i], train_acc, test_acc, precision))
    i = i+1

| SVM Kernals               | Training Accuracy | Test Accuracy | Precision |
| ------------------------- | ----------------- | ------------- | ------------- |
| Linear SVM                |         0.9953066 |     0.8147700 |     1.0000000 |
| Radial basis function     |         0.9653293 |     0.7935835 |     1.0000000 |
| Polynomial function       |         0.5417108 |     0.3728814 |     1.0000000 |
| Sigmoid function          |         0.9233914 |     0.7475787 |     1.0000000 |


In [13]:
emoji_dict = {"joy":"ðŸ˜‚", "fear":"ðŸ˜±", "anger":"ðŸ˜ ", "sadness":"ðŸ˜¢", "disgust":"ðŸ˜’", "shame":"ðŸ˜³", "guilt":"ðŸ˜³"}
txt1 = "I was thinking about death"
txt2 = "you are awsome"
txt3 = "I crashed my father's car"
txt4 = "i'm very good today"
texts = [txt1, txt2, txt3, txt4]
for text in texts: 
    features = create_feature(text, nrange=(1, 4))
    features = vectorizer.transform(features)
    prediction = clf.predict(features)[0]
    print(prediction)
    print( text,emoji_dict[prediction])

sadness
I was thinking about death ðŸ˜¢
disgust
you are awsome ðŸ˜’
fear
I crashed my father's car ðŸ˜±
joy
i'm very good today ðŸ˜‚


In [39]:
from sklearn import preprocessing
def labels_to_numbers(lables):
    le = preprocessing.LabelEncoder()
    le.fit(s)
    li = le.transform(s)
    return list(li)

In [40]:
from sklearn.metrics import precision_score
#y_true = [4, 2, 0, 5, 1, 6, 3, 4, 2, 0, 5, 1, 6, 3, 4, 2, 0, 5, 1, 6, 3]
#y_pred = [4, 2, 0, 5, 1, 6, 3, 4, 2, 0, 5, 1, 6, 7, 4, 2, 0, 5, 1, 6, 3]
#print(precision_score(y_true, y_pred, average='micro'))

def find_precision(y_true, y_pred):
    y_true = labels_to_numbers(y_true)
    y_pred = labels_to_numbers(y_pred)
    result = precision_score(y_true, y_pred, average='micro')
    return result


#y_true = ["joy", "fear", "anger", "sadness", "disgust", "shame", "guilt", "sadness", "disgust", "shame", "guilt"]
#y_pred = ["joy", "fear", "anger", "sadness", "disgust", "shame", "guilt", "sadness", "disgust", "shame", "guilt"]

#print(find_precision(y_true, y_pred))

1.0
