In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
all_tweets = pd.read_csv("data/emojis.csv")

import re
from emot.emo_unicode import UNICODE_EMO, EMOTICONS

emoji_counts = all_tweets['emoji'].value_counts().head(20)
emoji_counts = emoji_counts.to_frame()
emoji_list = np.array(emoji_counts.index)

df = all_tweets.loc[all_tweets['emoji'].isin(emoji_list)]
pattern = r'(http://[^"\s]+)|(@\w+)|(:)|([^\w\d\s\.\?\!])'

df.loc[:, "text"] = df.text.str.replace(pattern, "").values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [3]:
actual_emoji = df['emoji'].value_counts().index.values
label_emoji_mapping = dict([(label, emoji) for label, emoji in zip(range(20), actual_emoji)])
emoji_label_mapping = dict([(emoji, label) for label, emoji in label_emoji_mapping.items()])

In [4]:
from sklearn.model_selection import train_test_split


train_sentences, test_sentences, train_emojis, test_emojis = train_test_split(df['text'], df['emoji'], 
                                                                              stratify = df['emoji'], random_state = 3011)

# vectorization

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

VOCAB_SIZE = 10000 # to prevent MemoryError
vectorizer = TfidfVectorizer(stop_words = 'english', max_features = VOCAB_SIZE)

X_train = vectorizer.fit_transform(train_sentences)
X_test = vectorizer.transform(test_sentences)

print(X_train.shape, X_test.shape)

(321638, 10000) (107213, 10000)


In [6]:
y_train, y_test = train_emojis.replace(emoji_label_mapping), test_emojis.replace(emoji_label_mapping)

In [7]:
import time
from sklearn import svm
from sklearn.metrics import classification_report
# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(X_train, y_train)
t1 = time.time()
prediction_linear = classifier_linear.predict(X_test)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

In [14]:
from sklearn.metrics import classification_report
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(y_test, prediction_linear)

Training time: 20969.143530s; Prediction time: 1928.677368s


In [15]:
print(report)

              precision    recall  f1-score   support

           0       0.43      0.90      0.58     31206
           1       0.44      0.56      0.49     10805
           2       0.50      0.52      0.51     10142
           3       0.49      0.24      0.32      8929
           4       0.54      0.25      0.34      5019
           5       0.39      0.06      0.10      4491
           6       0.44      0.12      0.18      4058
           7       0.67      0.47      0.56      3863
           8       0.61      0.11      0.18      3855
           9       0.55      0.08      0.14      3006
          10       0.68      0.19      0.30      2946
          11       0.63      0.17      0.27      2766
          12       0.89      0.14      0.24      2482
          13       0.81      0.45      0.58      2471
          14       0.64      0.05      0.08      2351
          15       0.74      0.20      0.31      1960
          16       0.84      0.03      0.06      1755
          17       0.74    