In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
all_tweets = pd.read_csv("data/emojis.csv")

import re
from emot.emo_unicode import UNICODE_EMO, EMOTICONS

emoji_counts = all_tweets['emoji'].value_counts().head(20)
emoji_counts = emoji_counts.to_frame()
emoji_list = np.array(emoji_counts.index)

df = all_tweets.loc[all_tweets['emoji'].isin(emoji_list)]
pattern = r'(http://[^"\s]+)|(@\w+)|(:)|([^\w\d\s\.\?\!])'

df.loc[:, "text"] = df.text.str.replace(pattern, "").values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [3]:
actual_emoji = df['emoji'].value_counts().index.values
label_emoji_mapping = dict([(label, emoji) for label, emoji in zip(range(20), actual_emoji)])
emoji_label_mapping = dict([(emoji, label) for label, emoji in label_emoji_mapping.items()])

In [4]:
from sklearn.model_selection import train_test_split


train_sentences, test_sentences, train_emojis, test_emojis = train_test_split(df['text'], df['emoji'], 
                                                                              stratify = df['emoji'], random_state = 3011)

# vectorization

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

VOCAB_SIZE = 10000 # to prevent MemoryError
vectorizer = TfidfVectorizer(stop_words = 'english', max_features = VOCAB_SIZE)

X_train = vectorizer.fit_transform(train_sentences)
X_test = vectorizer.transform(test_sentences)

print(X_train.shape, X_test.shape)

(321638, 10000) (107213, 10000)


# splitting

In [6]:
y_train, y_test = train_emojis.replace(emoji_label_mapping), test_emojis.replace(emoji_label_mapping)

# SVM model

In [7]:
import time
from sklearn import svm
from sklearn.metrics import classification_report

In [8]:
# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(X_train, y_train)
t1 = time.time()
prediction_linear = classifier_linear.predict(X_test)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1



# Accuracy report

In [9]:
from sklearn.metrics import classification_report
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(y_test, prediction_linear)

Training time: 10556.053780s; Prediction time: 2417.508622s


  'precision', 'predicted', average, warn_for)


In [10]:
print(report)

              precision    recall  f1-score   support

           0       0.29      1.00      0.45     31206
           1       0.00      0.00      0.00     10805
           2       0.00      0.00      0.00     10142
           3       0.00      0.00      0.00      8929
           4       0.00      0.00      0.00      5019
           5       0.00      0.00      0.00      4491
           6       0.00      0.00      0.00      4058
           7       0.00      0.00      0.00      3863
           8       0.00      0.00      0.00      3855
           9       0.00      0.00      0.00      3006
          10       0.00      0.00      0.00      2946
          11       0.00      0.00      0.00      2766
          12       0.00      0.00      0.00      2482
          13       0.00      0.00      0.00      2471
          14       0.00      0.00      0.00      2351
          15       0.00      0.00      0.00      1960
          16       0.00      0.00      0.00      1755
          17       0.00    

In [24]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

In [21]:
clf = DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

In [22]:
y_pred = clf.predict(X_test)

In [25]:
print("Accuracy:",accuracy_score(y_test, y_pred))

Accuracy: 0.4738604460280003


In [31]:
a = vectorizer.fit_transform(["I'm very happy today"])
print(clf.predict(a))

ValueError: Number of features of the model must match the input. Model n_features is 10000 and input n_features is 2 

In [27]:
from sklearn.ensemble import RandomForestClassifier

text_classifier = RandomForestClassifier(n_estimators=200, random_state=0)
text_classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [28]:
xpred = text_classifier.predict(X_test)

In [29]:
print("Accuracy:",accuracy_score(y_test, xpred))

Accuracy: 0.5181554475669928
