In [1]:
import pandas as pd

In [2]:
csv_file = pd.read_csv("~/Downloads/labeled_data.csv", index_col=0)

In [3]:
csv_file.head()

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [44]:
pre_data = csv_file[["class", "tweet"]]
pre_data["class"] = pre_data['class'].map({0: 0, 1: 0, 2:1})
pre_data.head()
len(pre_data[(pre_data['class'] == 1)])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


4163

In [45]:
len(pre_data[(pre_data['class'] == 0)])

20620

In [5]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data['tweet'], data['class'], test_size=0.3, random_state=1)
x_train.shape

(17348,)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(csv_file['tweet'])
X_train_counts.shape

(24783, 35852)

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(24783, 35852)

In [8]:
# Machine Learning
# Training Naive Bayes (NB) classifier on training data.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, csv_file['class'])

In [9]:
# Building a pipeline: We can write less code and do all of the above, by building a pipeline as follows:
# The names ‘vect’ , ‘tfidf’ and ‘clf’ are arbitrary but will be used later.
# We will be using the 'text_clf' going forward.
from sklearn.pipeline import Pipeline

text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

text_clf = text_clf.fit(x_train, y_train)

In [10]:
import numpy as np
predicted = text_clf.predict(x_test)
np.mean(predicted == y_test)

0.7891055817081372

In [11]:
# Training Support Vector Machines - SVM and calculating its performance

from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, max_iter=10, random_state=42))])

text_clf_svm = text_clf_svm.fit(x_train, y_train)
predicted_svm = text_clf_svm.predict(x_test)
np.mean(predicted_svm == y_test)

0.7901815736381977

In [12]:
# Grid Search
# Here, we are creating a list of parameters for which we would like to do performance tuning. 
# All the parameters name start with the classifier name (remember the arbitrary name we gave). 
# E.g. vect__ngram_range; here we are telling to use unigram and bigrams and choose the one which is optimal.

from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (1e-2, 1e-3)}

In [13]:
# Next, we create an instance of the grid search by passing the classifier, parameters 
# and n_jobs=-1 which tells to use multiple cores from user machine.

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, cv=5)
gs_clf = gs_clf.fit(x_train, y_train)

In [14]:
# To see the best mean score and the params, run the following code

gs_clf.best_params_
gs_clf.best_score_

# Output for above should be: The accuracy has now increased to ~90.6% for the NB classifier (not so naive anymore! 😄)
# and the corresponding parameters are {‘clf__alpha’: 0.01, ‘tfidf__use_idf’: True, ‘vect__ngram_range’: (1, 2)}.

0.8519137652755361

In [16]:
predicted_gs_clf = gs_clf.predict(x_test)

np.mean(predicted_gs_clf == y_test)

0.8555480833893746

In [19]:
# Similarly doing grid search for SVM
from sklearn.model_selection import GridSearchCV
parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False),'clf-svm__alpha': (1e-2, 1e-3)}

gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1, cv=5)
gs_clf_svm = gs_clf_svm.fit(x_train, y_train)


gs_clf_svm.best_score_
gs_clf_svm.best_params_



{'clf-svm__alpha': 0.001, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}

In [20]:
gs_clf_svm.best_score_

0.8542195065713627

In [21]:
predicted_gs_clf_svm = gs_clf_svm.predict(x_test)

np.mean(predicted_gs_clf_svm == y_test)

0.8548755884330868

In [22]:
# NLTK
# Removing stop words
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), 
                     ('clf', MultinomialNB())])

In [24]:
# Stemming Code

import nltk

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])
    
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer()), 
                             ('mnb', MultinomialNB(fit_prior=False))])

text_mnb_stemmed = text_mnb_stemmed.fit(x_train, y_train)

predicted_mnb_stemmed = text_mnb_stemmed.predict(x_test)

np.mean(predicted_mnb_stemmed == y_test)

0.852320107599193

In [25]:
text_svm_mnb_stemmed = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer()), 
                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, max_iter=10, random_state=42))])

text_svm_mnb_stemmed = text_svm_mnb_stemmed.fit(x_train, y_train)

predicted_svm_mnb_stemmed = text_svm_mnb_stemmed.predict(x_test)

np.mean(predicted_svm_mnb_stemmed == y_test)

0.8181573638197713

In [46]:
data[(data['class']==2)]

Unnamed: 0,class,tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
40,2,""" momma said no pussy cats inside my doghouse """
63,2,"""@Addicted2Guys: -SimplyAddictedToGuys http://..."
66,2,"""@AllAboutManFeet: http://t.co/3gzUpfuMev"" woo..."
67,2,"""@Allyhaaaaa: Lemmie eat a Oreo &amp; do these..."
...,...,...
25249,2,yaya ho.. cute avi tho RT @ViVaLa_Ari I had no...
25250,2,yea so about @N_tel 's new friend.. all my fri...
25280,2,"you know what they say, the early bird gets th..."
25292,2,"you've gone and broke the wrong heart baby, an..."


In [27]:
len(data)

24783

In [47]:
20620 - 4163

16457

In [41]:
gs_clf_svm.predict(["nigger"])

array([1])