Get and explore the data

The "twitter_sentiments.csv" dataset was obtained from www.analyticsvidhya.com/blog/2018/07/hands-on-sentiment-analysis-dataset-python/

In [1]:
import joblib
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Load the labelled tweets. The path is relative, so it is resolved against
# the current working directory of the kernel.
csv_path = 'twitter_sentiments.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
...,...,...,...
31957,31958,0,ate @user isz that youuu?ðððððð...
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,..."


How many tweets are there in each class (hate speech vs. non-hate speech)?

Prepare the data

In [2]:
# Hold out 20% of the rows for evaluation. Stratifying on 'label' keeps the
# class proportions the same in both splits, and the fixed random_state makes
# the split repeatable.
train_set, test_set = train_test_split(
    df,
    test_size=0.2,
    random_state=21,
    stratify=df['label'],
)

# Report the (rows, columns) size of each split.
for split_name, split in (('train_set', train_set), ('test_set', test_set)):
    print(f'{split_name}.shape =', split.shape)

train_set.shape = (25569, 3)
test_set.shape = (6393, 3)


In [3]:
# Display the training split produced by train_test_split (the rendered
# table is truncated for long frames).
train_set

Unnamed: 0,id,label,tweet
11221,11222,0,i am trusting. #i_am #positive #affirmation
8977,8978,0,trump vs clinton...smdh...america really is a ...
26800,26801,0,my ð&amp;ð go out 2the victims &amp;fami...
28522,28523,0,#rainbowrowell bull up: you will dominate yo...
5811,5812,0,@user @user just ordered two spf15 with bronz...
...,...,...,...
28230,28231,0,@user @user how would you get trapped in a gla...
13957,13958,0,happy father's day! #dad #father'sday
16417,16418,0,@user i can take you right across this contin...
7185,7186,0,our book to accompany exhibition. get your own...


In [4]:
# Display the held-out test split produced by train_test_split (the rendered
# table is truncated for long frames).
test_set

Unnamed: 0,id,label,tweet
11420,11421,0,13 days!!! #love #life #reunited #countdown
12226,12227,0,"@user ðpathetic, selfish &amp; disrespectfu..."
22901,22902,0,"thomas always says i live in a dream world, no..."
16449,16450,1,@user #allahsoil one infamous maneuver has aff...
2769,2770,0,#fashion it is a true #fact
...,...,...,...
3652,3653,0,#makaveli #day happy #bday #pac performi...
25032,25033,0,we love you back ð!
30787,30788,0,smile because every day is a good day ! #pimpj...
20390,20391,0,melody #instadog #myworld #park #dogsofinst...


In [5]:
# TF-IDF encoder for the tweets: text is lowercased, English stop words are
# removed, and the vocabulary is capped at the 1000 most frequent terms.
# 'english' selects scikit-learn's built-in ENGLISH_STOP_WORDS set; passing the
# string instead of list(ENGLISH_STOP_WORDS) keeps the estimator's repr short
# and its pickled form independent of set iteration order.
tfidf_vectorizer = TfidfVectorizer(lowercase=True, max_features=1000, stop_words='english')

tfidf_vectorizer

In [6]:
# Learn the vocabulary and IDF weights from the training tweets only, and
# encode those same tweets as a sparse TF-IDF matrix in a single call.
train_tweets = train_set['tweet']
train_tfidf = tfidf_vectorizer.fit_transform(train_tweets)

train_tfidf

<25569x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 105309 stored elements in Compressed Sparse Row format>

In [7]:
# Encode the test tweets with the vectorizer already fitted on the training
# split (transform only, no refit).
test_tweets = test_set['tweet']
test_tfidf = tfidf_vectorizer.transform(test_tweets)

test_tfidf

<6393x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 26118 stored elements in Compressed Sparse Row format>

Fit the model

In [8]:
# Train a logistic-regression classifier (library defaults) on the TF-IDF
# features of the training split. fit() returns the estimator itself, so the
# call can be chained onto the constructor.
train_labels = train_set['label']
logistic_reg = LogisticRegression().fit(train_tfidf, train_labels)
logistic_reg

In [9]:
# Predicted class label for every training tweet.
train_pred = logistic_reg.predict(X=train_tfidf)

train_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [10]:
# Predicted class label for every held-out test tweet.
test_pred = logistic_reg.predict(X=test_tfidf)

test_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [12]:
# Sanity check: there should be one prediction per row of each split.
print(f'train_pred.shape = {train_pred.shape}')
print(f'test_pred.shape = {test_pred.shape}')

train_pred.shape = (25569,)
test_pred.shape = (6393,)


In [14]:
# Score the predictions against the ground-truth labels of each split.
# f1_score is called with its defaults, which per the scikit-learn docs means
# binary F1 computed on the positive class (label 1).
train_f1 = f1_score(train_set['label'], train_pred)
test_f1 = f1_score(test_set['label'], test_pred)

print('F1 score between ground-truth labels and predictions for train set =', train_f1)
print('F1 score between ground-truth labels and predictions for test set =', test_f1)

F1 score between ground-truth labels and predictions for train set = 0.4865731462925852
F1 score between ground-truth labels and predictions for test set = 0.45499181669394434


Build the pipeline

In [19]:
# Bundle vectorization and classification into one estimator so that raw tweet
# text can be passed straight to fit()/predict().
# 'english' selects scikit-learn's built-in ENGLISH_STOP_WORDS set; passing the
# string instead of list(ENGLISH_STOP_WORDS) keeps the pipeline's repr short
# and its saved (pickled) form independent of set iteration order.
full_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(lowercase=True, max_features=1000, stop_words='english')),
    ('model', LogisticRegression())
])

full_pipeline

In [20]:
# Fit both pipeline steps on the raw training tweets and their labels.
full_pipeline.fit(X=train_set['tweet'], y=train_set['label'])

In [21]:
# Smoke-test the fitted pipeline on a single unseen sentence. The input must
# be a list of documents, hence the one-element list.
sample_text = ["Virat Kohli, AB de Villiers set to auction their 'Green Day' kits from 2016 IPL match to raise funds"]

full_pipeline.predict(X=sample_text)

array([0], dtype=int64)

Save the pipeline

In [23]:
# Persist the fitted pipeline (vectorizer + classifier) to disk so it can be
# reloaded later without retraining. dump() returns the list of files written.
joblib.dump(full_pipeline, filename='hate speech classification.joblib')

['hate speech classification.joblib']

Use the saved pipeline

In [24]:
# Restore the pipeline written by joblib.dump above.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that come from a trusted source.
save_full_pipeline = joblib.load(filename='hate speech classification.joblib')

save_full_pipeline

In [25]:
# The reloaded pipeline should give the same prediction for the sample text
# as the in-memory pipeline it was saved from.
save_full_pipeline.predict(X=sample_text)

array([0], dtype=int64)