In [1]:
import pandas as pd
import nltk
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from string import punctuation
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [2]:
data = pd.read_csv('TwitterHate.csv')

In [3]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
tweets_list = data['tweet'].tolist()

In [5]:
tweets_list[1:5]

["@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday your majesty',
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
 ' factsguide: society now    #motivation']

In [6]:
tweet_lower = [tweet.lower() for tweet in tweets_list]

In [7]:
tweet_lower[1]

"@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked"

In [8]:
##removing url, user handles, #

tweet_user_rem = [re.sub("@\w+\s","", tweet) for tweet in tweet_lower]
tweet_hash_rem = [re.sub("#+","", tweet) for tweet in tweet_user_rem]
tweet_url_rem = [re.sub("\w://\S","", tweet) for tweet in tweet_hash_rem]

In [9]:
tweet_url_rem[1:3]

["thanks for lyft credit i can't use cause they don't offer wheelchair vans in pdx.    disapointed getthanked",
 '  bihday your majesty']

In [10]:
token = TweetTokenizer()

In [11]:
tokenized_tweet = [token.tokenize(tweet) for tweet in tweet_url_rem]

In [12]:
tokenized_tweet[0:2]

[['when',
  'a',
  'father',
  'is',
  'dysfunctional',
  'and',
  'is',
  'so',
  'selfish',
  'he',
  'drags',
  'his',
  'kids',
  'into',
  'his',
  'dysfunction',
  '.',
  'run'],
 ['thanks',
  'for',
  'lyft',
  'credit',
  'i',
  "can't",
  'use',
  'cause',
  'they',
  "don't",
  'offer',
  'wheelchair',
  'vans',
  'in',
  'pdx',
  '.',
  'disapointed',
  'getthanked']]

In [13]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [14]:
#removing stopwords and punctuation
stop_words = stopwords.words('english')
punct = list(punctuation)
stop_context = ['rt', 'amp', '...', '```', ]
stop_terms = stop_words + punct + stop_context
words_clean1 = [[word for word in tokens if word not in stop_terms] for tokens in tokenized_tweet]

In [15]:
words_clean1[0:2]

[['father', 'dysfunctional', 'selfish', 'drags', 'kids', 'dysfunction', 'run'],
 ['thanks',
  'lyft',
  'credit',
  "can't",
  'use',
  'cause',
  'offer',
  'wheelchair',
  'vans',
  'pdx',
  'disapointed',
  'getthanked']]

In [16]:
#Extra cleanup by removing terms with a length of 1.
words_clean = [[word for word in tokens if len(word) > 1] for tokens in words_clean1]


In [17]:
words_clean[0:2]

[['father', 'dysfunctional', 'selfish', 'drags', 'kids', 'dysfunction', 'run'],
 ['thanks',
  'lyft',
  'credit',
  "can't",
  'use',
  'cause',
  'offer',
  'wheelchair',
  'vans',
  'pdx',
  'disapointed',
  'getthanked']]

In [18]:
#Check out the top terms in the tweets:
#First, get all the tokenized terms into one large list.
word_list = []
for words in words_clean:
    word_list.extend(words)
word_list[0:2]
#Use the counter and find the 10 most common terms.
counter = Counter(word_list)
counter.most_common(10)


[('love', 2680),
 ('day', 2271),
 ('happy', 1683),
 ('time', 1128),
 ('life', 1103),
 ('like', 1097),
 ("i'm", 1018),
 ('today', 1008),
 ('new', 988),
 ('positive', 929)]

In [19]:
#Data formatting for predictive modeling:
#Join the tokens back to form strings. This will be required for the vectorizers.
cleaned_tweets = [" ".join(tweets) for tweets in words_clean]
#Assign x and y.
X = cleaned_tweets
y = data['label'].tolist()
#Perform train_test_split using sklearn.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [20]:
#use TF-IDF values for the terms as a feature to get into a vector space model.
#Instantiate with a maximum of 5000 terms in your vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)
print(len(X_train), len(X_test))
#Fit and apply on the train set.
X_train_vector = vectorizer.fit_transform(X_train)
#Apply on the test set.
X_test_vector = vectorizer.fit_transform(X_test)
print('vector dimensions X.train :', X_train_vector.shape, 'X.test :', X_test_vector.shape)

21414 10548
vector dimensions X.train : (21414, 5000) X.test : (10548, 5000)


In [21]:
# Model building: Ordinary Logistic Regression
#Fit into  the train data.
logreg = LogisticRegression()
logregmodel = logreg.fit(X_train_vector, y_train)
#Make predictions for the train and the test set.
pred = logregmodel.predict(X_test_vector)



In [22]:
np.unique(pred)

array([0, 1])

In [23]:
print('accuracy_score', accuracy_score(y_test, pred))

accuracy_score 0.927284793325749


In [24]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96      9806
           1       0.00      0.00      0.00       742

    accuracy                           0.93     10548
   macro avg       0.46      0.50      0.48     10548
weighted avg       0.86      0.93      0.89     10548



In [25]:
#fairly good results, we can also go for class imbalence adjustment
logreg_bal = LogisticRegression(class_weight = 'balanced')
logregmodel_bal = logreg_bal.fit(X_train_vector, y_train)
#Make predictions for the train and the test set.
pred = logregmodel_bal.predict(X_test_vector)

In [26]:
print('accuracy_score', accuracy_score(y_test, pred))

accuracy_score 0.8310580204778157


In [27]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.93      0.89      0.91      9806
           1       0.05      0.08      0.06       742

    accuracy                           0.83     10548
   macro avg       0.49      0.48      0.48     10548
weighted avg       0.87      0.83      0.85     10548



#### Not good results when using the imbalence adjustments

In [28]:
#Regularization and Hyperparameter tuning:

#Import GridSearch and StratifiedKFold because of class imbalance.
#Provide the parameter grid to choose for ‘C’ and ‘penalty’ parameters.
#Use a balanced class weight while instantiating the logistic regression

param_grid = {'C': [0.01,0.1,1,10,100], 'penalty': ["l1","l2"]}
logreg_classifier = LogisticRegression(class_weight = 'balanced')
grid_search = GridSearchCV(estimator = logreg_classifier, param_grid = param_grid, cv = StratifiedKFold(4),
                           n_jobs=-1, scoring = 'recall')


In [29]:
grid_search.fit(X_train_vector, y_train)



GridSearchCV(cv=StratifiedKFold(n_splits=4, random_state=None, shuffle=False),
             error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight='balanced',
                                          dual=False, fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='recall', verbose=0)

In [30]:
grid_search.best_estimator_

LogisticRegression(C=1, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [31]:
pred2 = grid_search.best_estimator_.predict(X_test_vector)

In [32]:
print('Accuracy_score', accuracy_score(y_test, pred2))

Accuracy_score 0.8310580204778157


In [33]:
print(classification_report(y_test, pred2))

              precision    recall  f1-score   support

           0       0.93      0.89      0.91      9806
           1       0.05      0.08      0.06       742

    accuracy                           0.83     10548
   macro avg       0.49      0.48      0.48     10548
weighted avg       0.87      0.83      0.85     10548

