In [1]:
import pandas as pd
import numpy as mp
import os
import re

In [2]:
tweets = pd.read_csv('TwitterHate.csv')
tweets.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [3]:
tweets.label.value_counts(normalize=True)

0    0.929854
1    0.070146
Name: label, dtype: float64

In [4]:
tweets.tweet.sample().values[0]

'#cotd   polar bear climb racing: angry polar bear climb racing, the polar bear living in cold places looking '

In [5]:
tweets0 = tweets.tweet.values
len(tweets0)

31962

In [6]:
tweets0[:5]

array([' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
       "@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
       '  bihday your majesty',
       '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
       ' factsguide: society now    #motivation'], dtype=object)

# Cleanup

In [7]:
tweets_lower = [twt.lower() for twt in tweets0]

In [8]:
tweets_lower[:5]

[' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday your majesty',
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
 ' factsguide: society now    #motivation']

In [9]:
re.sub("@\w+","", "@Rahim this course rocks! http://rahimbaig.com/ai")

' this course rocks! http://rahimbaig.com/ai'

In [10]:
tweets_nouser = [re.sub("@\w+","", twt) for twt in tweets_lower]

In [11]:
tweets_nouser[:5]

['  when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "  thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday your majesty',
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
 ' factsguide: society now    #motivation']

In [12]:
# Removing URLs

re.sub("\w+://\S+","", "@Rahim this course rocks! http://rahimbaig.com/ai")

'@Rahim this course rocks! '

In [13]:
tweets_nourl = [re.sub("\w+://\S+","", twt) for twt in tweets_nouser]
tweets_nourl[:5]

['  when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "  thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday your majesty',
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
 ' factsguide: society now    #motivation']

In [14]:
# tokenize using TweetTokenizer

from nltk.tokenize import TweetTokenizer
?TweetTokenizer()

Object `TweetTokenizer()` not found.


In [15]:
tkn = TweetTokenizer()
print(tkn.tokenize(tweets_nourl[0]))

['when', 'a', 'father', 'is', 'dysfunctional', 'and', 'is', 'so', 'selfish', 'he', 'drags', 'his', 'kids', 'into', 'his', 'dysfunction', '.', '#run']


In [16]:
tweet_token = [tkn.tokenize(sent) for sent in tweets_nourl]
print(tweet_token[0])

['when', 'a', 'father', 'is', 'dysfunctional', 'and', 'is', 'so', 'selfish', 'he', 'drags', 'his', 'kids', 'into', 'his', 'dysfunction', '.', '#run']


In [17]:
# Remove stopwords

from nltk.corpus import stopwords
from string import punctuation

stop_nltk = stopwords.words("english")
stop_punct = list(punctuation)

stop_punct.extend(['...','``',"''",".."])

stop_context = ['rt', 'amp']

stop_final = stop_nltk + stop_punct + stop_context

## Extra cleaup, remving terms with length of 1

In [18]:
def del_stop(sent):
    return [re.sub("#","",term) for term in sent if ((term not in stop_final) & (len(term)>1))]

del_stop(tweet_token[4])

['factsguide', 'society', 'motivation']

In [19]:
tweets_clean = [del_stop(tweet) for tweet in tweet_token]

## Check out top terms in Tweet

In [20]:
from collections import Counter

term_list = []
for tweet in tweets_clean:
    term_list.extend(tweet)
    
    
res = Counter(term_list)
res.most_common(10)

[('love', 2748),
 ('day', 2276),
 ('happy', 1684),
 ('time', 1131),
 ('life', 1118),
 ('like', 1047),
 ("i'm", 1018),
 ('today', 1013),
 ('new', 994),
 ('thankful', 946)]

# Data formatting for predictive modelling

In [21]:
tweets_clean[0]

['father', 'dysfunctional', 'selfish', 'drags', 'kids', 'dysfunction', 'run']

In [22]:
tweets_clean = [" ".join(tweet) for tweet in tweets_clean]
tweets_clean[0]

'father dysfunctional selfish drags kids dysfunction run'

In [23]:
# train, test, split

X = tweets_clean
y = tweets.label.values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=42)

## Using TF-IDF

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features = 5000)

len(X_train), len(X_test)

(22373, 9589)

In [25]:
X_train_bow = vectorizer.fit_transform(X_train)

X_test_bow = vectorizer.transform(X_test)

X_train_bow.shape, X_test_bow.shape

((22373, 5000), (9589, 5000))

## Model Building: Ordinary Logistic Regression

In [26]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

lr.fit(X_train_bow, y_train)

LogisticRegression()

In [27]:
y_train_pred = lr.predict(X_train_bow)
y_test_pred = lr.predict(X_test_bow)

## Model evaluation: Accuracy, recall, and f_1 score

In [28]:
from sklearn.metrics import accuracy_score, classification_report
accuracy_score(y_train, y_train_pred)

0.9560184150538595

In [29]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98     20815
           1       0.96      0.39      0.55      1558

    accuracy                           0.96     22373
   macro avg       0.96      0.69      0.76     22373
weighted avg       0.96      0.96      0.95     22373



## Adjust the class imbalance

In [30]:
lr = LogisticRegression(class_weight="balanced")

lr.fit(X_train_bow, y_train)

LogisticRegression(class_weight='balanced')

In [31]:
y_train_pred = lr.predict(X_train_bow)
y_test_pred = lr.predict(X_test_bow)

In [32]:
accuracy_score(y_train, y_train_pred)

0.9527108568363652

In [33]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97     20815
           1       0.60      0.97      0.74      1558

    accuracy                           0.95     22373
   macro avg       0.80      0.96      0.86     22373
weighted avg       0.97      0.95      0.96     22373



## Regularization and Hyperparameter tuning

In [34]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

param_grid = {
    'C': [0.01,0.1,1,10,100],
    'penalty': ["l1","l2"]
}

In [35]:
classifier_lr = LogisticRegression(class_weight="balanced")

grid_search = GridSearchCV(estimator = classifier_lr, param_grid = param_grid, 
                          cv = StratifiedKFold(4), n_jobs = -1, verbose = 1, scoring = "recall" )

In [36]:
grid_search.fit(X_train_bow, y_train)

Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    2.7s finished


GridSearchCV(cv=StratifiedKFold(n_splits=4, random_state=None, shuffle=False),
             estimator=LogisticRegression(class_weight='balanced'), n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']},
             scoring='recall', verbose=1)

In [37]:
grid_search.best_estimator_

LogisticRegression(C=1, class_weight='balanced')

## Predict and evaluate using the best estimator

In [38]:
y_test_pred = grid_search.best_estimator_.predict(X_test_bow)

y_train_pred = grid_search.best_estimator_.predict(X_train_bow)

In [39]:
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.98      0.94      0.96      8905
           1       0.49      0.77      0.60       684

    accuracy                           0.93      9589
   macro avg       0.73      0.85      0.78      9589
weighted avg       0.95      0.93      0.93      9589

