1. Understand the Problem Statement
2. Tweets Preprocessing and Cleaning
  1. Data Inspection
  2. Data Cleaning
3. Story Generation and Visualization from Tweets
4. Extracting Features from Cleaned Tweets
  1. Bag-of-Words
  2. TF-IDF
  3. Word Embeddings
5. Model Building: Sentiment Analysis
  1. Logistic Regression
  2. Support Vector Machine
  3. RandomForest
  4. XGBoost
6. Model Fine-tuning
7. Summary

In [None]:
# import libraries
import re
import nltk
import string
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option('display.max_colwidth', 200)
warnings.filterwarnings('ignore', category = DeprecationWarning)

%matplotlib inline


In [None]:
# Read train.csv (has the `label` column used below) and test.csv from the ../input directory.
train = pd.read_csv('../input/twitter-sentiment-analysis-hatred-speech/train.csv')
test = pd.read_csv('../input/twitter-sentiment-analysis-hatred-speech/test.csv')

## Data Inspection

In [None]:
# First few training rows whose label is 0.
train[train['label'] == 0].head()

In [None]:
# First few training rows whose label is 1.
train[train['label'] == 1].head()

In [None]:
# (rows, columns) of the train and test frames.
train.shape, test.shape

In [None]:
# Number of training rows per label value (class balance).
train.label.value_counts()

In [None]:
# Compare the character-length distribution of train and test tweets.
length_train = train['tweet'].str.len()
length_test = test['tweet'].str.len()
# alpha < 1 keeps both histograms visible where they overlap.
plt.hist(length_train, bins=20, alpha=0.6, label='Train_tweets')
plt.hist(length_test, bins=20, alpha=0.6, label='test_tweets')
plt.xlabel('Tweet length (characters)')
plt.ylabel('Number of tweets')
plt.legend()
plt.show()

In [None]:
# Combine the data (train + test) so both get identical cleaning and features.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df = pd.concat([train, test], ignore_index=True)
df.shape

## Data Cleaning

In [None]:
def remove_pattern(input_txt, pattern):
    """Remove every substring of ``input_txt`` that matches the regex ``pattern``.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression describing the spans to delete.

    Returns
    -------
    str
        ``input_txt`` with all matches of ``pattern`` removed.
    """
    # Substitute with the pattern itself. Re-using each matched string as a new
    # regex (as the previous implementation did) misreads metacharacters in the
    # match and also strips the match where it is a prefix of a longer one
    # (e.g. '@user' inside '@user2').
    return re.sub(pattern, "", input_txt)

### Removing twitter Handles  
Note that we pass “@[\w]*” as the pattern to the remove_pattern function. It is a regular expression that matches ‘@’ followed by any run of word characters, i.e. a Twitter handle.

In [None]:
# Strip Twitter handles. The pattern is a raw string so that `\w` is not
# treated as an (invalid) string escape sequence.
df['tidy_tweet'] = np.vectorize(remove_pattern)(df['tweet'], r"@[\w]*")
df.head()

### Removing Punctuations, Numbers, and Special Characters  
Here we will replace everything except characters and hashtags with spaces. The regular expression “[^a-zA-Z#]” means anything except alphabets and ‘#’.

In [None]:
# Replace everything except letters and '#' with a space.
# regex=True is required: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave the text unchanged.
df['tidy_tweet'] = df['tidy_tweet'].str.replace("[^a-zA-Z#]", " ", regex=True)
df.head()

### Removing short words  
We have to be a little careful here in selecting the length of the words which we want to remove. So, I have decided to remove all the words having length 3 or less. For example, terms like “hmm”, “oh” are of very little use. It is better to get rid of them.


In [None]:
# Keep only words of four or more characters (drops words of length 3 or less).
df['tidy_tweet'] = df['tidy_tweet'].apply(
    lambda tweet: " ".join(word for word in tweet.split() if len(word) >= 4)
)
df.head()

### Text Normalization  
Here we will use nltk’s PorterStemmer() function to normalize the tweets. But before that we will have to tokenize the tweets. Tokens are individual terms or words, and tokenization is the process of splitting a string of text into tokens.

In [None]:
# Split each cleaned tweet on whitespace into a list of tokens.
tokenized_tweet = df['tidy_tweet'].map(lambda tweet: tweet.split())
tokenized_tweet.head()

In [None]:
# NOTE(review): the wildcard import hides where PorterStemmer comes from;
# consider `from nltk.stem.porter import PorterStemmer` once no other name from it is needed.
from nltk.stem.porter import *
# Single stemmer instance reused for every token below.
stemmer = PorterStemmer()

In [None]:
# Stem every token of every tweet with the Porter stemmer.
tokenized_tweet = tokenized_tweet.apply(
    lambda tokens: list(map(stemmer.stem, tokens))
)

In [None]:
# Stitch the stemmed tokens of each tweet back into one space-separated string.
# (Plain str.join is used; no detokenizer is involved.) A single vectorised
# apply replaces the per-element Series assignment loop.
tokenized_tweet = tokenized_tweet.apply(' '.join)
df['tidy_tweet'] = tokenized_tweet

## 3. Story Generation and Visualization from Tweets

### A) Understanding the common words used in the tweets: WordCloud  
A wordcloud is a visualization wherein the most frequent words appear in large size and the less frequent words appear in smaller sizes.

In [None]:
# Preview the combined frame, including the cleaned `tidy_tweet` column.
df.head()

In [None]:
# One long string holding every cleaned tweet, separated by single spaces.
all_words = ' '.join(df['tidy_tweet'])

In [None]:
# Show only a short prefix; displaying the whole corpus string floods the output.
all_words[:500]

In [None]:
from wordcloud import WordCloud
# Word cloud over all cleaned tweets; random_state fixes the layout between runs.
wordcloud = WordCloud(width = 800, height = 500, random_state = 21, max_font_size = 110).generate(all_words)
plt.figure(figsize=(10,7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # the image has no meaningful axes
plt.show()

### B) Words in non racist tweets

In [None]:
# Word cloud restricted to tweets whose label is 0.
normal_words = " ".join([text for text in df['tidy_tweet'][df['label'] == 0]])
wordcloud = WordCloud(width = 800, height = 500, random_state = 21, 
                      max_font_size = 110).generate(normal_words)
plt.figure(figsize=(10,7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

### C) Words in racist tweets

In [None]:
# Word cloud restricted to tweets whose label is 1.
negative_words = " ".join([text for text in df['tidy_tweet'][df['label'] == 1]])
wordcloud = WordCloud(width = 800, height = 500, random_state = 21, 
                      max_font_size = 110).generate(negative_words)
plt.figure(figsize=(10,7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

### D) Understanding the impact of Hashtags on tweets sentiment

In [None]:
# function to collect hashtags
def hashtag_extract(x):
    """Return one list per text in ``x`` holding its hashtag words.

    Each hashtag is returned without the leading '#'; a text with no
    hashtags contributes an empty list.
    """
    return [re.findall(r"#(\w+)", text) for text in x]

In [None]:
# Extract hashtags from the tweets labelled 0 (one list of hashtags per tweet).
HT_regular = hashtag_extract(df['tidy_tweet'][df['label'] == 0])


In [None]:
# Extract hashtags from the tweets labelled 1 (one list of hashtags per tweet).
HT_negative = hashtag_extract(df['tidy_tweet'][df['label'] ==1])

In [None]:
# Flatten the per-tweet hashtag lists into one flat list each.
# A comprehension is linear; sum(list_of_lists, []) copies the accumulator on
# every step and is quadratic in the number of tweets.
HT_regular = [tag for tags in HT_regular for tag in tags]
HT_negative = [tag for tags in HT_negative for tag in tags]

In [None]:
# Count how often each hashtag occurs in HT_regular and put the counts in a
# two-column frame (the plot itself is drawn in the next cell).
a = nltk.FreqDist(HT_regular)
d = pd.DataFrame({'Hashtag': list(a.keys()),
                  'Count': list(a.values())})

**Non-Racist Tweets**

In [None]:
# Keep the 20 most frequent hashtags and draw them as a bar chart.
d = d.nlargest(20, 'Count')
plt.figure(figsize=(16, 5))
ax = sns.barplot(x='Hashtag', y='Count', data=d)
ax.set_ylabel('Count')
plt.show()

**Racist Tweets**

In [None]:
# Hashtag frequencies for HT_negative, reduced to the 20 most frequent, as a bar chart.
b = nltk.FreqDist(HT_negative)
e = pd.DataFrame({
    'Hashtag': list(b.keys()),
    'Count': list(b.values()),
})
e = e.nlargest(20, 'Count')
plt.figure(figsize=(16, 5))
ax = sns.barplot(x='Hashtag', y='Count', data=e)
ax.set_ylabel('Count')
plt.show()

## 4. Extracting Features from Cleaned Tweets

### 1. Bag-of-words Features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import gensim

In [None]:
# Bag-of-words counts over the cleaned tweets, capped at 1000 features.
bow_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=1000,
    min_df=2,
    max_df=0.90,
)
bow = bow_vectorizer.fit_transform(df['tidy_tweet'])
bow.shape

### 2. TF-IDF Features
TF-IDF works by penalising the common words by assigning them lower weights while giving importance to words which are rare in the entire corpus but appear in good numbers in few documents.

Let’s have a look at the important terms related to TF-IDF:

* TF = (Number of times term t appears in a document)/(Number of terms in the document)

* IDF = log(N/n), where, N is the number of documents and n is the number of documents a term t has appeared in.

* TF-IDF = TF*IDF

In [None]:
# TF-IDF weights over the cleaned tweets, with the same settings as the bag-of-words vectorizer.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
    min_df=2,
    max_df=0.90,
)
tfidf = tfidf_vectorizer.fit_transform(df['tidy_tweet'])
tfidf.shape

### 3. Word2Vec Features  
Word embeddings are the modern way of representing words as vectors. The objective of word embeddings is to redefine the high dimensional word features into low dimensional feature vectors by preserving the contextual similarity in the corpus. They are able to achieve tasks like King -man +woman = Queen, which is mind-blowing.   
The advantages of using word embeddings over BOW or TF-IDF are:

* Dimensionality reduction - significant reduction in the no. of features required to build a model.

* They capture the meanings of words, their semantic relationships, and the different types of contexts they are used in.


In [None]:
# Re-tokenize the cleaned (stemmed) tweets into lists of words for Word2Vec.
tokenized_tweet = df['tidy_tweet'].apply(lambda x: x.split())
# NOTE(review): `size` looks like the gensim 3.x keyword (gensim 4 names it
# `vector_size`) — confirm against the installed gensim version.
# The 200 here must match the vector length used by the feature cells below.
model_w2v = gensim.models.Word2Vec(tokenized_tweet, 
                                   size = 200, 
                                   window = 5, 
                                   min_count = 2, 
                                   sg = 1, 
                                   hs = 0, 
                                   negative = 10, 
                                   workers = 2, 
                                   seed = 34)
# NOTE(review): the corpus was already passed to the constructor above; this
# call presumably trains for a further 20 epochs on top of that — confirm it is intended.
model_w2v.train(tokenized_tweet, total_examples = len(df['tidy_tweet']), epochs =20)

In [None]:
# Sanity check: ask the trained model for the words most similar to 'dinner'.
model_w2v.wv.most_similar(positive = 'dinner')

We can see that our word2vec model does a good job of finding the most similar words for a given word. But how is it able to do so? That’s because it has learned vectors for every unique word in our data and it uses cosine similarity to find out the most similar vectors (words).

**Preparing Vectors for Tweets**  
Since our data contains tweets and not just words, we’ll have to figure out a way to use the word vectors from word2vec model to create vector representation for an entire tweet. There is a simple solution to this problem, we can simply take mean of all the word vectors present in the tweet. The length of the resultant vector will be the same, i.e. 200. We will repeat the same process for all the tweets in our data and obtain their vectors. Now we have 200 word2vec features for our data.

We will use the below function to create a vector for each tweet by taking the average of the vectors of the words present in the tweet.

In [None]:
def word_vector(tokens, size):
    """Return the mean Word2Vec vector of ``tokens`` as a ``(1, size)`` array.

    Tokens missing from the ``model_w2v`` vocabulary are skipped. If no token
    is in the vocabulary (or ``tokens`` is empty) an all-zero vector is returned.

    Parameters
    ----------
    tokens : iterable of str
        The words of one tweet.
    size : int
        Dimensionality of the word vectors held by ``model_w2v``.
    """
    vec = np.zeros((1, size))
    count = 0
    for word in tokens:
        try:
            # Look the vector up through `.wv`, the same access path used for
            # `most_similar` elsewhere in this notebook.
            vec += np.asarray(model_w2v.wv[word]).reshape((1, size))
        except KeyError:
            # Out-of-vocabulary token: contributes nothing to the mean.
            continue
        # Count only the tokens that were actually added. Without this the
        # division below never ran and the function returned a sum, not a mean.
        count += 1
    if count != 0:
        vec /= count
    return vec


In [None]:
# One 200-dimensional row per tweet, filled from word_vector().
n_tweets = len(tokenized_tweet)
wordvec_arrays = np.zeros((n_tweets, 200))
for row in range(n_tweets):
    tweet_tokens = tokenized_tweet[row]
    wordvec_arrays[row, :] = word_vector(tweet_tokens, 200)
wordvec_df = pd.DataFrame(wordvec_arrays)
wordvec_df.shape

Now we have 200 new features, whereas in Bag of Words and TF-IDF we had 1000 features.

### 4. Doc2Vec Embedding
 Doc2Vec model is an unsupervised algorithm to generate vectors for sentence/paragraphs/documents. This approach is an extension of the word2vec. The major difference between the two is that doc2vec provides an additional context which is unique for every document in the corpus. This additional context is nothing but another feature vector for the whole document. This document vector is trained along with the word vectors.

In [None]:
from tqdm import tqdm
tqdm.pandas(desc = 'progress-bar')
from gensim.models.doc2vec import LabeledSentence

To implement doc2vec, we have to labelise or tag each tokenised tweet with unique IDs. We can do so by using Gensim’s LabeledSentence() function.

In [None]:
def add_label(twt):
    """Wrap each tokenised tweet in a LabeledSentence tagged ``'tweet_<index>'``.

    ``twt`` must be iterable and expose a matching ``.index``; the result is a
    list with one LabeledSentence per tweet, in iteration order.
    """
    return [
        LabeledSentence(words, ['tweet_' + str(idx)])
        for idx, words in zip(twt.index, twt)
    ]


In [None]:
# Tag every tokenised tweet with its 'tweet_<index>' identifier.
labeled_tweets = add_label(tokenized_tweet)

In [None]:
# Inspect the first six tagged tweets.
labeled_tweets[:6]

In [None]:
# Now let's train a doc2vec model
model_d2v = gensim.models.Doc2Vec(dm=1,
                                  dm_mean = 1,
                                  size = 200,
                                  window = 5,
                                  negative = 7,
                                  # Was misspelled `min_counts`, which is not a
                                  # Doc2Vec parameter; `min_count` is the
                                  # vocabulary frequency cut-off.
                                  min_count = 5,
                                  workers = 3,
                                  alpha = 0.1,
                                  seed = 23)
# Build the vocabulary first (tqdm only adds a progress bar), then train.
model_d2v.build_vocab([i for i in tqdm(labeled_tweets)])
model_d2v.train(labeled_tweets, total_examples = len(df['tidy_tweet']), epochs = 15)

In [None]:
# Preparing doc2vec feature set: one 200-dimensional row per tweet.
docvec_arrays = np.zeros((len(tokenized_tweet), 200))
for i in range(len(df)):
    # NOTE(review): docvecs is indexed by integer position here although the tags
    # are 'tweet_<i>' strings — presumably position i is the i-th tagged tweet; verify.
    docvec_arrays[i, :] = model_d2v.docvecs[i].reshape((1, 200))
docvec_df = pd.DataFrame(docvec_arrays)
docvec_df.shape

## 5. Model Building: Sentiment Analysis

### 1. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

#### Bag-of-words features

In [None]:
# The combined matrix stacks the train rows first, then the test rows, so the
# boundary is len(train) (replaces the hard-coded row count 31962).
n_train = len(train)
train_bow = bow[:n_train, :]
test_bow = bow[n_train:, :]

# Hold out 30% of the labelled rows for validation; ytrain/yvalid are reused
# by every model below.
xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(train_bow, train['label'],
                                                          random_state=42,test_size=0.3)

In [None]:
# Fit logistic regression on the bag-of-words split and score the validation fold.
lreg = LogisticRegression()
lreg.fit(xtrain_bow, ytrain)
prediction = lreg.predict_proba(xvalid_bow)
# Predict class 1 when its probability is at least 0.3.
prediction_int = prediction[:, 1] >= 0.3
# np.int was removed in NumPy 1.24; the builtin int is the equivalent dtype.
prediction_int = prediction_int.astype(int)

f1_score(yvalid, prediction_int)

In [None]:
# Now let's make predictions for the test dataset and create a submission file

test_pred = lreg.predict_proba(test_bow)
test_pred_int = test_pred[:, 1] >= 0.3
# np.int was removed in NumPy 1.24; the builtin int is the equivalent dtype.
test_pred_int = test_pred_int.astype(int)
test['label'] = test_pred_int
submission = test[['id', 'label']]
submission.to_csv('sub_lreg_bow.csv', index = False)

#### TF-IDF features

In [None]:
# Train rows come first in the combined matrix; split at len(train) instead of
# the hard-coded row count 31962.
train_tfidf = tfidf[:len(train), :]
test_tfidf = tfidf[len(train):, :]

# Reuse the row selection of the bag-of-words split so all feature sets share
# the same train/validation rows.
xtrain_tfidf = train_tfidf[ytrain.index]
xvalid_tfidf = train_tfidf[yvalid.index]

lreg.fit(xtrain_tfidf, ytrain)
prediction = lreg.predict_proba(xvalid_tfidf)
prediction_int = prediction[:,1] >= 0.3
# np.int was removed in NumPy 1.24; the builtin int is the equivalent dtype.
prediction_int = prediction_int.astype(int)

f1_score(yvalid, prediction_int)

#### Word2Vec Features

In [None]:
# Number of non-null labels in the training and validation folds.
ytrain.count() , yvalid.count() 

In [None]:
# Number of word2vec feature columns that contain at least one null value.
wordvec_df.isnull().any().sum()

In [None]:
# Train rows come first in the combined frame; split at len(train) instead of
# the hard-coded row count 31962.
train_w2v = wordvec_df.iloc[:len(train), :]
test_w2v = wordvec_df.iloc[len(train):, :]

# Same train/validation rows as the bag-of-words split.
xtrain_w2v = train_w2v.iloc[ytrain.index, :]
xvalid_w2v = train_w2v.iloc[yvalid.index, :]

lreg.fit(xtrain_w2v, ytrain)
prediction = lreg.predict_proba(xvalid_w2v)
prediction_int = prediction[:,1] >=0.3
# np.int was removed in NumPy 1.24; the builtin int is the equivalent dtype.
prediction_int= prediction_int.astype(int)

f1_score(yvalid, prediction_int)

#### Doc2Vec features

In [None]:
# Train rows come first in the combined frame; split at len(train) instead of
# the hard-coded row count 31962.
train_d2v = docvec_df.iloc[:len(train), :]
test_d2v = docvec_df.iloc[len(train):, :]

# Same train/validation rows as the bag-of-words split.
xtrain_d2v = train_d2v.iloc[ytrain.index, :]
xvalid_d2v = train_d2v.iloc[yvalid.index, :]

lreg.fit(xtrain_d2v, ytrain)
prediction = lreg.predict_proba(xvalid_d2v)
prediction_int = prediction[:, 1] >= 0.3
# np.int was removed in NumPy 1.24; the builtin int is the equivalent dtype.
prediction_int = prediction_int.astype(int)

f1_score(yvalid, prediction_int)

### 2. Support Vector Machine (SVM)

In [None]:
from sklearn.svm import SVC

# probability=True is needed because the cells below call predict_proba().
svc = SVC(kernel = 'linear', C = 1, probability = True)

#### Bag-of-words features

In [None]:
# Fit the SVM on the bag-of-words split and score the validation fold.
svc.fit(xtrain_bow, ytrain)

prediction = svc.predict_proba(xvalid_bow)
prediction_int = prediction[:, 1] >= 0.3
# np.int was removed in NumPy 1.24; the builtin int is the equivalent dtype.
prediction_int = prediction_int.astype(int)

f1_score(yvalid, prediction_int)

In [None]:
# Creating submission file

test_pred = svc.predict_proba(test_bow)
test_pred_int = test_pred[:, 1] >= 0.3
# np.int was removed in NumPy 1.24; the builtin int is the equivalent dtype.
test_pred_int = test_pred_int.astype(int)

test['label'] = test_pred_int
submission = test[['id', 'label']]
submission.to_csv('sub_svm_bow.csv', index = False)

A general helper function that fits a model and prints its F1 score on the validation set.

In [None]:
def model_apply(model, training_data, validation_data, threshold=0.3):
    """Fit ``model`` and print its validation F1 score at a probability threshold.

    The model is fitted on ``training_data`` with the notebook-level ``ytrain``
    labels and scored against the notebook-level ``yvalid`` labels.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict_proba``
    training_data : feature matrix aligned with ``ytrain``
    validation_data : feature matrix aligned with ``yvalid``
    threshold : float, default 0.3
        Class 1 is predicted when its probability is >= ``threshold``.
    """
    model.fit(training_data, ytrain)
    prediction = model.predict_proba(validation_data)
    # Column 1 holds the probability of class 1. np.int was removed in
    # NumPy 1.24, so the builtin int is used as the target dtype.
    prediction_int = (prediction[:, 1] >= threshold).astype(int)
    f1_scor = f1_score(yvalid, prediction_int)
    print(f1_scor)

#### TF-IDF features

In [None]:
# SVM on TF-IDF features: fit, then print validation F1.
model_apply(svc, xtrain_tfidf, xvalid_tfidf)

#### Word2Vec Features

In [None]:
# SVM on word2vec features: fit, then print validation F1.
model_apply(svc, xtrain_w2v, xvalid_w2v)

#### Doc2Vec features

In [None]:
# SVM on doc2vec features: fit, then print validation F1.
model_apply(svc, xtrain_d2v, xvalid_d2v)

### 3. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 400 trees; random_state is fixed so repeated runs build the same forest.
rf = RandomForestClassifier(n_estimators = 400, random_state = 11)

In [None]:
# General function
def model_imp(model, training_data, validation_data):
    """Fit ``model`` on ``training_data`` and print its F1 score on the validation fold.

    Uses the notebook-level ``ytrain`` / ``yvalid`` labels and the model's own
    hard ``predict`` output (no probability threshold).
    """
    model.fit(training_data, ytrain)
    validation_score = f1_score(yvalid, model.predict(validation_data))
    print(validation_score)

#### Bag-of-words features

In [None]:
# Random forest on bag-of-words features: fit, then print validation F1.
model_imp(rf, xtrain_bow, xvalid_bow)

In [None]:
# Submission file from the random forest last fitted above.
test_pred = rf.predict(test_bow)
test['label'] = test_pred  # overwrites any label column written by an earlier submission cell
submission = test[['id', 'label']]
submission.to_csv('sub_rf_bow.csv', index =False)

#### TF-IDF features

In [None]:
# Random forest on TF-IDF features: fit, then print validation F1.
model_imp(rf, xtrain_tfidf, xvalid_tfidf)

#### Word2Vec Features

In [None]:
# Random forest on word2vec features: fit, then print validation F1.
model_imp(rf, xtrain_w2v, xvalid_w2v)

#### Doc2Vec Features

In [None]:
# Random forest on doc2vec features: fit, then print validation F1.
model_imp(rf, xtrain_d2v, xvalid_d2v)

### 4. XGBoost

In [None]:
from xgboost import XGBClassifier

# NOTE: the name `xgb` is rebound to the xgboost module by the later
# `import xgboost as xgb` cell, so the cells using this classifier must run before it.
xgb = XGBClassifier(max_depth = 6, n_estimators = 1000)

#### Bag-of-words features

In [None]:
# XGBoost on bag-of-words features: fit, then print validation F1.
model_imp(xgb, xtrain_bow, xvalid_bow)

In [None]:
# Submission file from the XGBoost classifier last fitted above.
test_pred = xgb.predict(test_bow)
test['label'] = test_pred  # overwrites any label column written by an earlier submission cell
submission = test[['id', 'label']]
submission.to_csv('sub_xgb_bow.csv', index=False)

#### TF-IDF features

In [None]:
# XGBoost on TF-IDF features: fit, then print validation F1.
model_imp(xgb, xtrain_tfidf, xvalid_tfidf)

#### Word2Vec features

In [None]:
# XGBoost on word2vec features: fit, then print validation F1.
model_imp(xgb, xtrain_w2v, xvalid_w2v)

Best performance till now

#### Doc2vec features

In [None]:
# XGBoost on doc2vec features: fit, then print validation F1.
model_imp(xgb, xtrain_d2v, xvalid_d2v)

## Fine Tuning XGBoost + Word2Vec

In [None]:
import xgboost as xgb

A DMatrix can contain both the features and the target

In [None]:
# Wrap the word2vec splits in DMatrix objects; train and validation carry their labels.
dtrain = xgb.DMatrix(xtrain_w2v, label = ytrain)
dvalid = xgb.DMatrix(xvalid_w2v, label = yvalid)
# The test matrix has no label argument; it is only used for prediction.
dtest = xgb.DMatrix(test_w2v)

In [None]:
# Parameters that we are going to tune.
# These are the starting values; the tuning cells below overwrite entries of this dict in place.
params = {
    'objective': 'binary:logistic',
    'max_depth': 6,
    'min_child_weight': 1, 
    'eta': 0.3,
    'subsample': 1, 
    'colsample_bytree': 1
}

In [None]:
# custom evaluation metric to calculate f1 score

def custom_eval(preds, dtrain):
    """F1 evaluation metric in the form expected by ``xgb.cv`` / ``xgb.train``.

    Parameters
    ----------
    preds : array of predicted probabilities for class 1.
    dtrain : object exposing ``get_label()`` that returns the true labels.

    Returns
    -------
    list of (str, float)
        ``[('f1_score', value)]`` with predictions thresholded at 0.3.
    """
    # np.int was removed in NumPy 1.24; the builtin int is the equivalent dtype.
    labels = dtrain.get_label().astype(int)
    preds = (preds >= 0.3).astype(int)
    return [('f1_score', f1_score(labels, preds))]

We will follow the steps below to tune the parameters.  

* Choose a relatively high learning rate. Usually a learning rate of 0.3 is used at this stage.

* Tune tree-specific parameters such as max_depth, min_child_weight, subsample, colsample_bytree keeping the learning rate fixed.

* Tune the learning rate.

* Finally tune gamma to avoid overfitting.

In [None]:
# Tuning max_depth and min_child_weight
# Every (max_depth, min_child_weight) pair with max_depth in 6..9 and min_child_weight in 5..7.
gridsearch_params = [
                     (max_depth, min_child_weight)
                     for max_depth in range(6,10)
                     for min_child_weight in range(5,8)]

max_f1 = 0
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print('CV with max_depth = {}, min_child_weight ={}'.format(
      max_depth, min_child_weight))



    # Update the shared params dict in place for this candidate.
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight

    # Cross-validation: 5 folds, scored with the custom F1 metric (maximised).
    cv_results = xgb.cv(params, dtrain, feval = custom_eval, 
                    num_boost_round = 200, 
                    maximize = True, 
                    seed = 16, 
                    nfold =5, 
                    early_stopping_rounds = 10)

    # Finding best F1 Score across the boosting rounds of this candidate.

    mean_f1 = cv_results['test-f1_score-mean'].max()

    boost_rounds = cv_results['test-f1_score-mean'].argmax()
    print("\tF1 Score {} for {} rounds".format(mean_f1, boost_rounds))
    # Keep the candidate with the highest mean validation F1 seen so far.
    if mean_f1 > max_f1:
        max_f1 = mean_f1  
        best_params = (max_depth, min_child_weight)
print('Best params: {}, {}, F1 Score: {}'.format(best_params[0], best_params[1], max_f1))

In [None]:
# NOTE(review): values are hard-coded rather than read from best_params —
# presumably the result of the search above; verify against its printed output.
params['max_depth'] = 8
params['min_child_weight'] = 6

In [None]:
# Tuning subsample and colsample
# Every (subsample, colsample) pair with both values in 0.5 .. 0.9.
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10 for i in range(5,10)]
    for colsample in [i/10 for i in range(5,10)]
]
max_f1 = 0
best_params = None
for subsample, colsample in gridsearch_params:
    print("CV with subsample = {}, colsample = {}".format(
        subsample, colsample
    ))
    # Update our parameters. The key must be 'colsample_bytree' (the name used
    # in `params`); writing to 'colsample' left the real setting untouched, so
    # the column-sampling ratio was never actually varied.
    params['colsample_bytree'] = colsample
    params['subsample'] = subsample
    cv_results = xgb.cv(
        params,
        dtrain,
        feval= custom_eval,
        num_boost_round=200,
        maximize=True,
        seed=16,
        nfold=5,
        early_stopping_rounds=10
    )
    # Finding best F1 Score
    mean_f1 = cv_results['test-f1_score-mean'].max()
    boost_rounds = cv_results['test-f1_score-mean'].argmax()
    print("\tF1 Score {} for {} rounds".format(mean_f1, boost_rounds))
    if mean_f1 > max_f1:
        max_f1 = mean_f1
        best_params = (subsample, colsample)

print("Best params: {}, {}, F1 Score: {}".format(best_params[0], best_params[1], max_f1))

In [None]:
# Tuning the learning rate
# Tries each eta in turn with the other entries of `params` left as they are.

max_f1 = 0. 
best_params = None 
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # Update ETA
    params['eta'] = eta

    # Run CV (more rounds and a longer early-stopping patience than the earlier searches)
    cv_results = xgb.cv(
        params,
        dtrain,
        feval= custom_eval,
        num_boost_round=1000,
        maximize=True,
        seed=16,
        nfold=5,
        early_stopping_rounds=20
    )

    # Finding best F1 Score
    mean_f1 = cv_results['test-f1_score-mean'].max()
    boost_rounds = cv_results['test-f1_score-mean'].argmax()
    print("\tF1 Score {} for {} rounds".format(mean_f1, boost_rounds))
    if mean_f1 > max_f1:
        max_f1 = mean_f1
        best_params = eta 
print("Best params: {}, F1 Score: {}".format(best_params, max_f1))

In [None]:
# finally tuned parameters
# NOTE(review): hard-coded, presumably from the searches above; min_child_weight is 7
# here but was set to 6 after the first search — confirm which value is intended.
params = { 
 'colsample_bytree': 0.5, 'eta': 0.1,
 'max_depth': 8, 'min_child_weight': 7,
 'objective': 'binary:logistic',
 'subsample': 0.9}



In [None]:

# Train the final booster with the tuned params, monitoring the custom F1
# metric on the validation DMatrix and stopping early after 10 rounds without improvement.
xgb_model = xgb.train(
    params,
    dtrain,
    feval= custom_eval,
    num_boost_round= 1000,
    maximize=True,
    evals=[(dvalid, "Validation")],
    early_stopping_rounds=10
 )

In [None]:
# Final submission file
test_pred = xgb_model.predict(dtest)
# Same 0.3 probability threshold as the custom F1 metric. np.int was removed in
# NumPy 1.24; the builtin int is the equivalent dtype.
test['label'] = (test_pred >= 0.3).astype(int)
submission = test[['id', 'label']]
submission.to_csv('sub_xgb_w2v_fintuned.csv',index = False)