In [None]:
import numpy as np, seaborn as sns, pandas as pd, matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the labelled training tweets and the unlabelled test tweets from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"../input/twitter-sentiment-analysis-hatred-speech/train.csv",
        r"../input/twitter-sentiment-analysis-hatred-speech/test.csv",
    )
)

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

In [None]:
# Sample of tweets labelled 0 (not racist/sexist)

is_label_0 = train['label'] == 0
train.loc[is_label_0].head(10)

In [None]:
# Sample of tweets labelled 1 (racist/sexist)

is_label_1 = train['label'] == 1
train.loc[is_label_1].head(10)

In [None]:
# Class balance: number of training tweets per label.
train.loc[:, 'label'].value_counts()

In [None]:
# Character count of every raw tweet, stored as a new column.
train['tweet_length'] = train['tweet'].apply(len)

In [None]:
# Average tweet length (in characters) for each label

train['tweet_length'].groupby(train['label']).mean()

Data Visualization

In [None]:
# Bar chart of how many tweets fall in each label.
# The column is passed as `x=` explicitly: seaborn >= 0.12 treats the first
# positional argument as `data`, not `x`.
sns.countplot(x=train['label'])

In [None]:
# Mean tweet length per label.
# `x`/`y` are named because seaborn >= 0.12 rejects them as positional
# arguments (TypeError); the keyword form also works on older releases.
sns.barplot(x=train['label'], y=train['tweet_length'])

In [None]:
# Distribution of tweet length over the whole training set.
sns.distplot(a=train['tweet_length'])

In [None]:
# Distribution of tweet length for label 0 only.
sns.distplot(train.loc[train['label'] == 0, 'tweet_length'])

In [None]:
# Distribution of tweet length for label 1 only.
sns.distplot(train.loc[train['label'] == 1, 'tweet_length'])

Data cleaning and feature extraction

In [None]:
#combining the two datasets for cleaning

#combi = train.drop('tweet_length',axis=1).append(test,ignore_index=True)

In [None]:
import re

In [None]:
#define function to remove unwanted patterns in tweets

def rmv_pat(text,pattern):
    """Return `text` with every match of the regular expression `pattern` removed.

    Parameters
    ----------
    text : str
        The string to clean.
    pattern : str
        A regular expression; each non-overlapping match is deleted.

    Returns
    -------
    str
        `text` without the matched substrings.
    """
    # Substitute with the pattern itself. Re-using each matched string as a new
    # regex misfires when a match contains metacharacters ("a+b", "(") and when
    # one match is a prefix of another ("@user" inside "@username").
    return re.sub(pattern, '', text)

In [None]:
# Strip @handles from every training tweet.
# The regex is a raw string: "\w" in a normal string literal is an invalid
# escape sequence (SyntaxWarning on Python 3.12+).
train['tidy_tweet'] = np.vectorize(rmv_pat)(train['tweet'], r"@[\w]*")

In [None]:
# Check the new tidy_tweet column on the first five rows.
train.head(n=5)

In [None]:
#replacing everything except ASCII letters and '#' with spaces
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the tweets unchanged.
train['tidy_tweet'] = train['tidy_tweet'].str.replace(r"[^a-zA-Z#]", " ", regex=True)
train.head()

In [None]:
# Keep only words longer than three characters, re-joined with single spaces

train['tidy_tweet'] = train['tidy_tweet'].map(
    lambda tweet: ' '.join(word for word in tweet.split() if len(word) > 3)
)
train.head()

In [None]:
#define overall function to clean tweet

def clean_tweet(tweet,data):
    """Clean raw tweets and store the result in ``data['tidy_tweet']``.

    Steps, in order:
      1. remove @handles,
      2. replace every character that is not an ASCII letter or '#' with a space,
      3. drop words of three characters or fewer and re-join with single spaces.

    Parameters
    ----------
    tweet : sequence of str
        Raw tweet texts, one per row of `data` (same order).
    data : pandas.DataFrame
        Frame that receives the ``tidy_tweet`` column; it is modified in place.

    Returns
    -------
    pandas.Series
        The newly written ``data['tidy_tweet']`` column.
    """
    # Pair the texts with the rows of `data` by position.
    raw = pd.Series(np.asarray(tweet), index=data.index)
    # regex=True is explicit because pandas >= 2.0 defaults to literal matching;
    # raw strings avoid the invalid "\w" escape warning.
    no_handles = raw.str.replace(r"@[\w]*", "", regex=True)
    letters_only = no_handles.str.replace(r"[^a-zA-Z#]", " ", regex=True)
    data['tidy_tweet'] = letters_only.apply(
        lambda x: ' '.join(w for w in x.split() if len(w) > 3)
    )
    return data['tidy_tweet']


In [None]:
# Apply the same cleaning steps to the test tweets.
test['tidy_tweet'] = clean_tweet(tweet=test['tweet'], data=test)

In [None]:
# Number of non-null cleaned test tweets.
test.loc[:, 'tidy_tweet'].count()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
import gensim

In [None]:
#shuffle train set
# The permutation length comes from the frame itself rather than a hard-coded
# row count, so a different-sized CSV neither drops rows nor raises KeyError.
# .iloc makes the selection positional. The resulting Series keep their
# shuffled index, so later code must use positional access too.

shuffle = np.random.permutation(len(train))
X_train = train['tidy_tweet'].iloc[shuffle]
y_train = train['label'].iloc[shuffle]

#shuffle test set
shuffle2 = np.random.permutation(len(test))
X_test = test['tidy_tweet'].iloc[shuffle2]

In [None]:
# TF-IDF features straight from the cleaned tweets (at most 1000 terms)

tfidf1_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
    min_df=2,
    max_df=0.90,
)
tfidf1 = tfidf1_vectorizer.fit_transform(X_train)

In [None]:
# Bag-of-words counts

bow_vectorizer = CountVectorizer(analyzer='word', stop_words='english')
bow = bow_vectorizer.fit_transform(X_train)

# TF-IDF weighting applied on top of the bag-of-words counts

tfidf_transformer = TfidfTransformer()
tfidf2 = tfidf_transformer.fit_transform(bow)

# word2vec embeddings trained on the cleaned training tweets

tokenized_tweet = X_train.apply(str.split)  # one list of tokens per tweet

# NOTE(review): `size=` is the gensim < 4 spelling of the embedding width;
# gensim >= 4 names it `vector_size` — confirm against the installed version.
w2v_params = dict(
    size=200,      # embedding dimensionality
    window=5,      # context window size
    min_count=2,
    sg=1,          # 1 selects the skip-gram model
    hs=0,
    negative=10,   # negative-sampling count
    workers=2,     # worker threads
    seed=34,
)
model_w2v = gensim.models.Word2Vec(tokenized_tweet, **w2v_params)

model_w2v.train(tokenized_tweet, epochs=20, total_examples=len(X_train))

def word_vector(tokens, size):
    """Average the word2vec embeddings of `tokens` into a single row vector.

    Parameters
    ----------
    tokens : iterable of str
        Words of one tweet.
    size : int
        Embedding dimensionality; must match the vectors stored in `model_w2v`.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, size)`` holding the mean embedding of the tokens
        found in the vocabulary, or all zeros when none are found.

    Notes
    -----
    Reads the module-level `model_w2v`.
    """
    vec = np.zeros((1, size))
    count = 0
    # Look vectors up through `.wv`: indexing the model object itself is the
    # deprecated form and was removed in gensim 4, where it raises a TypeError
    # that the KeyError handler below would not catch.
    vectors = model_w2v.wv
    for word in tokens:
        try:
            vec += vectors[word].reshape((1, size))
        except KeyError: # token is not in the vocabulary
            continue
        count += 1
    if count:
        vec /= count
    return vec

# One averaged 200-d embedding per tweet, in the same row order as X_train / y_train.
wordvec_arrays = np.zeros((len(tokenized_tweet), 200))
# Iterate by position: tokenized_tweet carries the shuffled index of X_train,
# so label lookup with tokenized_tweet[i] would pair row i with the wrong tweet
# and misalign the features with y_train.
for i, tokens in enumerate(tokenized_tweet):
    wordvec_arrays[i,:] = word_vector(tokens, 200)
# Build the frame once, after the matrix is filled, not on every iteration.
wordvec_df = pd.DataFrame(wordvec_arrays)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier

# Each entry pairs the message printed for a feature set with its feature matrix.
# Every set is scored the same way: mean recall of a default MLP over 3 folds.
feature_sets = [
    ('MLP score with tfidf1: ', tfidf1),
    ('MLP score with bow: ', bow),
    ('MLP score with tfidf2: ', tfidf2),
    ('MLP score with word2vec: ', wordvec_df),
]

for score_label, features in feature_sets:
    mlp_scores = cross_val_score(
        MLPClassifier(), X=features, y=y_train, cv=3, scoring='recall'
    ).mean()
    print(score_label, mlp_scores)
    print('\n')

Create Pipeline to pass data through feature extractors and classifier

In [None]:
# Bag-of-words features feeding a default MLP classifier, wrapped in one estimator.
bow_step = ('bow', CountVectorizer(analyzer='word', stop_words='english'))
mlp_step = ('estimator', MLPClassifier())
pipe = Pipeline(steps=[bow_step, mlp_step])

# Fit on the cleaned training tweets and their labels.
pipe.fit(X_train, y_train)

# Predicted labels for the cleaned test tweets, in X_test's row order.
prediction = pipe.predict(X_test)

In [None]:
# Predicted label at position 500 of the array returned by pipe.predict(X_test).
prediction[500]

In [None]:
# prediction[500] is positional, while X_test keeps the shuffled index from the
# permutation step, so X_test[500] would look up the row *labelled* 500.
# .iloc fetches the tweet that prediction[500] actually refers to.
X_test.iloc[500]