In [None]:
import re
import nltk
import string
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Show up to 200 characters per column so long tweets are not cut off in displayed frames.
pd.set_option("display.max_colwidth",200)
# Silence DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [None]:
# Load the train and test CSV files from the relative ../input directory.
# NOTE(review): the path is hard-coded; it presumably matches a Kaggle kernel layout — confirm before running elsewhere.
train_tweets = pd.read_csv('../input/twitter-sentiment-analysis-hatred-speech/train.csv')
test_tweets = pd.read_csv('../input/twitter-sentiment-analysis-hatred-speech/test.csv')

In [None]:
# Preview the first rows of the training data.
train_tweets.head()

In [None]:
# Summarise columns, dtypes and non-null counts of the training data.
train_tweets.info()

In [None]:
# Bar chart of how many training tweets carry each label (class balance).
sns.countplot(data=train_tweets, x='label', hue='label')
plt.title('Types of comments : 0 - > Non Racist/Sexist , 1 - > Racist/Sexist')
# The x-axis shows the label value and the bar height is a tweet count.
plt.xlabel('Label')
plt.ylabel('Number of tweets')
plt.show()

Racist & Sexist Tweets

In [None]:
# First rows of the training tweets whose label is 1.
train_tweets[train_tweets['label']==1].head()

Non-Racist & Non-Sexist Tweets

In [None]:
# First rows of the training tweets whose label is 0.
train_tweets[train_tweets['label']==0].head()

In [None]:
# Number of training tweets per label value.
train_tweets['label'].value_counts()

In [None]:
# Preview the first rows of the test data.
test_tweets.head()

Distribution of tweet lengths, measured in characters, in both the train and test data.

In [None]:
# Length of every tweet in characters (.str.len() counts characters, not words).
train_len = train_tweets['tweet'].str.len()
test_len = test_tweets['tweet'].str.len()

In [None]:
# Print the per-tweet length Series of each split, prefixed by a caption.
for caption, lengths in (("train data length :", train_len),
                         ("test data length :", test_len)):
    print(caption, lengths)

In [None]:
# Overlay the tweet-length histograms of both splits on a single set of axes.
fig, ax = plt.subplots()
ax.hist(train_len, bins=20, label='train_tweets')
ax.hist(test_len, bins=20, label='test_tweets')
ax.legend()
plt.show()

In [None]:
# Stack the train rows followed by the test rows into one frame with a fresh 0..n-1 index.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
dataset = pd.concat([train_tweets, test_tweets], ignore_index=True)

In [None]:
# Preview the combined train + test frame.
dataset.head()

In [None]:
# (rows, columns) of the combined frame.
dataset.shape

In [None]:
def remove_pattern(input_text,pattern):
    """Return ``input_text`` with every match of the regex ``pattern`` removed.

    Parameters
    ----------
    input_text : str
        Text to clean.
    pattern : str
        Regular expression whose matches are deleted.

    Returns
    -------
    str
        ``input_text`` with all non-overlapping matches of ``pattern``
        replaced by the empty string.
    """
    # Substitute with the pattern itself instead of re-using each matched
    # string as a new regex: matched text may contain regex metacharacters,
    # and a short match (e.g. "@ab") would otherwise also strip the prefix of
    # a longer one ("@abc"), leaving a stray fragment behind.
    return re.sub(pattern, "", input_text)

In [None]:
# Strip @handles from every tweet and store the result in a new 'tidy_tweet' column.
# The pattern is a raw string so that "\w" reaches the regex engine without
# triggering Python's invalid-escape-sequence warning.
dataset['tidy_tweet'] = np.vectorize(remove_pattern)(dataset['tweet'], r"@[\w]*")

In [None]:
# Preview the frame with the new 'tidy_tweet' column.
dataset.head()

In [None]:
# Replace every character that is not an ASCII letter or '#' with a space.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the text unchanged.
dataset['tidy_tweet'] = dataset['tidy_tweet'].str.replace(r'[^a-zA-Z#]', " ", regex=True)

In [None]:
# Preview 'tidy_tweet' after the character-filtering step.
dataset.head()

In [None]:
# English stop-word list from NLTK.
# NOTE(review): in the visible cells this list is only displayed, never used for
# filtering — confirm whether remove_stopword was meant to consult it.
# Presumably the NLTK 'stopwords' corpus must already be downloaded — verify.
stop_words = nltk.corpus.stopwords.words('english')

In [None]:
# Peek at the first ten stop words.
stop_words[:10]

In [None]:
def remove_stopword(input_text):
    """Drop short tokens from ``input_text``.

    Despite the name, no stop-word list is consulted: the text is split on
    whitespace and only tokens longer than three characters are kept, then
    re-joined with single spaces.
    """
    long_tokens = filter(lambda token: len(token) > 3, input_text.split())
    return " ".join(long_tokens)

In [None]:
# Run remove_stopword over every cleaned tweet.
dataset['tidy_tweet'] = dataset['tidy_tweet'].apply(remove_stopword)

In [None]:
# Preview 'tidy_tweet' after remove_stopword has been applied.
dataset.head()

# Text Normalization

In [None]:
# Split each cleaned tweet on whitespace into a list of tokens, then preview.
tokenized_tweet = dataset['tidy_tweet'].apply(str.split)
tokenized_tweet.head()

# Stemming

In [None]:
from nltk.stem import PorterStemmer

In [None]:
# Single Porter stemmer instance, reused for every token.
pstem = PorterStemmer()

In [None]:
# Replace every token by its Porter stem, one tweet (token list) at a time.
tokenized_tweet = tokenized_tweet.apply(lambda tokens: list(map(pstem.stem, tokens)))

In [None]:
# Display the stemmed token lists.
tokenized_tweet

In [None]:
# Re-join each tweet's stemmed tokens into one space-separated string and
# write the result back to 'tidy_tweet'.
# A vectorised apply replaces the Python-level loop with label-based
# ``tokenized_tweet[i]`` look-ups, which is slow and only works when the
# index is exactly 0..n-1.
tokenized_tweet = tokenized_tweet.apply(" ".join)
dataset['tidy_tweet'] = tokenized_tweet

In [None]:
# Preview 'tidy_tweet' after stemming.
dataset.head()

# Visualization

In [None]:
from wordcloud import WordCloud

# Word cloud over every cleaned tweet in the combined frame.
all_words = ' '.join(dataset['tidy_tweet'])
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110)
wordcloud.generate(all_words)

fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis('off')
plt.show()

**Words in non racist/sexist tweets**

In [None]:
# Word cloud restricted to cleaned tweets whose label is 0.
all_words = ' '.join(dataset.loc[dataset['label'] == 0, 'tidy_tweet'])
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110)
wordcloud.generate(all_words)

fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis('off')
plt.show()

**Words in racist/sexist tweets**

In [None]:
# Word cloud restricted to cleaned tweets whose label is 1.
all_words = ' '.join(dataset.loc[dataset['label'] == 1, 'tidy_tweet'])
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110)
wordcloud.generate(all_words)

fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis('off')
plt.show()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

In [None]:
# Bag-of-words counts over the cleaned tweets: English stop words removed,
# terms kept only if they occur in at least 2 documents and at most 90% of
# them, vocabulary capped at 1000 features.
bow_vector = CountVectorizer(
    stop_words='english',
    max_features=1000,
    min_df=2,
    max_df=0.90,
)
bow = bow_vector.fit_transform(dataset['tidy_tweet'])
bow.shape

In [None]:
# Explicitly stored values of the bag-of-words matrix (presumably a SciPy sparse matrix — confirm).
bow.data

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 
from sklearn.metrics import f1_score

In [None]:
# Bag-of-words rows belonging to the training tweets. `dataset` was built with
# the training rows first, so they occupy the first len(train_tweets) rows;
# deriving the split point from the data replaces the hard-coded 31962.
X = bow[:len(train_tweets), :]

In [None]:
# Bag-of-words rows belonging to the test tweets, i.e. everything after the
# training rows. Despite the name `y`, these are features, not labels.
# The split point comes from the data rather than the hard-coded 31962.
y = bow[len(train_tweets):, :]

In [None]:
# Hold out 30% of the labelled tweets for validation.
# A fixed random_state makes the split, and therefore the reported F1 score,
# reproducible across runs.
x_train,x_test,y_train,y_test = train_test_split(X,train_tweets['label'],test_size=0.3,random_state=42)

In [None]:
# Logistic-regression classifier constructed with its default settings.
lg = LogisticRegression()

In [None]:
# Fit the classifier on the training split.
lg.fit(x_train,y_train)

In [None]:
# Per-class probability estimates for the held-out split (one column per class).
pred = lg.predict_proba(x_test)

In [None]:
# Display the predicted probabilities.
pred

In [None]:
# Flag a tweet as positive when the probability in column 1 reaches 0.3.
positive_proba = pred[:, 1]
pred_int = positive_proba >= 0.3

In [None]:
# Convert the boolean flags to 0/1 integers.
# np.int was removed in NumPy 1.24; the builtin int is the supported spelling.
pred_int = pred_int.astype(int)

In [None]:
# Display the thresholded predictions.
pred_int

In [None]:
# F1 score of the thresholded predictions against the held-out labels.
f1_score(y_test,pred_int)