# Import the libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_curve, auc
from sklearn import metrics

# Importing the dataset

In [2]:
data = pd.read_csv('sentiment.tsv', sep='\t')
data.head(1)

Unnamed: 0,label,text
0,neg,"@jamielewislewis i cant believe it, it really ..."


# Cleaning the dataset

In [3]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data['label'] = le.fit_transform(data['label'])
data.head()

Unnamed: 0,label,text
0,0,"@jamielewislewis i cant believe it, it really ..."
1,1,having a vodka tonic and looking forward to go...
2,1,@ddlovatofans1neg1 Could you follow me please....
3,1,@jordanknight for once.................. PLEAS...
4,0,Had a dream about a walk in fast food resturau...


# Remove twitter handles(@user)

In [4]:
def remove_pattern(input_text, pattern):
    r = re.findall(pattern, input_text)
    for i in r:
        input_text = re.sub(i, '', input_text)
    return input_text

data['tidy_text'] = np.vectorize(remove_pattern)(data['text'], '@[\w]*') 
data.head(5)

Unnamed: 0,label,text,tidy_text
0,0,"@jamielewislewis i cant believe it, it really ...","i cant believe it, it really doesnt belong th..."
1,1,having a vodka tonic and looking forward to go...,having a vodka tonic and looking forward to go...
2,1,@ddlovatofans1neg1 Could you follow me please....,Could you follow me please.I would really app...
3,1,@jordanknight for once.................. PLEAS...,for once.................. PLEASE TELL US WHY...
4,0,Had a dream about a walk in fast food resturau...,Had a dream about a walk in fast food resturau...


# Remove special characters, numbers, punctuations

In [5]:
data['tidy_text'] = data['tidy_text'].str.replace(r'[^a-zA-Z#]', ' ')
data.head()

Unnamed: 0,label,text,tidy_text
0,0,"@jamielewislewis i cant believe it, it really ...",i cant believe it it really doesnt belong th...
1,1,having a vodka tonic and looking forward to go...,having a vodka tonic and looking forward to go...
2,1,@ddlovatofans1neg1 Could you follow me please....,Could you follow me please I would really app...
3,1,@jordanknight for once.................. PLEAS...,for once PLEASE TELL US WHY...
4,0,Had a dream about a walk in fast food resturau...,Had a dream about a walk in fast food resturau...


# Tokenize the tweets

In [6]:
tokenized_tweet = data['tidy_text'].apply(lambda x: x.split())
tokenized_tweet.head()

0    [i, cant, believe, it, it, really, doesnt, bel...
1    [having, a, vodka, tonic, and, looking, forwar...
2    [Could, you, follow, me, please, I, would, rea...
3    [for, once, PLEASE, TELL, US, WHY, u, were, th...
4    [Had, a, dream, about, a, walk, in, fast, food...
Name: tidy_text, dtype: object

In [7]:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
stemmed_text = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x])
stemmed_text.head(5)

0    [i, cant, believ, it, it, realli, doesnt, belo...
1    [have, a, vodka, tonic, and, look, forward, to...
2    [could, you, follow, me, pleas, I, would, real...
3    [for, onc, pleas, tell, US, whi, u, were, thin...
4    [had, a, dream, about, a, walk, in, fast, food...
Name: tidy_text, dtype: object

# Joining the tokenized word in the same data

In [8]:
for i in range(len(stemmed_text)):
        stemmed_text[i] = ' '.join(stemmed_text[i])
data['tidy_text'] = stemmed_text
data.head()

Unnamed: 0,label,text,tidy_text
0,0,"@jamielewislewis i cant believe it, it really ...",i cant believ it it realli doesnt belong there...
1,1,having a vodka tonic and looking forward to go...,have a vodka tonic and look forward to go to s...
2,1,@ddlovatofans1neg1 Could you follow me please....,could you follow me pleas I would realli appre...
3,1,@jordanknight for once.................. PLEAS...,for onc pleas tell US whi u were think of thi ...
4,0,Had a dream about a walk in fast food resturau...,had a dream about a walk in fast food resturau...


# Adding other column for length of the tweet and punctuation

In [9]:
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(' ')),3)*100

In [10]:
data['body_len'] = data['tidy_text'].apply(lambda x: len(x)-x.count(' '))
data['body_len'].replace(0, np.nan, inplace=True)
data.dropna(inplace=True)
data['punc%'] = data['tidy_text'].apply(lambda x: count_punct(x))
data.head()

Unnamed: 0,label,text,tidy_text,body_len,punc%
0,0,"@jamielewislewis i cant believe it, it really ...",i cant believ it it realli doesnt belong there...,91.0,0.0
1,1,having a vodka tonic and looking forward to go...,have a vodka tonic and look forward to go to s...,79.0,0.0
2,1,@ddlovatofans1neg1 Could you follow me please....,could you follow me pleas I would realli appre...,42.0,0.0
3,1,@jordanknight for once.................. PLEAS...,for onc pleas tell US whi u were think of thi ...,41.0,0.0
4,0,Had a dream about a walk in fast food resturau...,had a dream about a walk in fast food resturau...,91.0,0.0


In [11]:
def hashtag_extract(x):
    hashtag = []
    for i in x:
        ht = re.findall(r'#(\w+)', i)
        hashtag.append(ht)
    return hashtag

In [12]:
# extract hash from no racist/sexist tweets
HT_regular = hashtag_extract(data['tidy_text'][data['label']==0])
# positive
HT_negative = hashtag_extract(data['tidy_text'][data['label']==1])
HT_regular = sum(HT_regular,[])
HT_negative = sum(HT_negative,[])

In [13]:
d = d.nlargest(columns = "count",n = 10)
plt.figure(figsize=(16,5))
ax = sns.barplot(data=d,x = 'Hashtag', y = "count")
ax.set(ylabel = 'Count')
plt.show()

NameError: name 'd' is not defined

In [None]:
a = nltk.FreqDist(HT_negative)
d = pd.DataFrame({'Hashtag':list(a.keys()),
                 'count':list(a.values())})
d = d.nlargest(columns = "count",n = 10)
plt.figure(figsize=(16,5))
ax = sns.barplot(data=d,x = 'Hashtag', y = "count")
ax.set(ylabel = 'Count')
plt.show()

 # Feature selection and engineering

# Count Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(stop_words='english')
vect = count_vect.fit_transform(data['tidy_text'])
new_vect_df = pd.concat([data['body_len'], data['punc%'], pd.DataFrame(vect.toarray())], axis=1)
new_vect_df.head()

# TfidfVectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(stop_words='english')
vect = tfidf_vect.fit_transform(data['tidy_text'])
new_tfidf_vect = pd.concat([data['body_len'],data['punc%'],pd.DataFrame(vect.toarray())],axis = 1)
new_tfidf_vect.head()