### Step 1:  Load the tweets file using read_csv function from Pandas package. 

In [1]:
#importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
df_tweets = pd.read_csv('TwitterHate.csv')

### Step 2: Get the tweets into a list for easy text cleanup and manipulation.

In [3]:
df_tweets.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
# removing unneccessary columns
df_tweets.drop(columns='id', axis=1, inplace=True)

In [5]:
tweet_list = df_tweets['tweet'].tolist()

In [6]:
print(type(tweet_list))
tweet_list[:6]

<class 'list'>


[' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday your majesty',
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
 ' factsguide: society now    #motivation',
 '[2/2] huge fan fare and big talking before they leave. chaos and pay disputes when they get there. #allshowandnogo  ']

### Step 3: To cleanup: 

#### 3.1 Normalize the casing.

In [7]:
df_tweets.shape

(31962, 2)

In [8]:
df_tweets.label.value_counts()/df_tweets.shape[0]*100

0    92.98542
1     7.01458
Name: label, dtype: float64

In [9]:
#converting the text to lower case
for i in range(len(tweet_list)):
    tweet_list[i] = tweet_list[i].lower()
    
tweet_list[:5]

[' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday your majesty',
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
 ' factsguide: society now    #motivation']

#### 3.2 Using regular expressions, remove user handles. These begin with '@’.
#### AND
#### 3.3: removing URLs

In [10]:
#import re
#from nltk import TweetTokenizer as tt
#from nltk.corpus import stopwords
#import string
#stopwords = set(stopwords.words("english"))

#def remove_user_handel(text):
    #3.2: removing user handels that begins with '@'
 #   text = re.sub("([@][A-Za-z0-9_]+)|(\w+:\/\/\S+)","", text)
    #3.3: removing URLs
  #  text =  re.sub(r'^https?:\/\/.*[\r\n]*',"", text)
   # text = re.sub('<.*?>+', '', text)
    #3.5:Remove stop words
    #3.6: Remove redundant terms like ‘amp’, ‘rt’, etc.
    #3.7: Remove ‘#’ symbols from the tweet while retaining the term.

    #3.4: Tweet Tokenizer
    #text = tt.tokenize(text)
    

In [11]:
import re
for i in range(len(tweet_list)):
    #3.2: removing user handels that begins with '@'
    tweet_list[i] = re.sub("([@][A-Za-z0-9_]+)|(\w+:\/\/\S+)","", tweet_list[i])
    #3.3: removing URLs
    tweet_list[i] = re.sub(r'^https?:\/\/.*[\r\n]*',"", tweet_list[i])
    tweet_list[i] = re.sub('<.*?>+', '', tweet_list[i])
    tweet_list[i] = re.sub('urð*', '', tweet_list[i])
    tweet_list[i] = re.sub('ð', '', tweet_list[i])

In [12]:
tweet_list[:5]

['  when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "  thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday yo majesty',
 '#model   i love u take with u all the time in \x9f\x93±!!! \x9f\x98\x99\x9f\x98\x8e\x9f\x91\x84\x9f\x91\x85\x9f\x92¦\x9f\x92¦\x9f\x92¦  ',
 ' factsguide: society now    #motivation']

#### 3.4 Using TweetTokenizer from NLTK, tokenize the tweets into individual terms.

In [13]:
from nltk import TweetTokenizer
tt = TweetTokenizer()
tweet_words_list = list()
for i in range(len(tweet_list)):
    words = tt.tokenize(tweet_list[i])
    tweet_words_list.append(words)
    
tweet_words_list[:]

[['when',
  'a',
  'father',
  'is',
  'dysfunctional',
  'and',
  'is',
  'so',
  'selfish',
  'he',
  'drags',
  'his',
  'kids',
  'into',
  'his',
  'dysfunction',
  '.',
  '#run'],
 ['thanks',
  'for',
  '#lyft',
  'credit',
  'i',
  "can't",
  'use',
  'cause',
  'they',
  "don't",
  'offer',
  'wheelchair',
  'vans',
  'in',
  'pdx',
  '.',
  '#disapointed',
  '#getthanked'],
 ['bihday', 'yo', 'majesty'],
 ['#model',
  'i',
  'love',
  'u',
  'take',
  'with',
  'u',
  'all',
  'the',
  'time',
  'in',
  '\x9f',
  '\x93',
  '±',
  '!',
  '!',
  '!',
  '\x9f',
  '\x98',
  '\x99',
  '\x9f',
  '\x98',
  '\x8e',
  '\x9f',
  '\x91',
  '\x84',
  '\x9f',
  '\x91',
  '\x9f',
  '\x92',
  '¦',
  '\x9f',
  '\x92',
  '¦',
  '\x9f',
  '\x92',
  '¦'],
 ['factsguide', ':', 'society', 'now', '#motivation'],
 ['[',
  '2/2',
  ']',
  'huge',
  'fan',
  'fare',
  'and',
  'big',
  'talking',
  'before',
  'they',
  'leave',
  '.',
  'chaos',
  'and',
  'pay',
  'disputes',
  'when',
  'they',
  'g

#### 3.5 Remove stopwords

In [14]:
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
import string
from nltk import PorterStemmer
ps = PorterStemmer()

def remove_stopwords(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

tweet_list2 = remove_stopwords(tweet_list)

In [15]:
tweet_list

['  when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "  thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday yo majesty',
 '#model   i love u take with u all the time in \x9f\x93±!!! \x9f\x98\x99\x9f\x98\x8e\x9f\x91\x84\x9f\x91\x85\x9f\x92¦\x9f\x92¦\x9f\x92¦  ',
 ' factsguide: society now    #motivation',
 '[2/2] huge fan fare and big talking before they leave. chaos and pay disputes when they get there. #allshowandnogo  ',
 '  camping tomorrow        dannyâ\x80¦',
 "the next school year is the year for exams.\x9f\x98¯ can't think about that \x9f\x98\xad #school #exams   #hate #imagine #actorslife #revolutionschool #girl",
 'we won!!! love the land!!! #allin #cavs #champions #cleveland #clevelandcavaliers  â\x80¦ ',
 "   welcome here !  i'm   it's so #gr8 ! ",
 ' â\x86\x9d #ireland consumer price index (mom) climbed from previous 0.2% to 0.5% in may   #blog #silver #

In [16]:
tweet_list2

['',
 'father',
 'dysfunct',
 'selfish',
 'drag',
 'kid',
 'dysfunct',
 'run',
 'thank',
 'lyft',
 'credit',
 'use',
 'caus',
 'offer',
 'wheelchair',
 'van',
 'pdx',
 'disapoint',
 'getthank',
 'bihday',
 'yo',
 'majesti',
 'model',
 'love',
 'u',
 'take',
 'u',
 'time',
 'factsguid',
 'societi',
 'motiv',
 '2',
 '2',
 'huge',
 'fan',
 'fare',
 'big',
 'talk',
 'leav',
 'chao',
 'pay',
 'disput',
 'get',
 'allshowandnogo',
 'camp',
 'tomorrow',
 'dannyâ',
 'next',
 'school',
 'year',
 'year',
 'exam',
 'think',
 'school',
 'exam',
 'hate',
 'imagin',
 'actorslif',
 'revolutionschool',
 'girlw',
 'love',
 'land',
 'allin',
 'cav',
 'champion',
 'cleveland',
 'clevelandcavali',
 'â',
 'welcom',
 'gr8',
 'â',
 'ireland',
 'consum',
 'price',
 'index',
 'mom',
 'climb',
 'previou',
 '0',
 '2',
 '0',
 '5',
 'may',
 'blog',
 'silver',
 'gold',
 'forexw',
 'selfish',
 'orlando',
 'standwithorlando',
 'pulseshoot',
 'orlandoshoot',
 'biggerproblem',
 'selfish',
 'heabreak',
 'valu',
 'love',


#### 3.6 Remove redundant terms like ‘amp’, ‘rt’, etc
#### AND
#### 3.7 Remove ‘#’ symbols from the tweet while retaining the term



In [17]:
matchlist_amp = list()
matchlist_rt = list()
matchlist_h = list()
for i in range(len(tweet_list)):
    match_amp = re.findall('amp', tweet_list[i])
    match_rt = re.findall('rt', tweet_list[i])
    match_h = re.findall(r'\b#', tweet_list[i])
    if len(match_amp) != 0:
        matchlist_amp.append(match_amp)
    if len(match_rt) != 0:
        matchlist_rt.append(match_rt)
    if len(match_h) != 0:
        matchlist_h.append(match_h)
        
count_amp = 0
count_rt = 0
count_h = 0
for list_amp in matchlist_amp:
    count_amp += len(list_amp) 
for list_rt in matchlist_rt:
    count_rt += len(list_rt)
for list_h in matchlist_h:
    count_h += len(list_h) 
print("Coun of 'amp': ", count_amp)
print("Coun of 'rt' : ", count_rt)
print("Coun of '#'  : ", count_h)

Coun of 'amp':  2189
Coun of 'rt' :  0
Coun of '#'  :  1505


In [18]:
matchlist_amp = list()
matchlist_rt = list()
matchlist_h = list()
for i in range(len(tweet_list)):
    match_amp = re.findall('amp', tweet_list2[i])
    match_rt = re.findall('rt', tweet_list2[i])
    match_h = re.findall('#', tweet_list2[i])
    if len(match_amp) != 0:
        matchlist_amp.append(match_amp)
    if len(match_rt) != 0:
        matchlist_rt.append(match_rt)
    if len(match_h) != 0:
        matchlist_h.append(match_h)
        
count_amp = 0
count_rt = 0
count_h = 0
for list_amp in matchlist_amp:
    count_amp += len(list_amp) 
for list_rt in matchlist_rt:
    count_rt += len(list_rt)
for list_h in matchlist_h:
    count_h += len(list_h) 
print("Coun of 'amp': ", count_amp)
print("Coun of 'rt' : ", count_rt)
print("Coun of '#'  : ", count_h)

Coun of 'amp':  267
Coun of 'rt' :  1
Coun of '#'  :  0


In [19]:
for i in range(len(tweet_list)):
    # 3.7 Remove ‘#’ symbols from the tweet while retaining the term
    tweet_list[i] = re.sub('#', '', tweet_list[i])
    # 3.6 Remove redundant terms like ‘amp’, ‘rt’, etc
    tweet_list[i] = re.sub('amp',"", tweet_list[i])
    tweet_list[i] = re.sub('rt','', tweet_list[i])

In [20]:
tweet_list[815]

' jake is right, 2 many ppl lack the will power to take a stand &; drown the small minded &; oft evil tendencies of past '

In [21]:
for i in range(len(tweet_list2)):
    # 3.7 Remove ‘#’ symbols from the tweet while retaining the term
    tweet_list2[i] = re.sub('#', '', tweet_list2[i])
    # 3.6 Remove redundant terms like ‘amp’, ‘rt’, etc
    tweet_list2[i] = re.sub('amp',"", tweet_list2[i])
    tweet_list2[i] = re.sub('rt','', tweet_list2[i])

In [22]:
matchlist_amp = list()
matchlist_rt = list()
matchlist_h = list()
for i in range(len(tweet_list)):
    match_amp = re.findall('amp', tweet_list[i])
    match_rt = re.findall('rt', tweet_list[i])
    match_h = re.findall(r'\b#', tweet_list[i])
    if len(match_amp) != 0:
        matchlist_amp.append(match_amp)
    if len(match_rt) != 0:
        matchlist_rt.append(match_rt)
    if len(match_h) != 0:
        matchlist_h.append(match_h)
        
count_amp = 0
count_rt = 0
count_h = 0
for list_amp in matchlist_amp:
    count_amp += len(list_amp) 
for list_rt in matchlist_rt:
    count_rt += len(list_rt)
for list_h in matchlist_h:
    count_h += len(list_h) 
print("Coun of 'amp': ", count_amp)
print("Coun of 'rt' : ", count_rt)
print("Coun of '#'  : ", count_h)

Coun of 'amp':  0
Coun of 'rt' :  0
Coun of '#'  :  0


In [23]:
matchlist_amp = list()
matchlist_rt = list()
matchlist_h = list()
for i in range(len(tweet_list)):
    match_amp = re.findall('amp', tweet_list2[i])
    match_rt = re.findall('rt', tweet_list2[i])
    match_h = re.findall(r'\b#', tweet_list2[i])
    if len(match_amp) != 0:
        matchlist_amp.append(match_amp)
    if len(match_rt) != 0:
        matchlist_rt.append(match_rt)
    if len(match_h) != 0:
        matchlist_h.append(match_h)
        
count_amp = 0
count_rt = 0
count_h = 0
for list_amp in matchlist_amp:
    count_amp += len(list_amp) 
for list_rt in matchlist_rt:
    count_rt += len(list_rt)
for list_h in matchlist_h:
    count_h += len(list_h) 
print("Coun of 'amp': ", count_amp)
print("Coun of 'rt' : ", count_rt)
print("Coun of '#'  : ", count_h)

Coun of 'amp':  0
Coun of 'rt' :  0
Coun of '#'  :  0


#### 4. Extra cleanup by removing terms with a length of 1

In [24]:
tweet_list2

['',
 'father',
 'dysfunct',
 'selfish',
 'drag',
 'kid',
 'dysfunct',
 'run',
 'thank',
 'lyft',
 'credit',
 'use',
 'caus',
 'offer',
 'wheelchair',
 'van',
 'pdx',
 'disapoint',
 'getthank',
 'bihday',
 'yo',
 'majesti',
 'model',
 'love',
 'u',
 'take',
 'u',
 'time',
 'factsguid',
 'societi',
 'motiv',
 '2',
 '2',
 'huge',
 'fan',
 'fare',
 'big',
 'talk',
 'leav',
 'chao',
 'pay',
 'disput',
 'get',
 'allshowandnogo',
 'c',
 'tomorrow',
 'dannyâ',
 'next',
 'school',
 'year',
 'year',
 'exam',
 'think',
 'school',
 'exam',
 'hate',
 'imagin',
 'actorslif',
 'revolutionschool',
 'girlw',
 'love',
 'land',
 'allin',
 'cav',
 'chion',
 'cleveland',
 'clevelandcavali',
 'â',
 'welcom',
 'gr8',
 'â',
 'ireland',
 'consum',
 'price',
 'index',
 'mom',
 'climb',
 'previou',
 '0',
 '2',
 '0',
 '5',
 'may',
 'blog',
 'silver',
 'gold',
 'forexw',
 'selfish',
 'orlando',
 'standwithorlando',
 'pulseshoot',
 'orlandoshoot',
 'biggerproblem',
 'selfish',
 'heabreak',
 'valu',
 'love',
 'get'

In [25]:
for i in range(len(tweet_list2)):
    #Tweets_List_Tokens[i] = Tweets_List_Tokens[i].replace('#',' ')
    if len(tweet_list2) <= 1 :
        tweet_list2[i] = ''
        #Word = Tweets_List_Tokens[i]

len(tweet_list2)

262781

In [26]:
for i in range(len(tweet_list)):
    #Tweets_List_Tokens[i] = Tweets_List_Tokens[i].replace('#',' ')
    if len(tweet_list) <= 1 :
        tweet_list[i] = ''
        #Word = Tweets_List_Tokens[i]

len(tweet_list)

31962

In [27]:
count = 0
for i in range(len(tweet_list2)):
    if tweet_list2[i] == '' :
        count = count+1
        
print(count)

1779


In [28]:
def remove_empty(the_list, val):
    return [value for value in the_list if value != val]

In [29]:
tweet_list2 = remove_empty(tweet_list2, '')
len(tweet_list2)

261002

In [30]:
count = 0
for i in range(len(tweet_list2)):
    if tweet_list2[i] == '' :
        count = count+1
        
print(count)

0


### 5. Check out the top terms in the tweets

#### 5.1 First, get all the tokenized terms into one large list.

In [31]:
tweet_tokenize_list = list()
for i in range(len(tweet_list2)):
    words = tt.tokenize(tweet_list2[i])
    tweet_tokenize_list.append(words)
    
len(tweet_tokenize_list)

261002

In [32]:
import collections

tweet_tokenize_list_count = collections.Counter(tweet_list2)
[(l,k) for k,l in sorted([(j,i) for i,j in tweet_tokenize_list_count.items()], reverse=True)][0:10]

[('â', 4593),
 ('love', 3118),
 ('day', 2788),
 ('happi', 1997),
 ('yo', 1795),
 ('thank', 1554),
 ('time', 1249),
 ('get', 1245),
 ('u', 1172),
 ('go', 1140)]

#### 5.2 Use the counter and find the 10 most common terms.

In [33]:
tweet_tokenize_list

[['father'],
 ['dysfunct'],
 ['selfish'],
 ['drag'],
 ['kid'],
 ['dysfunct'],
 ['run'],
 ['thank'],
 ['lyft'],
 ['credit'],
 ['use'],
 ['caus'],
 ['offer'],
 ['wheelchair'],
 ['van'],
 ['pdx'],
 ['disapoint'],
 ['getthank'],
 ['bihday'],
 ['yo'],
 ['majesti'],
 ['model'],
 ['love'],
 ['u'],
 ['take'],
 ['u'],
 ['time'],
 ['factsguid'],
 ['societi'],
 ['motiv'],
 ['2'],
 ['2'],
 ['huge'],
 ['fan'],
 ['fare'],
 ['big'],
 ['talk'],
 ['leav'],
 ['chao'],
 ['pay'],
 ['disput'],
 ['get'],
 ['allshowandnogo'],
 ['c'],
 ['tomorrow'],
 ['dannyâ'],
 ['next'],
 ['school'],
 ['year'],
 ['year'],
 ['exam'],
 ['think'],
 ['school'],
 ['exam'],
 ['hate'],
 ['imagin'],
 ['actorslif'],
 ['revolutionschool'],
 ['girlw'],
 ['love'],
 ['land'],
 ['allin'],
 ['cav'],
 ['chion'],
 ['cleveland'],
 ['clevelandcavali'],
 ['â'],
 ['welcom'],
 ['gr8'],
 ['â'],
 ['ireland'],
 ['consum'],
 ['price'],
 ['index'],
 ['mom'],
 ['climb'],
 ['previou'],
 ['0'],
 ['2'],
 ['0'],
 ['5'],
 ['may'],
 ['blog'],
 ['silver'],
 

In [34]:
tweet_list

['  when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   run',
 "  thanks for lyft credit i can't use cause they don't offer wheelchair vans in pdx.    disapointed getthanked",
 '  bihday yo majesty',
 'model   i love u take with u all the time in \x9f\x93±!!! \x9f\x98\x99\x9f\x98\x8e\x9f\x91\x84\x9f\x91\x85\x9f\x92¦\x9f\x92¦\x9f\x92¦  ',
 ' factsguide: society now    motivation',
 '[2/2] huge fan fare and big talking before they leave. chaos and pay disputes when they get there. allshowandnogo  ',
 '  cing tomorrow        dannyâ\x80¦',
 "the next school year is the year for exams.\x9f\x98¯ can't think about that \x9f\x98\xad school exams   hate imagine actorslife revolutionschool girl",
 'we won!!! love the land!!! allin cavs chions cleveland clevelandcavaliers  â\x80¦ ',
 "   welcome here !  i'm   it's so gr8 ! ",
 ' â\x86\x9d ireland consumer price index (mom) climbed from previous 0.2% to 0.5% in may   blog silver gold forex',
 'we are so selfi

In [35]:
def clean(txt):
    txt = str(txt).lower()
    txt = re.sub("([@][A-Za-z0-9_]+)|(\w+:\/\/\S+)","", txt)
    txt = re.sub(r'^https?:\/\/.*[\r\n]*',"", txt)
    txt = re.sub('<.*?>+', '', txt)
    txt = re.sub('urð*', '', txt)
    txt = re.sub('ð', '', txt)
    txt = re.sub('[%s]' % re.escape(string.punctuation), '', txt)
    txt = re.sub('\n', '', txt)
    txt = re.sub(']w*\d\w', '', txt)
    txt = re.sub('#', '', txt)
    txt = re.sub('amp',"", txt)
    txt = re.sub('rt','', txt)
    txt = "".join([word for word in txt if word not in string.punctuation])
    return txt
df_tweets['tweet'] = df_tweets['tweet'].apply(clean)

In [36]:
df_tweets.head()

Unnamed: 0,label,tweet
0,0,when a father is dysfunctional and is so sel...
1,0,thanks for lyft credit i cant use cause they...
2,0,bihday yo majesty
3,0,model i love u take with u all the time in ...
4,0,factsguide society now motivation


### 6 Data formatting for predictive modeling:

#### 6.1 Join the tokens back to form strings. This will be required for the vectorizers.

#### 6.2 Assign x and y.

In [37]:
X = df_tweets.tweet
y = df_tweets.label

print(X.shape)
print(y.shape)

(31962,)
(31962,)


#### 6.3 Perform train_test_split using sklearn.

In [38]:
# split X and y into training ans testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=2)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(22373,)
(9589,)
(22373,)
(9589,)


### 7 We’ll use TF-IDF values for the terms as a feature to get into a vector space model.

#### 7.1 Import TF-IDF  vectorizer from sklearn.

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

#### 7.2 Instantiate with a maximum of 5000 terms in your vocabulary.

In [40]:
tfidf_vect = TfidfVectorizer(analyzer='word')

#### 7.3 Fit and apply on the train set.

In [41]:
tfidf_vect.fit(X_train)
X_train_tfidf_dtm = tfidf_vect.transform(X_train)

In [42]:
# create a dataframe
feature_names = tfidf_vect.get_feature_names()
pd.DataFrame(X_train_tfidf_dtm.toarray(), columns=feature_names)

Unnamed: 0,0000001,00027,001,0035,00h30,01,0115,0161,019,01926889917,...,ë³,ë¹,ì¹,îµï½,ï¼,ï½,ï¾,ó¾,øª,ø¹ù
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22370,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22371,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### 7.4 Apply on the test set.

In [43]:
X_test_tfidf_dtm = tfidf_vect.transform(X_test)

In [44]:
# create a dataframe
feature_names_test = tfidf_vect.get_feature_names()
pd.DataFrame(X_test_tfidf_dtm.toarray(), columns=feature_names_test)

Unnamed: 0,0000001,00027,001,0035,00h30,01,0115,0161,019,01926889917,...,ë³,ë¹,ì¹,îµï½,ï¼,ï½,ï¾,ó¾,øª,ø¹ù
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 8 Model building: Ordinary Logistic Regression

#### 8.1 Instantiate Logistic Regression from sklearn with default parameters.

In [45]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=0)

#### 8.2 Fit into  the train data.

In [46]:
lr.fit(X_train_tfidf_dtm, y_train)

#### 8.3 Make predictions for the train and the test set.

In [47]:
# make class predictions for X_test_dtm
y_pred_class_train = lr.predict(X_train_tfidf_dtm)
y_pred_class_test = lr.predict(X_test_tfidf_dtm)

### 9 Model evaluation: Accuracy, recall, and f_1 score.

#### 9.1 Report the accuracy on the train set.

In [48]:
# Calculate accuracy of class predictions
from sklearn.metrics import accuracy_score
accuracy_score_train = accuracy_score(y_train, y_pred_class_train)
accuracy_score_train

0.9502972332722478

#### 9.2 Report the recall on the train set: decent, high, or low.

In [49]:
from sklearn.metrics import recall_score
recall_metric = recall_score(y_train, y_pred_class_train, average = "macro")
recall_metric

0.6463449274319676

#### 9.3 Get the f1 score on the train set.

In [50]:
from sklearn.metrics import f1_score
f1_score = f1_score(y_train, y_pred_class_train, average = "macro")
f1_score

0.7128226736822645

In [51]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_train,y_pred_class_train))

[[20803     7]
 [ 1105   458]]


F1 score of our model is 0.71 and as per confusion matrix there are 1105 FN and 7 FP 

### 10 Looks like you need to adjust the class imbalance, as the model seems to focus on the 0s.

#### 10.1 Adjust the appropriate class in the LogisticRegression model.

In [52]:
wlr = LogisticRegression(random_state=1, class_weight="balanced")

### 11 Train again with the adjustment and evaluate.

#### 11.1 Train the model on the train set.

In [53]:
wlr.fit(X_train_tfidf_dtm, y_train)

wlr_y_pred_class_train = wlr.predict(X_train_tfidf_dtm)

#### 11.2 Evaluate the predictions on the train set: accuracy, recall, and f_1 score.

In [54]:
wlr_accuracy_score_train = accuracy_score(y_train, wlr_y_pred_class_train)
wlr_accuracy_score_train

0.9702319760425513

In [55]:
wlr_recall_metric = recall_score(y_train, wlr_y_pred_class_train, average = "macro")
wlr_recall_metric

0.980743499898389

In [56]:
from sklearn.metrics import f1_score
wlr_f1_score = f1_score(y_train, wlr_y_pred_class_train, average = "macro")
wlr_f1_score

0.9035443792160929

In [57]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_train,wlr_y_pred_class_train))

[[20155   655]
 [   11  1552]]


By changing class_weight from none to balanced, F1 score has improved to 0.90 from 0.71 and FN has reduced to 11 from 1105 and FP has increased to 655 from 7 

### 12 Regularization and Hyperparameter tuning:

#### 12.1 Import GridSearch and StratifiedKFold because of class imbalance.

In [58]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV,StratifiedKFold

#### 12.2 Provide the parameter grid to choose for ‘C’ and ‘penalty’ parameters.

In [59]:
grid={"C": [1, 2], "penalty":["l2"]}

#### 12.3 Use a balanced class weight while instantiating the logistic regression.

In [60]:
logreg=LogisticRegression(class_weight="balanced")
logreg_cv=GridSearchCV(logreg,grid)
logreg_cv.fit(X_train_tfidf_dtm,y_train)
cv_y_pred_class_train = logreg_cv.predict(X_train_tfidf_dtm)

In [61]:
wlr_accuracy_score_train = accuracy_score(y_train, cv_y_pred_class_train)
wlr_accuracy_score_train

0.9795288964376704

In [62]:
wlr_recall_metric = recall_score(y_train, cv_y_pred_class_train, average = "macro")
wlr_recall_metric

0.9881080629883203

In [63]:
from sklearn.metrics import f1_score
wlr_f1_score = f1_score(y_train, cv_y_pred_class_train, average = "macro")
wlr_f1_score

0.9304351912418964

In [64]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_train,wlr_y_pred_class_train))

[[20155   655]
 [   11  1552]]


In [65]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_train,cv_y_pred_class_train))

[[20355   455]
 [    3  1560]]



By applying gridsearch, F1 score has improved to 0.93 from 0.90 and FN has reduced to 3 from 11 and FP has reduced to 455 from 655


### 13 Find the parameters with the best recall in cross-validation.

#### 13.1 Choose ‘recall’ as the metric for scoring.

In [66]:
logreg_KF=GridSearchCV(logreg,grid)

#### 13.2 Choose a stratified 4 fold cross-validation scheme.

In [67]:
from numpy import array
from sklearn.model_selection import KFold
kf = KFold(n_splits=4)
kf.get_n_splits(df_tweets)

4

#### 13.4 Fit into  the train set.

In [68]:
count = 0
for train_index, test_index in kf.split(df_tweets):
    KF_X_train, KF_X_test, KF_y_train, KF_y_test = df_tweets.tweet[train_index], df_tweets.tweet[test_index],\
    df_tweets.label[train_index],df_tweets.label[test_index]
    KF_X_train_tfidf_dtm = tfidf_vect.transform(KF_X_train)
    KF_X_test_tfidf_dtm = tfidf_vect.transform(KF_X_test)
    count=count+1
    logreg_KF.fit(KF_X_train_tfidf_dtm,KF_y_train)
    KF_cv_y_pred_class_train = logreg_KF.predict(KF_X_train_tfidf_dtm)
    KF_recall_metric = recall_score(KF_y_train, KF_cv_y_pred_class_train, average = "macro")
    print("Recall score for fold" + str(count) + " is " + str(KF_recall_metric)) 

Recall score for fold1 is 0.9858009942400575
Recall score for fold2 is 0.984689103054245
Recall score for fold3 is 0.9840042979330617
Recall score for fold4 is 0.9855909268297254


Recall score is pretty consistent across all 4 data distribution 

### 14 What are the best parameters?

In [69]:
print(" Results from Grid Search " )
print("\n The best estimator across all parameters:\n",logreg_KF.best_estimator_)
print("\n The best score across all parameters:\n",logreg_KF.best_score_)
print("\n The best parameters across all parameters:\n",logreg_KF.best_params_) 

 Results from Grid Search 

 The best estimator across all parameters:
 LogisticRegression(C=2, class_weight='balanced')

 The best score across all parameters:
 0.9458950730470788

 The best parameters across all parameters:
 {'C': 2, 'penalty': 'l2'}


### 15 Predict and evaluate using the best estimator.

#### 15.1 Use the best estimator from the grid search to make predictions on the test set.

In [70]:
logreg_be=LogisticRegression(C=2, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

logreg_be_cv=GridSearchCV(logreg_be,grid)
logreg_be_cv.fit(X_test_tfidf_dtm,y_test)


best_cv_y_pred_class_test = logreg_be_cv.predict(X_test_tfidf_dtm)

#### 15.2 What is the recall on the test set for the toxic comments?

In [71]:
best_recall = recall_score(y_test, be_cv_y_pred_class_test, average = "macro")
best_recall

0.9859639596752998

#### 15.3 What is the f_1 score?

In [72]:
from sklearn.metrics import f1_score
f1_score_final = f1_score(y_test, be_cv_y_pred_class_test, average = "macro")
f1_score_final

0.9185775279911131

In [73]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, be_cv_y_pred_class_test))

[[8673  237]
 [   1  678]]


Applying best parameters on test dataset we get F1 score of 0.92. 