## Classify tweets as positive or negative using LR
* Label 1 - Toxic tweets


In [36]:
import pandas as pd
import numpy as np
import os
import nltk
nltk.download('stopwords')
import matplotlib.pyplot as plt
import random

import re # Cleaning tweets

from nltk.tokenize import TweetTokenizer # Tweet tokenizer
from nltk.corpus import stopwords  # Removing stop words
import string  # To remove punctuations

from nltk.stem import PorterStemmer   # Stemming

import math # Naive Bayes

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/prashant.singh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_train.head(3)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty


In [3]:
# Distribution of positive and negative tweets
df_train['label'].value_counts()

label
0    29720
1     2242
Name: count, dtype: int64

In [4]:
# Positive and negative split
df_train_neg = df_train[df_train['label']==1]
df_train_pos = df_train[df_train['label']==0]

In [5]:
df_train_neg.head(2)

Unnamed: 0,id,label,tweet
13,14,1,@user #cnn calls #michigan middle school 'buil...
14,15,1,no comment! in #australia #opkillingbay #se...


## Preprocess tweets

In [6]:
tweet = df_train_neg.iloc[1,2]

In [7]:
tweet

'no comment!  in #australia   #opkillingbay #seashepherd #helpcovedolphins #thecove  #helpcovedolphins'

In [8]:
tweet2 = re.sub(r'^RT[\s]+', '', tweet)
tweet2 = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet2) # Removing hyperlinks
tweet2 = re.sub(r'#', '', tweet2) # Remove hashtags

In [9]:
tweet2

'no comment!  in australia   opkillingbay seashepherd helpcovedolphins thecove  helpcovedolphins'

In [10]:
# Initiate tweet tokenizer
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
tweet_tokens = tokenizer.tokenize(tweet2)
print(tweet_tokens)

['no', 'comment', '!', 'in', 'australia', 'opkillingbay', 'seashepherd', 'helpcovedolphins', 'thecove', 'helpcovedolphins']


In [11]:
# After tokenizing, remove stop words
stopwords_english = stopwords.words('english') 
print('Stop words\n')
print(stopwords_english)

print('\nPunctuation\n')
print(string.punctuation)

Stop words

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so

In [12]:
tweets_clean = []
for word in tweet_tokens:
    if word not in stopwords_english and word not in string.punctuation:
        tweets_clean.append(word)
tweets_clean

['comment',
 'australia',
 'opkillingbay',
 'seashepherd',
 'helpcovedolphins',
 'thecove',
 'helpcovedolphins']

In [13]:
# Instantiate stemming class
stemmer = PorterStemmer() 

# Create an empty list to store the stems
tweets_stem = [] 

for word in tweets_clean:
    stem_word = stemmer.stem(word)  # stemming word
    tweets_stem.append(stem_word)  # append to the list

print('stemmed words:')
print(tweets_stem)

stemmed words:
['comment', 'australia', 'opkillingbay', 'seashepherd', 'helpcovedolphin', 'thecov', 'helpcovedolphin']


## Do all the steps mentioned above across all the rows

In [14]:
def preprocessing(df):
    try:        
        df["tweet"] = df.loc[:,"tweet"].apply(lambda x : re.sub(r'^RT[\s]+', '', x))
        # Remove Hyperlinks
        df["tweet"] = df.loc[:,"tweet"].apply(lambda x : re.sub(r'https?:\/\/.*[\r\n]*', '', x))
        # Remove Hashsign "#"
        df["tweet"] = df.loc[:,"tweet"].apply(lambda x : re.sub(r'#', '', x))
        # Tokenize the String
        tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
        df["tweet"] = df.loc[:,"tweet"].apply(lambda x : tokenizer.tokenize(x))
        
        # Remove stop words and punctuations
        stopwords_english = stopwords.words('english') 
        def remove_Stop_Punc(listt):
            clean = []
            for word in listt: # Go through every word in your tokens list
                if (word not in stopwords_english and  # remove stopwords
                    word not in string.punctuation):  # remove punctuation
                    clean.append(word)
            return clean
        
        df["tweet"] = df.loc[:,"tweet"].apply(lambda x : remove_Stop_Punc(x))
        
        # Stemming
        
        # Instantiate stemming class
        stemmer = PorterStemmer()
        def stemming_func(listt):
            stemmed = [] 

            for word in listt:
                stem_word = stemmer.stem(word)  # stemming word
                stemmed.append(stem_word)  # append to the list
            return stemmed
    
        df["tweet"] = df.loc[:,"tweet"].apply(lambda x : stemming_func(x))
        
        
    except:
        print("Already Preprocessed")


## Preprocess train and test data

In [15]:
preprocessing(df_train)
print("------------------------------------------------------------")
preprocessing(df_test)

------------------------------------------------------------


In [16]:
df_train.head(3)

Unnamed: 0,id,label,tweet
0,1,0,"[father, dysfunct, selfish, drag, kid, dysfunc..."
1,2,0,"[thank, lyft, credit, can't, use, caus, offer,..."
2,3,0,"[bihday, majesti]"


In [31]:
X = df_train.drop("label",axis = 1)
y = df_train["label"]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(  X, y, test_size=0.2, random_state=42)

## Naive Bayes Implementation

In [28]:
y_train[y_train==0].shape

(23783,)

In [32]:
X_train

Unnamed: 0,id,tweet
12110,12111,"[i'v, pay, attent, past, year, /8, year, he', ..."
14081,14082,"[raft, build, salford, quay, pa, gmw, fun, out..."
1829,1830,"[friday, ð, , , ð, , , », gdegblog, friday..."
2769,2770,"[fashion, true, fact]"
31818,31819,"[share, simpl, eleg, businesscard, design, gra..."
...,...,...
29802,29803,"[waltdisneyreso, asham, knew, allig, beach, sign]"
5390,5391,"[invit, catch, stop, talk, much, love, job, ð,..."
860,861,"[black, professor, make, assumpt, entir, race,..."
15795,15796,"[lgbtqhatetrumppay, total, liber, trash, pathe..."


In [33]:
Xm = []
V = set([pair[0] for pair in wordFreq.keys()])
NPos = NNeg = 0
for index , row in X_train.iterrows():
     for word in row["tweet"]:
        if (word,0) in wordFreq:
            NPos += wordFreq[(word,0)]
        if (word,1) in wordFreq:
            NNeg += wordFreq[(word,1)]
          
            
len(V) # Vocabulary

36728

In [40]:
# Missing probabilities are mitigated using add-1 method

likelihoodMatrix= {}
for index , row in X_train.iterrows():
    ProbWPos = 0
    ProbWNeg = 0
    for word in row["tweet"]:
        if (word,0) in wordFreq:
            ProbWPos = (wordFreq[(word,0)]+1)/(NPos + len(V))
        else:
            ProbWPos = (0+1)/(NPos + len(V))
        if (word,1) in wordFreq:
            ProbWNeg = (wordFreq[(word,1)]+1)/(NNeg + len(V))
            
        else:
            ProbWNeg = (0+1)/(NNeg + len(V))
        LogLikelihood = math.log(ProbWPos / ProbWNeg)
        likelihoodMatrix[word] = LogLikelihood
likelihoodMatrix


{"i'v": -0.7787502298674361,
 'pay': -1.359732855188563,
 'attent': -2.0830710080276544,
 'past': -1.4870875759213569,
 'year': -1.1827806174999402,
 '/8': -4.280295585363874,
 "he'": -1.6210355484310957,
 'â': -1.226106680764387,
 '\x80': -1.8030571502160522,
 '¦': -1.6843970974974916,
 'raft': -2.4885361161358186,
 'build': -1.6829109521491832,
 'salford': -2.8940012242439828,
 'quay': -2.200854043684038,
 'pa': -1.4749170403011014,
 'gmw': -2.8940012242439828,
 'fun': 1.3686786527973325,
 'outdoor': -0.5914161312499372,
 'badg': -2.33438543630856,
 'guid': -2.200854043684038,
 'friday': 2.6800521437374343,
 'ð': 1.0707296151792063,
 '\x9f': 1.1152655742238828,
 '\x92': 1.3839267245924947,
 '\x8f': 1.2373319073474582,
 '»': 1.4334372201454955,
 'gdegblog': -2.4885361161358186,
 'selfi': 0.7534049816633782,
 'beard': -0.5426259670805051,
 'gayguy': -2.4885361161358186,
 'gaymen': -2.8940012242439828,
 'gaystyl': -2.4885361161358186,
 'lifestyl': 0.21951408496639158,
 'fullnessâ': -2.8

# Training Naive Bayes

In [45]:
ProbPos = y_train[y_train==0].shape[0] / y_train.shape[0]
ProbNeg =  y_train[y_train==1].shape[0] / y_train.shape[0]
logPrior = math.log(ProbPos/ProbNeg) 

In [46]:
YTrainPred = []
for tweet in X_train["tweet"]:
    p = 0
    for word in tweet: 
        p += (logPrior + likelihoodMatrix[word]) #adding probabilities, cuz log
    if p > 0:
        YTrainPred.append(0)
    else: 
        YTrainPred.append(1)

## Testing Naive Bayes

In [47]:
error = np.mean(np.absolute(YTrainPred-y_train))
# Accuracy is 1 minus the error
accuracy = 1-error
accuracy

0.9614376784387344