## Classify tweets as positive (non-toxic) or negative (toxic) using logistic regression (LR)
* Label 1 - Toxic (negative) tweets
* Label 0 - Non-toxic (positive) tweets


In [67]:
import pandas as pd
import numpy as np
import os
import nltk
nltk.download('stopwords')
import matplotlib.pyplot as plt
import random

import re # Cleaning tweets

from nltk.tokenize import TweetTokenizer # Tweet tokenizer
from nltk.corpus import stopwords  # Removing stop words
import string  # To remove punctuations

from nltk.stem import PorterStemmer   # Stemming

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/prashant.singh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Load the train and test CSVs (paths are relative to the notebook's working directory)
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
# Preview the first three training rows
df_train.head(3)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty


In [13]:
# Distribution of positive (label 0) and negative / toxic (label 1) tweets
df_train['label'].value_counts()

label
0    29720
1     2242
Name: count, dtype: int64

In [16]:
# Positive and negative split
df_train_neg = df_train[df_train['label']==1]
df_train_pos = df_train[df_train['label']==0]

In [29]:
# Peek at the first two negative (label 1) tweets
df_train_neg.head(2)

Unnamed: 0,id,label,tweet
13,14,1,@user #cnn calls #michigan middle school 'buil...
14,15,1,no comment! in #australia #opkillingbay #se...


## Preprocess tweets

In [48]:
# Take the text of the second negative tweet as a worked example.
# The column is selected by name rather than by position so the lookup keeps
# working if columns are added or reordered.
tweet = df_train_neg["tweet"].iloc[1]

In [49]:
# Show the raw text of the selected tweet
tweet

'no comment!  in #australia   #opkillingbay #seashepherd #helpcovedolphins #thecove  #helpcovedolphins'

In [50]:
# Strip a leading "RT" retweet marker
tweet2 = re.sub(r'^RT[\s]+', '', tweet)
# Remove hyperlinks. Only the URL itself (up to the next whitespace) is matched;
# a greedy `.*` here would also delete every word that follows the link.
tweet2 = re.sub(r'https?://\S+', '', tweet2)
# Remove only the "#" sign; the hashtag word itself is kept
tweet2 = re.sub(r'#', '', tweet2)

In [51]:
# Show the tweet after regex cleaning
tweet2

'no comment!  in australia   opkillingbay seashepherd helpcovedolphins thecove  helpcovedolphins'

In [53]:
# Initiate tweet tokenizer.
# Options passed: preserve_case=False (tokens come back lowercased),
# strip_handles=True and reduce_len=True (see the NLTK TweetTokenizer docs).
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
# Split the cleaned tweet into a list of tokens and print it
tweet_tokens = tokenizer.tokenize(tweet2)
print(tweet_tokens)

['no', 'comment', '!', 'in', 'australia', 'opkillingbay', 'seashepherd', 'helpcovedolphins', 'thecove', 'helpcovedolphins']


In [64]:
# After tokenizing, remove stop words.
# Load NLTK's English stop-word list and print it together with
# string.punctuation; both are used below to filter the tokens.
stopwords_english = stopwords.words('english') 
print('Stop words\n')
print(stopwords_english)

print('\nPunctuation\n')
print(string.punctuation)

Stop words

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so

In [66]:
# Keep only the tokens that are neither English stop words nor punctuation
tweets_clean = [
    token
    for token in tweet_tokens
    if not (token in stopwords_english or token in string.punctuation)
]
tweets_clean

['comment',
 'australia',
 'opkillingbay',
 'seashepherd',
 'helpcovedolphins',
 'thecove',
 'helpcovedolphins']

In [68]:
# Porter stemmer used to reduce each token to its stem
stemmer = PorterStemmer()

# Stem every cleaned token, keeping the original order
tweets_stem = [stemmer.stem(token) for token in tweets_clean]

print('stemmed words:')
print(tweets_stem)

stemmed words:
['comment', 'australia', 'opkillingbay', 'seashepherd', 'helpcovedolphin', 'thecov', 'helpcovedolphin']


## Do all the steps mentioned above across all the rows

In [74]:
def preprocessing(df):
    """Clean, tokenize, filter and stem the ``tweet`` column of ``df`` in place.

    Steps applied to every tweet, in order:
      1. strip a leading "RT" retweet marker,
      2. remove hyperlinks (the URL only, not the text that follows it),
      3. drop the "#" sign while keeping the hashtag word,
      4. tokenize with ``TweetTokenizer`` (lowercased, @handles stripped),
      5. discard English stop words and punctuation tokens,
      6. Porter-stem the remaining tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tweet`` column of raw strings. The column is
        overwritten with lists of stemmed tokens.

    Returns
    -------
    None
        ``df`` is modified in place. If the column already holds token lists
        (the function was run before), "Already Preprocessed" is printed and
        ``df`` is left untouched.

    Raises
    ------
    KeyError
        If ``df`` has no ``tweet`` column.
    TypeError
        If the column holds values that are neither all strings nor all
        lists (for example missing values). Such data used to be reported as
        "Already Preprocessed" and silently left unprocessed.
    """
    tweets = df["tweet"]

    # Decide up front what state the column is in instead of relying on a
    # blanket try/except, which hid real errors behind a misleading message.
    is_text = tweets.map(lambda value: isinstance(value, str))
    if not is_text.all():
        if tweets.map(lambda value: isinstance(value, list)).all():
            print("Already Preprocessed")
            return
        raise TypeError(
            "preprocessing() expects every value in df['tweet'] to be a string; "
            "found %d non-string value(s)" % int((~is_text).sum())
        )

    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    # A set gives constant-time membership tests for the stop-word filter.
    stopwords_english = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    def clean_tweet(text):
        """Return the list of stemmed, filtered tokens for one raw tweet."""
        text = re.sub(r'^RT[\s]+', '', text)
        # Match only the URL; a greedy `.*` would also delete the words after it.
        text = re.sub(r'https?://\S+', '', text)
        # Remove the hash sign "#" but keep the tag text.
        text = re.sub(r'#', '', text)
        return [
            stemmer.stem(token)
            for token in tokenizer.tokenize(text)
            # `in string.punctuation` is a substring test, so runs such as
            # "()" that occur inside string.punctuation are dropped as well.
            if token not in stopwords_english and token not in string.punctuation
        ]

    # Single assignment at the end: the column is either fully processed or
    # (if anything above raised) left exactly as it was.
    df["tweet"] = tweets.apply(clean_tweet)


## Preprocess train and test data

In [75]:
# Preprocess both data sets; preprocessing() rewrites each frame's "tweet" column in place
preprocessing(df_train)
print("------------------------------------------------------------")
preprocessing(df_test)

------------------------------------------------------------


In [77]:
# Inspect the result: each tweet is now a list of stemmed tokens
df_train.head(3)

Unnamed: 0,id,label,tweet
0,1,0,"[father, dysfunct, selfish, drag, kid, dysfunc..."
1,2,0,"[thank, lyft, credit, can't, use, caus, offer,..."
2,3,0,"[bihday, majesti]"


In [83]:
# Count how often each (word, label) pair occurs across the training tweets.
# NOTE(review): the counts use every row of df_train, which is only split into
# train/test afterwards, so held-out labels contribute to these frequencies.
wordFreq = {}
for tokens, label in zip(df_train["tweet"], df_train["label"]):
    # `tokens` is one tweet's token list, `label` is that tweet's class
    for token in tokens:
        key = (token, label)
        wordFreq[key] = wordFreq.get(key, 0) + 1

## Creating vector representations of tweets using word frequencies: [1 (bias), positive freq, negative freq]

In [81]:
# One feature row per tweet: [bias, summed label-0 counts, summed label-1 counts].
# Words never seen with a given label contribute 0 to that sum.
Xm = []
for tokens in df_train["tweet"]:
    pos_total = sum(wordFreq.get((token, 0), 0) for token in tokens)
    neg_total = sum(wordFreq.get((token, 1), 0) for token in tokens)
    Xm.append([1, pos_total, neg_total])

In [84]:
# First ten feature rows: [bias term (always 1), summed positive-word frequency, summed negative-word frequency]
Xm[:10]

[[1, 974, 26],
 [1, 2675, 124],
 [1, 861, 0],
 [1, 302970, 5145],
 [1, 253, 8],
 [1, 2225, 134],
 [1, 11488, 1747],
 [1, 84102, 1113],
 [1, 18428, 2195],
 [1, 1053, 43]]

## Converting Xm into pandas dataframe

In [85]:
# Wrap the feature rows in a DataFrame with one named column per feature
data = pd.DataFrame(Xm,columns = ["Bias","PosFreq","NegFreq"])
# Preview the first rows
data.head()

Unnamed: 0,Bias,PosFreq,NegFreq
0,1,974,26
1,1,2675,124
2,1,861,0
3,1,302970,5145
4,1,253,8


In [86]:
# Renumber the rows 0..n-1, discarding the old index
df_train = df_train.reset_index(drop=True)

In [88]:
# Attach the labels as the target column.
# Column assignment from a Series aligns on the index, so `data` and
# `df_train` must share the same row index for labels to land on the right rows.
data["Sentiment"] = df_train["label"]

## Data to be fed in LR model

In [89]:
# Show the feature table with its labels (pandas truncates the middle rows)
data

Unnamed: 0,Bias,PosFreq,NegFreq,Sentiment
0,1,974,26,0
1,1,2675,124,0
2,1,861,0,0
3,1,302970,5145,0
4,1,253,8,0
...,...,...,...,...
31957,1,387168,3943,0
31958,1,2267,195,0
31959,1,2515,113,0
31960,1,97,196,1


### Split data into features and label

In [90]:
# Target is the label column; features are every remaining column
y = data["Sentiment"]
X = data.drop(columns=["Sentiment"])

In [91]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
# NOTE(review): the classes are imbalanced (29720 vs 2242 rows) - consider
# passing stratify=y so both splits keep that ratio; confirm intent.
X_train, X_test, y_train, y_test = train_test_split(  X, y, test_size=0.2, random_state=42)

## LR Implementation

In [94]:
# Convert the training labels to a float64 NumPy array
y_train = y_train.to_numpy(dtype='float64')

In [95]:
# Reshape the labels into a column vector of shape (n, 1)
y_train = np.reshape(y_train,(y_train.shape[0],1))

In [96]:
# Batch gradient descent for logistic regression on the three feature columns.
m = X_train.shape[0]        # number of training rows
alpha = 1e-8                # learning rate (presumably this small because the frequency features are unscaled raw counts)
theta = np.zeros((3, 1))    # one weight per feature column, starting at zero

# Convert the feature frame once instead of on every np.dot call in the loop.
X_train_arr = X_train.to_numpy(dtype='float64')
# Probabilities are clipped for the cost only, so log() never sees exactly
# 0 or 1 (which would make the cost inf/nan once the sigmoid saturates).
eps = 1e-15

for i in range(0, 1500):
    z = np.dot(X_train_arr, theta)
    Ypred = 1/(1 + np.exp(-z))                      # sigmoid
    Yclipped = np.clip(Ypred, eps, 1 - eps)
    # Mean cross-entropy; kept for inspection, it does not affect the update
    cost = -1/m * (np.dot(y_train.T, np.log(Yclipped)) + np.dot((1-y_train).T, np.log(1-Yclipped)))
    # Gradient step uses the unclipped predictions, so theta is unchanged by the clipping
    theta = theta - (alpha/m) * np.dot(X_train_arr.T, (Ypred-y_train))
theta

array([[-3.02680629e-06],
       [-2.28481321e-04],
       [ 4.22149773e-04]])

In [97]:
# Held-out labels as a float64 column vector of shape (n, 1)
y_test = y_test.to_numpy(dtype='float64').reshape(-1, 1)

In [98]:
# Score the held-out rows with the learned weights
ztest = np.dot(X_test, theta)
Ytest = 1/(1 + np.exp(-ztest))
# Threshold the sigmoid output: strictly above 0.5 -> 1.0, otherwise 0.0
ypredicted = np.where(Ytest.ravel() > 0.5, 1.0, 0.0).tolist()
# Share of held-out rows whose predicted label equals the true label.
# NOTE(review): 29720 of the 31962 labels are 0 (about 93%), so accuracy alone
# says little here; also check precision/recall for label 1.
accuracy = (ypredicted == np.squeeze(y_test)).sum()/len(X_test)

In [99]:
# Fraction of held-out tweets whose predicted label matches the true label
accuracy

0.9335210386360081