###IMPORTS

In [1]:
import ntpath
from os import walk, path
from pathlib import Path
import os
import sys
from tqdm import tqdm



# PREPROCESSING related imports

from random import shuffle
import re    
import string  
import pandas as pd 
import numpy as np 

import nltk
from nltk.corpus import twitter_samples                          
from nltk.stem import PorterStemmer        
from nltk.tokenize import TweetTokenizer   
from nltk.corpus import stopwords          

from sklearn.model_selection import train_test_split



###GETTING DATA

In [2]:
dir = '.'
data_p = "/content/"
train_p = data_p + 'Dataset/train/pos/' 
train_n = data_p + 'Dataset/train/neg/'

def download_data():
  !gdown --id 1pwbVIT5yyUbQZTKp6Fy3gir9eOsUj3nB --output "{dir}/data.zip"

def unzip_data():
  f = dir+"/data.zip"
  !mkdir "$data_p"
  !unzip "$f" -d "$data_p"


###HELPER METHODS

In [3]:
def getF(directory, x="both"):
    # x could be "files" or "folders" or "both"

    folders = []
    for f in walk(directory):
        folders.extend(f)

    if x == "folders":

      return folders[1]
      N
      
        
    if x == "files":
        return folders[2]
    else:
      return folders[1], folders[2]



# read all text files

def read_Alltxt(read_path):
  files = getF(read_path,'files')
  txtFiles = []
  for f in files:
    with open(read_path+f, 'r', encoding="utf-8") as of:
      data = ''
      for line in of:
        data += line
      txtFiles.append(data)
  return txtFiles
    



def BOW(f_as_ftokens, labels):
    bow = {}

    for tokens, label in list(zip(f_as_ftokens, labels)):
        for token in tokens:
            bow[(token, label)] = bow.get((token, label), 0) + 1
        
    return bow



# feature vectors
def extract_features(f_as_ftokens, bow):
    # feature array
    features = np.zeros((1,3))
    # bias term added in the 0th index
    features[0,0] = 1
    
    # iterate processed_tweet
    for word in f_as_ftokens:
        # get the positive frequency of the word
        features[0,1] = bow.get((word, 1), 0)
        # get the negative frequency of the word
        features[0,2] = bow.get((word, 0), 0)
    
    return features

###PREPROCESSOR

In [4]:
class Preprocessor():
    def __init__(self):
      self.tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                                      reduce_len=True)
      nltk.download('stopwords')
      self.stopwords_en = stopwords.words('english') 
      self.punctuation_en = string.punctuation
      self.stemmer = PorterStemmer() 

    def __clean_n_tokenize_text__(self, txt):

        # remove hyperlinks
        txt = re.sub(r'https?:\/\/.*[\r\n]*', '', txt)
        # remove hashtags
        txt = re.sub(r'#', '', txt)
        #remove email address
        txt = re.sub('\S+@\S+', '', txt)
        # remove numbers
        txt = re.sub(r'\d+', '', txt)
        txt =  self.tokenizer.tokenize(txt)
        return txt


    def __filter_stopwords__(self, tokens):
        # remove stopwords
        filtered_tokens = []

        for word in tokens:
            if (word not in self.stopwords_en and  # remove stopwords
                word not in self.punctuation_en):  # remove punctuation
                filtered_tokens.append(word)
        return filtered_tokens

    def __stem_tokens__(self, tokens):
        # store the stemmed word
        stemmed_tokens = [] 

        for word in tokens:
            stem_word = self.stemmer.stem(word)  
            stemmed_tokens.append(stem_word)
        return stemmed_tokens



    def preprocess(self, txts):
        final_tokens = []
        for _, txt in tqdm(enumerate(txts)):        
            txt = self.__clean_n_tokenize_text__(txt)                       
            txt = self.__filter_stopwords__(txt)
            txt = self.__stem_tokens__(txt)
            final_tokens.extend([txt])
        return final_tokens

###MODEL

In [17]:
class Model():
  @staticmethod
  def sigmoid(z): 
      return 1 / (1+ np.exp(-z))


  def gradientDescent(self, x, y, theta, alpha, num_iters, c):
      # total samples
      m = x.shape[0]
      
      for i in range(0, num_iters):
          z = np.dot(x, theta)
          h = Model.sigmoid(z)
          cost = (-1/m) * ((np.dot(y.T, np.log(h)) + np.dot((1 - y).T, np.log(1-h))) + (c * np.sum(theta)))
          
          theta = theta - (alpha / m) * np.dot((x.T), (h - y))
    
      cost = float(cost)
      return cost, theta


  def predict(self, x, theta):
      return Model.sigmoid(np.dot(x, theta))

###TRAINING

#####Get Data

In [None]:
download_data()
unzip_data()

#####READ TEXT FILES

In [7]:
data_p = read_Alltxt(train_p)

data_n = read_Alltxt(train_n)

#####Preprocess

In [8]:
processor = Preprocessor()

# process the positive and negative tweets
pfs_as_ftokens = processor.preprocess(data_p)
nfs_as_ftokens = processor.preprocess(data_n)

19it [00:00, 186.64it/s]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


12500it [00:49, 251.72it/s]
12500it [00:48, 259.43it/s]


In [9]:
pfs_as_ftokens[0:5]

[['upon',
  'first',
  'view',
  'found',
  'tale',
  'least',
  'less',
  'annoy',
  'cannon',
  'movi',
  'tale',
  'mani',
  'think',
  'one',
  'best',
  'song',
  'pretti',
  'bad',
  'especi',
  'love',
  'song',
  'two',
  'thing',
  'stand',
  'make',
  'movi',
  'even',
  'sing',
  'worthwhil',
  'one',
  'art',
  'direct',
  'like',
  'cannon',
  'movi',
  'tale',
  'beauti',
  'decor',
  'period',
  'piec',
  'everi',
  'piec',
  'cloth',
  'jewel',
  'major',
  'part',
  "movie'",
  'plot',
  'look',
  'fresh',
  'new',
  'contrast',
  'plain',
  'cloth',
  'peasant',
  'even',
  'love',
  'song',
  'find',
  'studi',
  'dress',
  'hair',
  'princess',
  'wonder',
  'done',
  'thing',
  'comic',
  'time',
  'lot',
  'movi',
  'cheesi',
  "emperor'",
  'vaniti',
  'make',
  'fun',
  'end',
  'suspici',
  'guard',
  'guard',
  'chase',
  'nichola',
  'stupid',
  'princ',
  'quit',
  'funni',
  'seem',
  'ridicul',
  'quit',
  'purpos',
  'sequenc',
  'song',
  'weave-o',
  'm

In [10]:
# Labels

labels = [1 for i in range(len(pfs_as_ftokens))]
labels.extend([0 for i in range(len(nfs_as_ftokens))])
pos_neg_fs_as_ftoken = pfs_as_ftokens + nfs_as_ftokens

In [11]:
bow = BOW(pos_neg_fs_as_ftoken, labels)

In [12]:
df = pd.DataFrame(list(zip(pos_neg_fs_as_ftoken, labels)), columns=["f_as_tokens", "labels"])

In [13]:
df.head(5)

Unnamed: 0,f_as_tokens,labels
0,"[upon, first, view, found, tale, least, less, ...",1
1,"[came, back, montreal, premier, zero, day, ......",1
2,"[know, like, film, part, other', mention, bit,...",1
3,"[first, like, movi, caus, nazi, swastika, dram...",1
4,"[seen, origin, incred, journey, sinc, child, c...",1


In [14]:
train_Xt, test_Xt, train_Y, test_Y = train_test_split(df["f_as_tokens"], df["labels"], test_size = 0.2, shuffle=True)


In [15]:
train_X = np.zeros((len(train_Xt), 3))

for index, f_as_token in enumerate(train_Xt):
    train_X[index, :] = extract_features(f_as_token, bow)

# test X feature dimension
test_X = np.zeros((len(test_Xt), 3))

for index, f_as_token in enumerate(test_Xt):
    test_X[index, :] = extract_features(f_as_token, bow)

#####Train

In [18]:
np.random.seed(1)

model = Model()
# 0.1 as L2 
cost, theta = model.gradientDescent(train_X, np.array(train_Y).reshape(-1,1), np.zeros((3, 1)), 1e-7, 1000, 0.1)
print(f"Total Cost {cost:.8f}.")
print(f"Weight Vector {[round(v, 7) for v in np.squeeze(theta)]}")

Total Cost 1.02032667.
Weight Vector [-1.8e-06, 0.000505, -0.000633]


###TEST

In [19]:


predicted_probs = model.predict(test_X, theta)

predicted_labels = np.where(predicted_probs > 0.5, 1, 0)

# accuracy
print(f"accuracy is {len(predicted_labels[predicted_labels == np.array(test_Y).reshape(-1,1)]) / len(test_Y)*100:.2f}")

accuracy is 60.72
